123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675 |
- import argparse
- import dataclasses
- from dataclasses import dataclass
- from typing import Optional
- from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
- EngineConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig,
- SpeculativeConfig, TokenizerPoolConfig,
- VisionLanguageConfig)
- from aphrodite.common.utils import str_to_int_tuple
@dataclass
class EngineArgs:
    """Arguments for Aphrodite engine.

    Every annotated attribute below is a dataclass field and has a matching
    command-line flag registered by ``add_cli_args``. ``from_cli_args`` builds
    an instance by reading one namespace attribute per field, and
    ``create_engine_config`` converts the instance into an ``EngineConfig``.

    NOTE: a field MUST carry a type annotation. An un-annotated class
    attribute is not a dataclass field, so ``from_cli_args`` would skip it
    and the corresponding CLI flag would be silently ignored.
    """
    # Model / tokenizer selection and loading.
    model: str
    tokenizer: Optional[str] = None  # falls back to `model` in __post_init__
    tokenizer_mode: str = "auto"
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = "auto"
    dtype: str = "auto"
    kv_cache_dtype: str = "auto"
    quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    # Parallelism.
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    # KV cache and scheduling.
    block_size: int = 16
    context_shift: bool = False
    use_v2_block_manager: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_log_probs: int = 10  # OpenAI default is 5, setting to 10 because ST
    disable_log_stats: bool = False
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    # Quantization.
    quantization: Optional[str] = None
    load_in_4bit: bool = False
    load_in_8bit: bool = False
    load_in_smooth: bool = False
    enforce_eager: bool = True
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    # Tokenizer pool.
    tokenizer_pool_size: int = 0
    tokenizer_pool_type: str = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    # LoRA.
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    # Annotated so that it is a real dataclass field; without the annotation
    # `--lora-dtype` was parsed but never copied into the instance.
    lora_dtype: str = "auto"
    max_cpu_loras: Optional[int] = None
    device: str = "auto"
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0

    # Related to Vision-language models such as llava
    image_input_type: Optional[str] = None
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False

    guided_decoding_backend: str = 'outlines'
    # Speculative decoding config
    speculative_model: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None

    def __post_init__(self):
        """Default the tokenizer to the model name/path when not given."""
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for the Aphrodite engine.

        Args:
            parser: The parser to register the engine flags on. It is
                modified in place.

        Returns:
            The same ``parser`` object, to allow chaining.
        """

        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst

        # Model arguments
        parser.add_argument(
            "--model",
            type=str,
            default="EleutherAI/pythia-70m-deduped",
            help="name or path of the huggingface model to use",
        )
        parser.add_argument(
            "--tokenizer",
            type=str,
            default=EngineArgs.tokenizer,
            help="name or path of the huggingface tokenizer to use",
        )
        parser.add_argument(
            "--revision",
            type=str,
            default=None,
            help="the specific model version to use. It can be a branch "
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
        parser.add_argument(
            "--code-revision",
            type=str,
            default=None,
            help="the specific revision to use for the model code on "
            "Hugging Face Hub. It can be a branch name, a tag name, or a "
            "commit id. If unspecified, will use the default version.",
        )
        parser.add_argument(
            "--tokenizer-revision",
            type=str,
            default=None,
            help="the specific tokenizer version to use. It can be a branch "
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=EngineArgs.tokenizer_mode,
            choices=["auto", "slow"],
            help='tokenizer mode. "auto" will use the fast '
            'tokenizer if available, and "slow" will '
            "always use the slow tokenizer.",
        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="trust remote code from huggingface",
        )
        parser.add_argument(
            "--download-dir",
            type=str,
            default=EngineArgs.download_dir,
            help="directory to download and load the weights, "
            "default to the default cache dir of "
            "huggingface",
        )
        parser.add_argument(
            "--load-format",
            type=str,
            default=EngineArgs.load_format,
            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
            "is not available. "
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
            "which is mainly for profiling.",
        )
        parser.add_argument(
            "--dtype",
            type=str,
            default=EngineArgs.dtype,
            choices=[
                "auto", "half", "float16", "bfloat16", "float", "float32"
            ],
            help="data type for model weights and activations. "
            'The "auto" option will use FP16 precision '
            "for FP32 and FP16 models, and BF16 precision "
            "for BF16 models.",
        )
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. FP8_E5M2 (without scaling) is only supported on cuda '
            'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
            'supported for common inference criteria. ')
        parser.add_argument(
            '--quantization-param-path',
            type=str,
            default=None,
            help='Path to the JSON file containing the KV cache '
            'scaling factors. This should generally be supplied, when '
            'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
            'default to 1.0, which may cause accuracy issues. '
            'FP8_E5M2 (without scaling) is only supported on cuda version '
            'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
            'supported for common inference criteria. ')
        parser.add_argument(
            "--max-model-len",
            type=int,
            default=EngineArgs.max_model_len,
            help="model context length. If unspecified, "
            "will be automatically derived from the model.",
        )
        parser.add_argument(
            '--guided-decoding-backend',
            type=str,
            # Read the default from the dataclass so the two cannot drift.
            default=EngineArgs.guided_decoding_backend,
            choices=['outlines', 'lm-format-enforcer'],
            help='Which engine will be used for guided decoding'
            ' (JSON schema / regex etc)')
        # Parallel arguments
        parser.add_argument(
            "--worker-use-ray",
            action="store_true",
            help="use Ray for distributed serving, will be "
            "automatically set when using more than 1 GPU",
        )
        parser.add_argument(
            "--pipeline-parallel-size",
            "-pp",
            type=int,
            default=EngineArgs.pipeline_parallel_size,
            help="number of pipeline stages",
        )
        parser.add_argument(
            "--tensor-parallel-size",
            "-tp",
            type=int,
            default=EngineArgs.tensor_parallel_size,
            help="number of tensor parallel replicas",
        )
        parser.add_argument(
            "--max-parallel-loading-workers",
            type=int,
            default=EngineArgs.max_parallel_loading_workers,
            help="load model sequentially in multiple batches, "
            "to avoid RAM OOM when using tensor "
            "parallel and large models",
        )
        parser.add_argument(
            "--ray-workers-use-nsight",
            action="store_true",
            help="If specified, use nsight to profile ray workers",
        )
        # KV cache arguments
        parser.add_argument(
            "--block-size",
            type=int,
            default=EngineArgs.block_size,
            choices=[8, 16, 32],
            help="token block size",
        )
        parser.add_argument(
            "--context-shift",
            action="store_true",
            help="Enable context shifting.",
        )
        parser.add_argument("--use-v2-block-manager",
                            action="store_true",
                            help="Use the v2 block manager.")
        parser.add_argument(
            "--num-lookahead-slots",
            type=int,
            default=EngineArgs.num_lookahead_slots,
            help="Experimental scheduling config necessary for "
            "speculative decoding. This will be replaced by "
            "speculative decoding config in the future; it is "
            "present for testing purposes until then.")
        parser.add_argument("--seed",
                            type=int,
                            default=EngineArgs.seed,
                            help="random seed")
        parser.add_argument(
            "--swap-space",
            type=int,
            default=EngineArgs.swap_space,
            help="CPU swap space size (GiB) per GPU",
        )
        parser.add_argument(
            "--gpu-memory-utilization",
            "-gmu",
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help="the fraction of GPU memory to be used for "
            "the model executor, which can range from 0 to 1. "
            "If unspecified, will use the default value of 0.9.",
        )
        parser.add_argument(
            "--num-gpu-blocks-override",
            type=int,
            default=None,
            help="If specified, ignore GPU profiling result and use this "
            "number of GPU blocks. Used for testing preemption.")
        parser.add_argument(
            "--max-num-batched-tokens",
            type=int,
            default=EngineArgs.max_num_batched_tokens,
            help="maximum number of batched tokens per "
            "iteration",
        )
        parser.add_argument(
            "--max-num-seqs",
            type=int,
            default=EngineArgs.max_num_seqs,
            help="maximum number of sequences per iteration",
        )
        parser.add_argument(
            "--max-log-probs",
            type=int,
            default=EngineArgs.max_log_probs,
            help="maximum number of log probabilities to "
            "return.",
        )
        parser.add_argument(
            "--disable-log-stats",
            action="store_true",
            help="disable logging statistics",
        )
        # Quantization settings.
        parser.add_argument(
            "--quantization",
            "-q",
            type=str,
            choices=[
                "aqlm",
                "awq",
                "bnb",
                "eetq",
                "exl2",
                "gguf",
                "gptq",
                "quip",
                "squeezellm",
                "marlin",
                None,
            ],
            default=EngineArgs.quantization,
            help="Method used to quantize the weights. If "
            "None, we first check the `quantization_config` "
            "attribute in the model config file. If that is "
            "None, we assume the model weights are not "
            "quantized and use `dtype` to determine the data "
            "type of the weights.",
        )
        parser.add_argument(
            "--load-in-4bit",
            action="store_true",
            help="Load the FP16 model in 4-bit format. Also "
            "works with AWQ models. Throughput at 2.5x of "
            "FP16.",
        )
        parser.add_argument(
            "--load-in-8bit",
            action="store_true",
            help="Load the FP16 model in 8-bit format. "
            "Throughput at 0.3x of FP16.",
        )
        parser.add_argument(
            "--load-in-smooth",
            action="store_true",
            help="Load the FP16 model in smoothquant "
            "8bit format. Throughput at 0.7x of FP16. ",
        )
        parser.add_argument(
            "--enforce-eager",
            # Takes an explicit value: only the (case-insensitive) string
            # "true" yields True, anything else yields False.
            type=lambda x: (str(x).lower() == 'true'),
            default=EngineArgs.enforce_eager,
            help="Always use eager-mode PyTorch. If False, "
            "will use eager mode and CUDA graph in hybrid "
            "for maximal performance and flexibility.",
        )
        parser.add_argument(
            "--max-context-len-to-capture",
            type=int,
            default=EngineArgs.max_context_len_to_capture,
            help="maximum context length covered by CUDA "
            "graphs. When a sequence has context length "
            "larger than this, we fall back to eager mode.",
        )
        parser.add_argument(
            "--disable-custom-all-reduce",
            action="store_true",
            default=EngineArgs.disable_custom_all_reduce,
            help="See ParallelConfig",
        )
        parser.add_argument("--tokenizer-pool-size",
                            type=int,
                            default=EngineArgs.tokenizer_pool_size,
                            help="Size of tokenizer pool to use for "
                            "asynchronous tokenization. If 0, will "
                            "use synchronous tokenization.")
        parser.add_argument("--tokenizer-pool-type",
                            type=str,
                            default=EngineArgs.tokenizer_pool_type,
                            help="The type of tokenizer pool to use for "
                            "asynchronous tokenization. Ignored if "
                            "tokenizer_pool_size is 0.")
        # Kept as a raw string here; it is handed unchanged to
        # TokenizerPoolConfig.create_config in create_engine_config.
        parser.add_argument("--tokenizer-pool-extra-config",
                            type=str,
                            default=EngineArgs.tokenizer_pool_extra_config,
                            help="Extra config for tokenizer pool. "
                            "This should be a JSON string that will be "
                            "parsed into a dictionary. Ignored if "
                            "tokenizer_pool_size is 0.")
        # LoRA related configs
        parser.add_argument(
            "--enable-lora",
            action="store_true",
            help="If True, enable handling of LoRA adapters.",
        )
        parser.add_argument(
            "--max-loras",
            type=int,
            default=EngineArgs.max_loras,
            help="Max number of LoRAs in a single batch.",
        )
        parser.add_argument(
            "--max-lora-rank",
            type=int,
            default=EngineArgs.max_lora_rank,
            help="Max LoRA rank.",
        )
        parser.add_argument(
            "--lora-extra-vocab-size",
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=("Maximum size of extra vocabulary that can be "
                  "present in a LoRA adapter (added to the base "
                  "model vocabulary)."),
        )
        parser.add_argument(
            "--lora-dtype",
            type=str,
            default=EngineArgs.lora_dtype,
            choices=["auto", "float16", "bfloat16", "float32"],
            help=("Data type for LoRA. If auto, will default to "
                  "base model dtype."),
        )
        parser.add_argument(
            "--max-cpu-loras",
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=("Maximum number of LoRAs to store in CPU memory. "
                  "Must be >= than max_num_seqs. "
                  "Defaults to max_num_seqs."),
        )
        parser.add_argument(
            "--device",
            type=str,
            default=EngineArgs.device,
            choices=["auto", "cuda", "neuron", "cpu"],
            help=("Device to use for model execution."),
        )
        # Related to Vision-language models such as llava
        parser.add_argument(
            "--image-input-type",
            type=str,
            default=None,
            choices=[
                t.name.lower() for t in VisionLanguageConfig.ImageInputType
            ],
            help=("The image input type passed into Aphrodite. "
                  "Should be one of `pixel_values` or `image_features`"))
        parser.add_argument("--image-token-id",
                            type=int,
                            default=None,
                            help=("Input id for image token."))
        parser.add_argument(
            '--image-input-shape',
            type=str,
            default=None,
            help=(
                'The biggest image input shape (worst for memory footprint) '
                'given an input type. Only used for Aphrodite\'s profile_run.'
            ))
        parser.add_argument(
            '--image-feature-size',
            type=int,
            default=None,
            help=('The image feature size along the context dimension.'))
        parser.add_argument(
            "--scheduler-delay-factor",
            "-sdf",
            type=float,
            default=EngineArgs.scheduler_delay_factor,
            help="Apply a delay (of delay factor multiplied by previous "
            "prompt latency) before scheduling next prompt.")
        parser.add_argument(
            "--enable-chunked-prefill",
            action="store_true",
            help="If True, the prefill requests can be chunked based on the "
            "max_num_batched_tokens.")
        parser.add_argument(
            "--speculative-model",
            type=str,
            default=EngineArgs.speculative_model,
            help=
            "The name of the draft model to be used in speculative decoding.")
        parser.add_argument(
            "--num-speculative-tokens",
            type=int,
            default=EngineArgs.num_speculative_tokens,
            help="The number of speculative tokens to sample from "
            "the draft model in speculative decoding")
        parser.add_argument(
            "--speculative-max-model-len",
            # The field is Optional[int]; parse as int so a length, not a
            # string, reaches the speculative config.
            type=int,
            default=EngineArgs.speculative_max_model_len,
            help="The maximum sequence length supported by the "
            "draft model. Sequences over this length will skip "
            "speculation.")
        parser.add_argument(
            "--ngram-prompt-lookup-max",
            type=int,
            default=None,
            help='Max size of window for ngram prompt lookup in speculative '
            'decoding.')
        parser.add_argument(
            '--ngram-prompt-lookup-min',
            type=int,
            default=None,
            help='Min size of window for ngram prompt lookup in speculative '
            'decoding.')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
        """Build an instance of ``cls`` from a parsed argument namespace.

        One attribute is read from ``args`` for every dataclass field of
        ``cls`` (so fields added by subclasses are picked up as well);
        namespace attributes that are not fields are ignored.

        Args:
            args: Namespace holding an attribute for every field of ``cls``.

        Returns:
            A new ``cls`` instance populated from ``args``.

        Raises:
            AttributeError: If ``args`` lacks an attribute for some field.
        """
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_config(self, ) -> "EngineConfig":
        """Assemble the per-subsystem configs into one ``EngineConfig``.

        Returns:
            An ``EngineConfig`` bundling the model, cache, parallel,
            scheduler, device, LoRA, vision-language, speculative and
            decoding configs derived from this instance.

        Raises:
            ValueError: If ``image_input_type`` is set while any of
                ``image_token_id``, ``image_input_shape`` or
                ``image_feature_size`` is unset (falsy).
        """
        device_config = DeviceConfig(self.device)
        # The config constructors below take positional arguments; the
        # order of each argument list is significant.
        model_config = ModelConfig(
            self.model,
            self.tokenizer,
            self.tokenizer_mode,
            self.trust_remote_code,
            self.download_dir,
            self.load_format,
            self.dtype,
            self.seed,
            self.revision,
            self.code_revision,
            self.tokenizer_revision,
            self.max_model_len,
            self.quantization,
            self.load_in_4bit,
            self.load_in_8bit,
            self.load_in_smooth,
            self.quantization_param_path,
            self.enforce_eager,
            self.max_context_len_to_capture,
            self.max_log_probs,
        )
        cache_config = CacheConfig(
            self.block_size,
            self.gpu_memory_utilization,
            self.swap_space,
            self.kv_cache_dtype,
            # self.kv_quant_params_path,
            self.num_gpu_blocks_override,
            model_config.get_sliding_window(),
            self.context_shift,
        )
        parallel_config = ParallelConfig(
            self.pipeline_parallel_size,
            self.tensor_parallel_size,
            self.worker_use_ray,
            self.max_parallel_loading_workers,
            self.disable_custom_all_reduce,
            TokenizerPoolConfig.create_config(
                self.tokenizer_pool_size,
                self.tokenizer_pool_type,
                self.tokenizer_pool_extra_config,
            ),
            self.ray_workers_use_nsight,
        )
        # May be None (see the check in the scheduler config below).
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
        )
        scheduler_config = SchedulerConfig(
            self.max_num_batched_tokens,
            self.max_num_seqs,
            model_config.max_model_len,
            self.use_v2_block_manager,
            # The speculative config, when present, overrides the
            # user-supplied number of lookahead slots.
            num_lookahead_slots=(self.num_lookahead_slots
                                 if speculative_config is None else
                                 speculative_config.num_lookahead_slots),
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
        )
        # A missing or non-positive max_cpu_loras is passed on as None.
        lora_config = (LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras
            if self.max_cpu_loras and self.max_cpu_loras > 0 else None,
        ) if self.enable_lora else None)

        if self.image_input_type:
            # All three companion settings are required once an image
            # input type has been chosen.
            if (not self.image_token_id or not self.image_input_shape
                    or not self.image_feature_size):
                raise ValueError(
                    "Specify `image_token_id`, `image_input_shape` and "
                    "`image_feature_size` together with `image_input_type`.")
            vision_language_config = VisionLanguageConfig(
                image_input_type=VisionLanguageConfig.
                get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
            )
        else:
            vision_language_config = None

        decoding_config = DecodingConfig(
            guided_decoding_backend=self.guided_decoding_backend)

        return EngineConfig(model_config=model_config,
                            cache_config=cache_config,
                            parallel_config=parallel_config,
                            scheduler_config=scheduler_config,
                            device_config=device_config,
                            lora_config=lora_config,
                            vision_language_config=vision_language_config,
                            speculative_config=speculative_config,
                            decoding_config=decoding_config)
@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous Aphrodite engine.

    Adds three fields on top of ``EngineArgs``, each with a matching
    command-line flag registered by ``add_cli_args``.
    """
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: int = 0

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Register the base engine flags plus the async-only flags.

        Args:
            parser: The parser to extend; it is first passed through
                ``EngineArgs.add_cli_args``.

        Returns:
            The parser returned by ``EngineArgs.add_cli_args``, with the
            async-only flags added to it.
        """
        async_parser = EngineArgs.add_cli_args(parser)

        # The two on/off switches differ only in name and help text.
        switches = (
            ("--engine-use-ray",
             "use Ray to start the LLM engine in a "
             "separate process as the server process."),
            ("--disable-log-requests", "disable logging requests"),
        )
        for flag, description in switches:
            async_parser.add_argument(flag,
                                      action="store_true",
                                      help=description)

        async_parser.add_argument(
            "--max-log-len",
            type=int,
            default=0,
            help="max number of prompt characters or prompt "
            "ID numbers being printed in log. "
            "Default: unlimited.",
        )
        return async_parser
|