import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional

from aphrodite.common.config import (CacheConfig, DecodingConfig, DeviceConfig,
                                     EngineConfig, LoRAConfig, ModelConfig,
                                     ParallelConfig, SchedulerConfig,
                                     SpeculativeConfig, TokenizerPoolConfig,
                                     VisionLanguageConfig)
from aphrodite.common.utils import str_to_int_tuple


@dataclass
class EngineArgs:
    """Arguments for Aphrodite engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = "auto"
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = "auto"
    dtype: str = "auto"
    kv_cache_dtype: str = "auto"
    quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    context_shift: bool = False
    use_v2_block_manager: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_log_probs: int = 10  # OpenAI default is 5, setting to 10 because ST
    disable_log_stats: bool = False
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    load_in_4bit: bool = False
    load_in_8bit: bool = False
    load_in_smooth: bool = False
    enforce_eager: bool = True
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    tokenizer_pool_size: int = 0
    tokenizer_pool_type: str = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    # The annotation is needed so this is treated as a dataclass field
    # and can be overridden via --lora-dtype.
    lora_dtype: str = "auto"
    max_cpu_loras: Optional[int] = None
    device: str = "auto"
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
    # Related to Vision-language models such as llava
    image_input_type: Optional[str] = None
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False
    guided_decoding_backend: str = 'outlines'
    # Speculative decoding config
    speculative_model: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for the Aphrodite engine."""
        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst

        # Model arguments
        parser.add_argument(
            "--model",
            type=str,
            default="EleutherAI/pythia-70m-deduped",
            help="name or path of the huggingface model to use",
        )
        parser.add_argument(
            "--tokenizer",
            type=str,
            default=EngineArgs.tokenizer,
            help="name or path of the huggingface tokenizer to use",
        )
        parser.add_argument(
            "--revision",
            type=str,
            default=None,
            help="the specific model version to use. It can be a branch "
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
        parser.add_argument(
            "--code-revision",
            type=str,
            default=None,
            help="the specific revision to use for the model code on "
            "Hugging Face Hub. It can be a branch name, a tag name, or a "
            "commit id. If unspecified, will use the default version.",
        )
        parser.add_argument(
            "--tokenizer-revision",
            type=str,
            default=None,
            help="the specific tokenizer version to use. It can be a branch "
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
        parser.add_argument(
            "--tokenizer-mode",
            type=str,
            default=EngineArgs.tokenizer_mode,
            choices=["auto", "slow"],
            help='tokenizer mode. "auto" will use the fast '
            'tokenizer if available, and "slow" will '
            "always use the slow tokenizer.",
        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="trust remote code from huggingface",
        )
        parser.add_argument(
            "--download-dir",
            type=str,
            default=EngineArgs.download_dir,
            help="directory to download and load the weights, "
            "defaults to the default cache dir of "
            "huggingface",
        )
        parser.add_argument(
            "--load-format",
            type=str,
            default=EngineArgs.load_format,
            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
            help="The format of the model weights to load. "
            '"auto" will try to load the weights in the safetensors format '
            "and fall back to the pytorch bin format if safetensors format "
            "is not available. "
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            "a numpy cache to speed up the loading. "
            '"dummy" will initialize the weights with random values, '
            "which is mainly for profiling.",
        )
        parser.add_argument(
            "--dtype",
            type=str,
            default=EngineArgs.dtype,
            choices=[
                "auto", "half", "float16", "bfloat16", "float", "float32"
            ],
            help="data type for model weights and activations. "
            'The "auto" option will use FP16 precision '
            "for FP32 and FP16 models, and BF16 precision "
            "for BF16 models.",
        )
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. FP8_E5M2 (without scaling) is only supported on cuda '
            'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
            'instead supported for common inference criteria.')
        parser.add_argument(
            '--quantization-param-path',
            type=str,
            default=None,
            help='Path to the JSON file containing the KV cache '
            'scaling factors. This should generally be supplied when '
            'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
            'default to 1.0, which may cause accuracy issues. '
            'FP8_E5M2 (without scaling) is only supported on cuda version '
            'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
            'supported for common inference criteria.')
        parser.add_argument(
            "--max-model-len",
            type=int,
            default=EngineArgs.max_model_len,
            help="model context length. If unspecified, "
            "will be automatically derived from the model.",
        )
If unspecified, " "will be automatically derived from the model.", ) parser.add_argument( '--guided-decoding-backend', type=str, default='outlines', choices=['outlines', 'lm-format-enforcer'], help='Which engine will be used for guided decoding' ' (JSON schema / regex etc)') # Parallel arguments parser.add_argument( "--worker-use-ray", action="store_true", help="use Ray for distributed serving, will be " "automatically set when using more than 1 GPU", ) parser.add_argument( "--pipeline-parallel-size", "-pp", type=int, default=EngineArgs.pipeline_parallel_size, help="number of pipeline stages", ) parser.add_argument( "--tensor-parallel-size", "-tp", type=int, default=EngineArgs.tensor_parallel_size, help="number of tensor parallel replicas", ) parser.add_argument( "--max-parallel-loading-workers", type=int, default=EngineArgs.max_parallel_loading_workers, help="load model sequentially in multiple batches, " "to avoid RAM OOM when using tensor " "parallel and large models", ) parser.add_argument( "--ray-workers-use-nsight", action="store_true", help="If specified, use nsight to profile ray workers", ) # KV cache arguments parser.add_argument( "--block-size", type=int, default=EngineArgs.block_size, choices=[8, 16, 32], help="token block size", ) parser.add_argument( "--context-shift", action="store_true", help="Enable context shifting.", ) parser.add_argument("--use-v2-block-manager", action="store_true", help="Use the v2 block manager.") parser.add_argument( "--num-lookahead-slots", type=int, default=EngineArgs.num_lookahead_slots, help="Experimental scheduling config necessary for " "speculative decoding. This will be replaced by " "speculative decoding config in the future; it is " "present for testing purposes until then.") parser.add_argument("--seed", type=int, default=EngineArgs.seed, help="random seed") parser.add_argument( "--swap-space", type=int, default=EngineArgs.swap_space, help="CPU swap space size (GiB) per GPU", ) parser.add_argument( "--gpu-memory-utilization", "-gmu", type=float, default=EngineArgs.gpu_memory_utilization, help="the fraction of GPU memory to be used for " "the model executor, which can range from 0 to 1." "If unspecified, will use the default value of 0.9.", ) parser.add_argument( "--num-gpu-blocks-override", type=int, default=None, help="If specified, ignore GPU profiling result and use this " "number of GPU blocks. Used for testing preemption.") parser.add_argument( "--max-num-batched-tokens", type=int, default=EngineArgs.max_num_batched_tokens, help="maximum number of batched tokens per " "iteration", ) parser.add_argument( "--max-num-seqs", type=int, default=EngineArgs.max_num_seqs, help="maximum number of sequences per iteration", ) parser.add_argument( "--max-log-probs", type=int, default=EngineArgs.max_log_probs, help="maximum number of log probabilities to " "return.", ) parser.add_argument( "--disable-log-stats", action="store_true", help="disable logging statistics", ) # Quantization settings. parser.add_argument( "--quantization", "-q", type=str, choices=[ "aqlm", "awq", "bnb", "eetq", "exl2", "gguf", "gptq", "quip", "squeezellm", "marlin", None, ], default=EngineArgs.quantization, help="Method used to quantize the weights. If " "None, we first check the `quantization_config` " "attribute in the model config file. 
If that is " "None, we assume the model weights are not " "quantized and use `dtype` to determine the data " "type of the weights.", ) parser.add_argument( "--load-in-4bit", action="store_true", help="Load the FP16 model in 4-bit format. Also " "works with AWQ models. Throughput at 2.5x of " "FP16.", ) parser.add_argument( "--load-in-8bit", action="store_true", help="Load the FP16 model in 8-bit format. " "Throughput at 0.3x of FP16.", ) parser.add_argument( "--load-in-smooth", action="store_true", help="Load the FP16 model in smoothquant " "8bit format. Throughput at 0.7x of FP16. ", ) parser.add_argument( "--enforce-eager", type=lambda x: (str(x).lower() == 'true'), default=EngineArgs.enforce_eager, help="Always use eager-mode PyTorch. If False, " "will use eager mode and CUDA graph in hybrid " "for maximal performance and flexibility.", ) parser.add_argument( "--max-context-len-to-capture", type=int, default=EngineArgs.max_context_len_to_capture, help="maximum context length covered by CUDA " "graphs. When a sequence has context length " "larger than this, we fall back to eager mode.", ) parser.add_argument( "--disable-custom-all-reduce", action="store_true", default=EngineArgs.disable_custom_all_reduce, help="See ParallelConfig", ) parser.add_argument("--tokenizer-pool-size", type=int, default=EngineArgs.tokenizer_pool_size, help="Size of tokenizer pool to use for " "asynchronous tokenization. If 0, will " "use synchronous tokenization.") parser.add_argument("--tokenizer-pool-type", type=str, default=EngineArgs.tokenizer_pool_type, help="The type of tokenizer pool to use for " "asynchronous tokenization. Ignored if " "tokenizer_pool_size is 0.") parser.add_argument("--tokenizer-pool-extra-config", type=str, default=EngineArgs.tokenizer_pool_extra_config, help="Extra config for tokenizer pool. " "This should be a JSON string that will be " "parsed into a dictionary. Ignored if " "tokenizer_pool_size is 0.") # LoRA related configs parser.add_argument( "--enable-lora", action="store_true", help="If True, enable handling of LoRA adapters.", ) parser.add_argument( "--max-loras", type=int, default=EngineArgs.max_loras, help="Max number of LoRAs in a single batch.", ) parser.add_argument( "--max-lora-rank", type=int, default=EngineArgs.max_lora_rank, help="Max LoRA rank.", ) parser.add_argument( "--lora-extra-vocab-size", type=int, default=EngineArgs.lora_extra_vocab_size, help=("Maximum size of extra vocabulary that can be " "present in a LoRA adapter (added to the base " "model vocabulary)."), ) parser.add_argument( "--lora-dtype", type=str, default=EngineArgs.lora_dtype, choices=["auto", "float16", "bfloat16", "float32"], help=("Data type for LoRA. If auto, will default to " "base model dtype."), ) parser.add_argument( "--max-cpu-loras", type=int, default=EngineArgs.max_cpu_loras, help=("Maximum number of LoRAs to store in CPU memory. " "Must be >= than max_num_seqs. " "Defaults to max_num_seqs."), ) parser.add_argument( "--device", type=str, default=EngineArgs.device, choices=["auto", "cuda", "neuron", "cpu"], help=("Device to use for model execution."), ) # Related to Vision-language models such as llava parser.add_argument( "--image-input-type", type=str, default=None, choices=[ t.name.lower() for t in VisionLanguageConfig.ImageInputType ], help=("The image input type passed into Aphrodite. 
" "Should be one of `pixel_values` or `image_features`")) parser.add_argument("--image-token-id", type=int, default=None, help=("Input id for image token.")) parser.add_argument( '--image-input-shape', type=str, default=None, help=( 'The biggest image input shape (worst for memory footprint) ' 'given an input type. Only used for Aphrodite\'s profile_run.' )) parser.add_argument( '--image-feature-size', type=int, default=None, help=('The image feature size along the context dimension.')) parser.add_argument( "--scheduler-delay-factor", "-sdf", type=float, default=EngineArgs.scheduler_delay_factor, help="Apply a delay (of delay factor multiplied by previous " "prompt latency) before scheduling next prompt.") parser.add_argument( "--enable-chunked-prefill", action="store_true", help="If True, the prefill requests can be chunked based on the " "max_num_batched_tokens.") parser.add_argument( "--speculative-model", type=str, default=EngineArgs.speculative_model, help= "The name of the draft model to be used in speculative decoding.") parser.add_argument( "--num-speculative-tokens", type=int, default=EngineArgs.num_speculative_tokens, help="The number of speculative tokens to sample from " "the draft model in speculative decoding") parser.add_argument( "--speculative-max-model-len", type=str, default=EngineArgs.speculative_max_model_len, help="The maximum sequence length supported by the " "draft model. Sequences over this length will skip " "speculation.") parser.add_argument( "--ngram-prompt-lookup-max", type=int, default=None, help='Max size of window for ngram prompt lookup in speculative ' 'decoding.') parser.add_argument( '--ngram-prompt-lookup-min', type=int, default=None, help='Min size of window for ngram prompt lookup in speculative ' 'decoding.') return parser @classmethod def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs": # Get the list of attributes of this dataclass. attrs = [attr.name for attr in dataclasses.fields(cls)] # Set the attributes from the parsed arguments. 
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_config(self) -> EngineConfig:
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(
            self.model,
            self.tokenizer,
            self.tokenizer_mode,
            self.trust_remote_code,
            self.download_dir,
            self.load_format,
            self.dtype,
            self.seed,
            self.revision,
            self.code_revision,
            self.tokenizer_revision,
            self.max_model_len,
            self.quantization,
            self.load_in_4bit,
            self.load_in_8bit,
            self.load_in_smooth,
            self.quantization_param_path,
            self.enforce_eager,
            self.max_context_len_to_capture,
            self.max_log_probs,
        )
        cache_config = CacheConfig(
            self.block_size,
            self.gpu_memory_utilization,
            self.swap_space,
            self.kv_cache_dtype,
            # self.kv_quant_params_path,
            self.num_gpu_blocks_override,
            model_config.get_sliding_window(),
            self.context_shift,
        )
        parallel_config = ParallelConfig(
            self.pipeline_parallel_size,
            self.tensor_parallel_size,
            self.worker_use_ray,
            self.max_parallel_loading_workers,
            self.disable_custom_all_reduce,
            TokenizerPoolConfig.create_config(
                self.tokenizer_pool_size,
                self.tokenizer_pool_type,
                self.tokenizer_pool_extra_config,
            ),
            self.ray_workers_use_nsight,
        )
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
        )
        scheduler_config = SchedulerConfig(
            self.max_num_batched_tokens,
            self.max_num_seqs,
            model_config.max_model_len,
            self.use_v2_block_manager,
            num_lookahead_slots=(self.num_lookahead_slots
                                 if speculative_config is None else
                                 speculative_config.num_lookahead_slots),
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
        )
        lora_config = (LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras
            if self.max_cpu_loras and self.max_cpu_loras > 0 else None,
        ) if self.enable_lora else None)

        if self.image_input_type:
            if (not self.image_token_id or not self.image_input_shape
                    or not self.image_feature_size):
                raise ValueError(
                    "Specify `image_token_id`, `image_input_shape` and "
                    "`image_feature_size` together with `image_input_type`.")
            vision_language_config = VisionLanguageConfig(
                image_input_type=VisionLanguageConfig.
                get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
            )
        else:
            vision_language_config = None
        decoding_config = DecodingConfig(
            guided_decoding_backend=self.guided_decoding_backend)

        return EngineConfig(model_config=model_config,
                            cache_config=cache_config,
                            parallel_config=parallel_config,
                            scheduler_config=scheduler_config,
                            device_config=device_config,
                            lora_config=lora_config,
                            vision_language_config=vision_language_config,
                            speculative_config=speculative_config,
                            decoding_config=decoding_config)


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous Aphrodite engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: int = 0

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument(
            "--engine-use-ray",
            action="store_true",
            help="use Ray to start the LLM engine in a "
            "separate process as the server process.",
        )
        parser.add_argument(
            "--disable-log-requests",
            action="store_true",
            help="disable logging requests",
        )
        parser.add_argument(
            "--max-log-len",
            type=int,
            default=0,
            help="max number of prompt characters or prompt "
            "ID numbers being printed in log. "
            "Default: unlimited.",
        )
        return parser
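

# The block below is an illustrative sketch, not part of the engine itself:
# it shows one way these dataclasses are typically wired together. A real
# server entrypoint may build the parser and handle the args differently.
if __name__ == "__main__":
    # Register all engine flags (including the async-only ones) on a parser,
    # then parse the command line.
    parser = argparse.ArgumentParser(description="Aphrodite engine arguments")
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    # Recover the dataclass from the parsed namespace and expand it into the
    # individual config objects consumed by the engine. Note that building
    # the EngineConfig resolves the model, so it needs the model to be
    # reachable (e.g. downloadable from the Hugging Face Hub).
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine_config = engine_args.create_engine_config()
    print(engine_config)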