import argparse import dataclasses from dataclasses import dataclass from typing import Optional, Tuple from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig, DeviceConfig) @dataclass class EngineArgs: """Arguments for the Aphrodite engine.""" model: str tokenizer: Optional[str] = None tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' kv_quant_params_path: str = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 context_shift: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None max_num_seqs: int = 256 max_paddings: int = 256 max_log_probs: int = 10 disable_log_stats: bool = False revision: Optional[str] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None load_in_4bit: bool = False load_in_8bit: bool = False load_in_smooth: bool = False enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False enable_lora: bool = False max_loras: int = 1 max_lora_rank: int = 16 lora_extra_vocab_size: int = 256 lora_dtype = 'auto' max_cpu_loras: Optional[int] = None device: str = 'cuda' def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model @staticmethod def add_cli_args( parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """Shared CLI arguments for the Aphrodite engine.""" # NOTE: If you update any of the arguments below, please also # make sure to update docs/source/models/engine_args.rst # Model arguments parser.add_argument( '--model', type=str, default='EleutherAI/pythia-70m-deduped', help='name or path of the huggingface model to use') parser.add_argument( '--tokenizer', type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') parser.add_argument( '--revision', type=str, default=None, help='the specific model version to use. It can be a branch ' 'name, a tag name, or a commit id. If unspecified, will use ' 'the default version.') parser.add_argument( '--tokenizer-revision', type=str, default=None, help='the specific tokenizer version to use. It can be a branch ' 'name, a tag name, or a commit id. If unspecified, will use ' 'the default version.') parser.add_argument('--tokenizer-mode', type=str, default=EngineArgs.tokenizer_mode, choices=['auto', 'slow'], help='tokenizer mode. "auto" will use the fast ' 'tokenizer if available, and "slow" will ' 'always use the slow tokenizer.') parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') parser.add_argument('--download-dir', type=str, default=EngineArgs.download_dir, help='directory to download and load the weights, ' 'default to the default cache dir of ' 'huggingface') parser.add_argument( '--load-format', type=str, default=EngineArgs.load_format, choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'], help='The format of the model weights to load. ' '"auto" will try to load the weights in the safetensors format ' 'and fall back to the pytorch bin format if safetensors format ' 'is not available. ' '"pt" will load the weights in the pytorch bin format. ' '"safetensors" will load the weights in the safetensors format. ' '"npcache" will load the weights in pytorch format and store ' 'a numpy cache to speed up the loading. ' '"dummy" will initialize the weights with random values, ' 'which is mainly for profiling.') parser.add_argument( '--dtype', type=str, default=EngineArgs.dtype, choices=[ 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32' ], help='data type for model weights and activations. ' 'The "auto" option will use FP16 precision ' 'for FP32 and FP16 models, and BF16 precision ' 'for BF16 models.') parser.add_argument( '--kv-cache-dtype', type=str, choices=['auto', 'fp8_e5m2', 'int8'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. Note FP8 is not supported when cuda version is ' 'lower than 11.8.') parser.add_argument( '--kv-quant-params-path', type=str, default=EngineArgs.kv_quant_params_path, help='Path to scales and zero points of KV cache ' 'quantization. Only applicable when kv-cache-dtype ' 'is int8.') parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, help='model context length. If unspecified, ' 'will be automatically derived from the model.') # Parallel arguments parser.add_argument('--worker-use-ray', action='store_true', help='use Ray for distributed serving, will be ' 'automatically set when using more than 1 GPU') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, default=EngineArgs.pipeline_parallel_size, help='number of pipeline stages') parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=EngineArgs.tensor_parallel_size, help='number of tensor parallel replicas') parser.add_argument( '--max-parallel-loading-workers', type=int, default=EngineArgs.max_parallel_loading_workers, help='load model sequentially in multiple batches, ' 'to avoid RAM OOM when using tensor ' 'parallel and large models') # KV cache arguments parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, choices=[8, 16, 32], help='token block size') parser.add_argument('--context-shift', action='store_true', help='Enable context shifting.') parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed') parser.add_argument('--swap-space', type=int, default=EngineArgs.swap_space, help='CPU swap space size (GiB) per GPU') parser.add_argument( '--gpu-memory-utilization', '-gmu', type=float, default=EngineArgs.gpu_memory_utilization, help='the fraction of GPU memory to be used for ' 'the model executor, which can range from 0 to 1.' 'If unspecified, will use the default value of 0.9.') parser.add_argument('--max-num-batched-tokens', type=int, default=EngineArgs.max_num_batched_tokens, help='maximum number of batched tokens per ' 'iteration') parser.add_argument('--max-num-seqs', type=int, default=EngineArgs.max_num_seqs, help='maximum number of sequences per iteration') parser.add_argument('--max-paddings', type=int, default=EngineArgs.max_paddings, help='maximum number of paddings in a batch') parser.add_argument('--max-log-probs', type=int, default=EngineArgs.max_log_probs, help='maximum number of log probabilities to ' 'return.') parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics') # Quantization settings. parser.add_argument('--quantization', '-q', type=str, choices=[ 'aqlm', 'awq', 'bnb', 'exl2', 'gguf', 'gptq', 'quip', 'squeezellm', 'marlin', None ], default=EngineArgs.quantization, help='Method used to quantize the weights. If ' 'None, we first check the `quantization_config` ' 'attribute in the model config file. If that is ' 'None, we assume the model weights are not ' 'quantized and use `dtype` to determine the data ' 'type of the weights.') parser.add_argument('--load-in-4bit', action='store_true', help='Load the FP16 model in 4-bit format. Also ' 'works with AWQ models. Throughput at 2.5x of ' 'FP16.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the FP16 model in 8-bit format. ' 'Throughput at 0.3x of FP16.') parser.add_argument('--load-in-smooth', action='store_true', help='Load the FP16 model in smoothquant ' '8bit format. Throughput at 0.7x of FP16. ') parser.add_argument('--enforce-eager', action='store_true', help='Always use eager-mode PyTorch. If False, ' 'will use eager mode and CUDA graph in hybrid ' 'for maximal performance and flexibility.') parser.add_argument('--max-context-len-to-capture', type=int, default=EngineArgs.max_context_len_to_capture, help='maximum context length covered by CUDA ' 'graphs. When a sequence has context length ' 'larger than this, we fall back to eager mode.') parser.add_argument('--disable-custom-all-reduce', action='store_true', default=EngineArgs.disable_custom_all_reduce, help='See ParallelConfig') # LoRA related configs parser.add_argument('--enable-lora', action='store_true', help='If True, enable handling of LoRA adapters.') parser.add_argument('--max-loras', type=int, default=EngineArgs.max_loras, help='Max number of LoRAs in a single batch.') parser.add_argument('--max-lora-rank', type=int, default=EngineArgs.max_lora_rank, help='Max LoRA rank.') parser.add_argument( '--lora-extra-vocab-size', type=int, default=EngineArgs.lora_extra_vocab_size, help=('Maximum size of extra vocabulary that can be ' 'present in a LoRA adapter (added to the base ' 'model vocabulary).')) parser.add_argument( '--lora-dtype', type=str, default=EngineArgs.lora_dtype, choices=['auto', 'float16', 'bfloat16', 'float32'], help=('Data type for LoRA. If auto, will default to ' 'base model dtype.')) parser.add_argument( '--max-cpu-loras', type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' 'Must be >= than max_num_seqs. ' 'Defaults to max_num_seqs.')) parser.add_argument('--device', type=str, default=EngineArgs.device, choices=['cuda'], help=('Device to use for model execution. ' 'Currently, only "cuda" is supported.')) return parser @classmethod def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs': # Get the list of attributes of this dataclass. attrs = [attr.name for attr in dataclasses.fields(cls)] # Set the attributes from the parsed arguments. engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) return engine_args def create_engine_configs( self, ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]: device_config = DeviceConfig(self.device) model_config = ModelConfig( self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.download_dir, self.load_format, self.dtype, self.seed, self.revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.load_in_4bit, self.load_in_8bit, self.load_in_smooth, self.enforce_eager, self.max_context_len_to_capture, self.max_log_probs) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, self.kv_quant_params_path, model_config.get_sliding_window(), self.context_shift) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, self.max_parallel_loading_workers, self.disable_custom_all_reduce) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, self.max_paddings) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, lora_extra_vocab_size=self.lora_extra_vocab_size, lora_dtype=self.lora_dtype, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else None) if self.enable_lora else None return (model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config) @dataclass class AsyncEngineArgs(EngineArgs): """Arguments for asynchronous Aphrodite engine.""" engine_use_ray: bool = False disable_log_requests: bool = False max_log_len: Optional[int] = None @staticmethod def add_cli_args( parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser = EngineArgs.add_cli_args(parser) parser.add_argument('--engine-use-ray', action='store_true', help='use Ray to start the LLM engine in a ' 'separate process as the server process.') parser.add_argument('--disable-log-requests', action='store_true', help='disable logging requests') parser.add_argument('--max-log-len', type=int, default=None, help='max number of prompt characters or prompt ' 'ID numbers being printed in log. ' 'Default: unlimited.') return parser