import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
                                     SchedulerConfig, LoRAConfig, DeviceConfig)


@dataclass
class EngineArgs:
    """Arguments for the Aphrodite engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    kv_quant_params_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    context_shift: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    max_log_probs: int = 10
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    load_in_4bit: bool = False
    load_in_8bit: bool = False
    load_in_smooth: bool = False
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'cuda'

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for the Aphrodite engine."""
        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst

        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='EleutherAI/pythia-70m-deduped',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=str,
            default=None,
            help='the specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='The format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8_e5m2', 'int8'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. Note FP8 is not supported when cuda version is '
            'lower than 11.8.')
        parser.add_argument(
            '--kv-quant-params-path',
            type=str,
            default=EngineArgs.kv_quant_params_path,
            help='Path to scales and zero points of KV cache '
            'quantization. Only applicable when kv-cache-dtype '
            'is int8.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=EngineArgs.max_model_len,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        parser.add_argument(
            '--max-parallel-loading-workers',
            type=int,
            default=EngineArgs.max_parallel_loading_workers,
            help='load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallel and large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        parser.add_argument('--context-shift',
                            action='store_true',
                            help='Enable context shifting.')
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument(
            '--gpu-memory-utilization',
            '-gmu',
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help='the fraction of GPU memory to be used for '
            'the model executor, which can range from 0 to 1. '
            'If unspecified, will use the default value of 0.9.')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--max-log-probs',
                            type=int,
                            default=EngineArgs.max_log_probs,
                            help='maximum number of log probabilities to '
                            'return.')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=[
                                'aqlm', 'awq', 'bnb', 'exl2', 'gguf', 'gptq',
                                'quip', 'squeezellm', 'marlin', None
                            ],
                            default=EngineArgs.quantization,
                            help='Method used to quantize the weights. If '
                            'None, we first check the `quantization_config` '
                            'attribute in the model config file. If that is '
                            'None, we assume the model weights are not '
                            'quantized and use `dtype` to determine the data '
                            'type of the weights.')
        parser.add_argument('--load-in-4bit',
                            action='store_true',
                            help='Load the FP16 model in 4-bit format. Also '
                            'works with AWQ models. Throughput at 2.5x of '
                            'FP16.')
        parser.add_argument('--load-in-8bit',
                            action='store_true',
                            help='Load the FP16 model in 8-bit format. '
                            'Throughput at 0.3x of FP16.')
        parser.add_argument('--load-in-smooth',
                            action='store_true',
                            help='Load the FP16 model in smoothquant '
                            '8bit format. Throughput at 0.7x of FP16.')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='Always use eager-mode PyTorch. If False, '
                            'will use eager mode and CUDA graph in hybrid '
                            'for maximal performance and flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        parser.add_argument('--disable-custom-all-reduce',
                            action='store_true',
                            default=EngineArgs.disable_custom_all_reduce,
                            help='See ParallelConfig')
        # LoRA related configs
        parser.add_argument('--enable-lora',
                            action='store_true',
                            help='If True, enable handling of LoRA adapters.')
        parser.add_argument('--max-loras',
                            type=int,
                            default=EngineArgs.max_loras,
                            help='Max number of LoRAs in a single batch.')
        parser.add_argument('--max-lora-rank',
                            type=int,
                            default=EngineArgs.max_lora_rank,
                            help='Max LoRA rank.')
        parser.add_argument(
            '--lora-extra-vocab-size',
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=('Maximum size of extra vocabulary that can be '
                  'present in a LoRA adapter (added to the base '
                  'model vocabulary).'))
        parser.add_argument(
            '--lora-dtype',
            type=str,
            default=EngineArgs.lora_dtype,
            choices=['auto', 'float16', 'bfloat16', 'float32'],
            help=('Data type for LoRA. If auto, will default to '
                  'base model dtype.'))
        parser.add_argument(
            '--max-cpu-loras',
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=('Maximum number of LoRAs to store in CPU memory. '
                  'Must be >= max_num_seqs. '
                  'Defaults to max_num_seqs.'))
        parser.add_argument('--device',
                            type=str,
                            default=EngineArgs.device,
                            choices=['cuda'],
                            help=('Device to use for model execution. '
                                  'Currently, only "cuda" is supported.'))
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
               DeviceConfig, Optional[LoRAConfig]]:
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(
            self.model, self.tokenizer, self.tokenizer_mode,
            self.trust_remote_code, self.download_dir, self.load_format,
            self.dtype, self.seed, self.revision, self.tokenizer_revision,
            self.max_model_len, self.quantization, self.load_in_4bit,
            self.load_in_8bit, self.load_in_smooth, self.enforce_eager,
            self.max_context_len_to_capture, self.max_log_probs)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
                                   self.kv_quant_params_path,
                                   model_config.get_sliding_window(),
                                   self.context_shift)
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers,
                                         self.disable_custom_all_reduce)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        # Only build a LoRAConfig when --enable-lora is set; otherwise pass
        # None through to the engine.
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
        return (model_config, cache_config, parallel_config, scheduler_config,
                device_config, lora_config)


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous Aphrodite engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
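

# A minimal usage sketch added for illustration (an assumption, not part of
# the original module or its public entry points): build a parser from the
# shared CLI arguments, rebuild the dataclass from the parsed namespace, and
# materialize the engine config objects. Constructing ModelConfig requires
# the HuggingFace model config to be resolvable locally or over the network.
if __name__ == '__main__':
    _parser = argparse.ArgumentParser(description='Aphrodite engine arguments')
    _parser = AsyncEngineArgs.add_cli_args(_parser)
    _args = _parser.parse_args()
    _engine_args = AsyncEngineArgs.from_cli_args(_args)
    (_model_config, _cache_config, _parallel_config, _scheduler_config,
     _device_config, _lora_config) = _engine_args.create_engine_configs()
    print(f'max_model_len={_model_config.max_model_len}, '
          f'block_size={_cache_config.block_size}')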