# args_tools.py
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
                                     SchedulerConfig, LoRAConfig, DeviceConfig)


@dataclass
class EngineArgs:
    """Arguments for the Aphrodite engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    # The type annotation is required here: an unannotated assignment is a
    # plain class attribute, not a dataclass field, so `from_cli_args` would
    # silently drop the `--lora-dtype` value.
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'cuda'

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model
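
    # NOTE: `add_cli_args` takes its defaults from the dataclass attributes
    # above (e.g. `default=EngineArgs.tokenizer`), which keeps the CLI and the
    # programmatic defaults in sync.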
    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for the Aphrodite engine."""
        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst
        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='EleutherAI/pythia-70m-deduped',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=str,
            default=None,
            help='the specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='The format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8_e5m2'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. Note FP8 is not supported when the CUDA version is '
            'lower than 11.8.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=EngineArgs.max_model_len,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        parser.add_argument(
            '--max-parallel-loading-workers',
            type=int,
            default=EngineArgs.max_parallel_loading_workers,
            help='load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallel and large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO: Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument(
            '--gpu-memory-utilization',
            '-gmu',
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help='the fraction of GPU memory to be used for '
            'the model executor, which can range from 0 to 1. '
            'If unspecified, will use the default value of 0.9.')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument(
            '--quantization',
            '-q',
            type=str,
            choices=['awq', 'gguf', 'gptq', 'quip', 'squeezellm', None],
            default=EngineArgs.quantization,
            help='Method used to quantize the weights. If '
            'None, we first check the `quantization_config` '
            'attribute in the model config file. If that is '
            'None, we assume the model weights are not '
            'quantized and use `dtype` to determine the data '
            'type of the weights.')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='Always use eager-mode PyTorch. If not set, '
                            'will use eager mode and CUDA graphs in hybrid '
                            'for maximal performance and flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        parser.add_argument('--disable-custom-all-reduce',
                            action='store_true',
                            default=EngineArgs.disable_custom_all_reduce,
                            help='See ParallelConfig')
        # LoRA related configs
        parser.add_argument('--enable-lora',
                            action='store_true',
                            help='If True, enable handling of LoRA adapters.')
        parser.add_argument('--max-loras',
                            type=int,
                            default=EngineArgs.max_loras,
                            help='Max number of LoRAs in a single batch.')
        parser.add_argument('--max-lora-rank',
                            type=int,
                            default=EngineArgs.max_lora_rank,
                            help='Max LoRA rank.')
        parser.add_argument(
            '--lora-extra-vocab-size',
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=('Maximum size of extra vocabulary that can be '
                  'present in a LoRA adapter (added to the base '
                  'model vocabulary).'))
        parser.add_argument(
            '--lora-dtype',
            type=str,
            default=EngineArgs.lora_dtype,
            choices=['auto', 'float16', 'bfloat16', 'float32'],
            help=('Data type for LoRA. If auto, will default to '
                  'base model dtype.'))
        parser.add_argument(
            '--max-cpu-loras',
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=('Maximum number of LoRAs to store in CPU memory. '
                  'Must be >= max_num_seqs. '
                  'Defaults to max_num_seqs.'))
        parser.add_argument('--device',
                            type=str,
                            default=EngineArgs.device,
                            choices=['cuda'],
                            help=('Device to use for model execution. '
                                  'Currently, only "cuda" is supported.'))
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
               DeviceConfig, Optional[LoRAConfig]]:
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode, self.trust_remote_code,
                                   self.download_dir, self.load_format,
                                   self.dtype, self.seed, self.revision,
                                   self.tokenizer_revision, self.max_model_len,
                                   self.quantization, self.enforce_eager,
                                   self.max_context_len_to_capture)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers,
                                         self.disable_custom_all_reduce)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        # The LoRA config is optional: build it only when LoRA handling is
        # enabled, and treat a non-positive max_cpu_loras as unset.
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
        return (model_config, cache_config, parallel_config, scheduler_config,
                device_config, lora_config)
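

# AsyncEngineArgs extends the shared engine arguments with options specific to
# the asynchronous entrypoints, where the engine may run in a separate Ray
# process from the server and request logging can be tuned.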
@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous Aphrodite engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
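

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of the intended flow, assuming the aphrodite package is
# importable: build a parser, parse flags, and construct the args dataclass.
# The flag values below are made up for illustration.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Aphrodite engine arguments')
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args(['--model', 'EleutherAI/pythia-70m-deduped',
                              '--max-num-seqs', '64'])
    engine_args = AsyncEngineArgs.from_cli_args(args)
    print(f'Parsed engine args for model: {engine_args.model}')
    # With the model available locally, the configs consumed by the engine
    # would then be produced via:
    # (model_config, cache_config, parallel_config, scheduler_config,
    #  device_config, lora_config) = engine_args.create_engine_configs()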