# args_tools.py

import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
                                     SchedulerConfig, LoRAConfig, DeviceConfig)


@dataclass
class EngineArgs:
    """Arguments for the Aphrodite engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    kv_quant_params_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    context_shift: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    max_log_probs: int = 10
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    load_in_4bit: bool = False
    load_in_8bit: bool = False
    load_in_smooth: bool = False
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'cuda'

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for the Aphrodite engine."""

        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst

        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='EleutherAI/pythia-70m-deduped',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=str,
            default=None,
            help='the specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the '
                            'weights, defaults to the default cache dir '
                            'of huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='The format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8_e5m2', 'int8'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. Note FP8 is not supported when cuda version is '
            'lower than 11.8.')
        parser.add_argument(
            '--kv-quant-params-path',
            type=str,
            default=EngineArgs.kv_quant_params_path,
            help='Path to scales and zero points of KV cache '
            'quantization. Only applicable when kv-cache-dtype '
            'is int8.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=EngineArgs.max_model_len,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        parser.add_argument(
            '--max-parallel-loading-workers',
            type=int,
            default=EngineArgs.max_parallel_loading_workers,
            help='load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallel and large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        parser.add_argument('--context-shift',
                            action='store_true',
                            help='Enable context shifting.')
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument(
            '--gpu-memory-utilization',
            '-gmu',
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help='the fraction of GPU memory to be used for '
            'the model executor, which can range from 0 to 1. '
            'If unspecified, will use the default value of 0.9.')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--max-log-probs',
                            type=int,
                            default=EngineArgs.max_log_probs,
                            help='maximum number of log probabilities to '
                            'return.')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=[
                                'aqlm', 'awq', 'bnb', 'exl2', 'gguf', 'gptq',
                                'quip', 'squeezellm', 'marlin', None
                            ],
                            default=EngineArgs.quantization,
                            help='Method used to quantize the weights. If '
                            'None, we first check the `quantization_config` '
                            'attribute in the model config file. If that is '
                            'None, we assume the model weights are not '
                            'quantized and use `dtype` to determine the data '
                            'type of the weights.')
        parser.add_argument('--load-in-4bit',
                            action='store_true',
                            help='Load the FP16 model in 4-bit format. Also '
                            'works with AWQ models. Throughput at 2.5x of '
                            'FP16.')
        parser.add_argument('--load-in-8bit',
                            action='store_true',
                            help='Load the FP16 model in 8-bit format. '
                            'Throughput at 0.3x of FP16.')
        parser.add_argument('--load-in-smooth',
                            action='store_true',
                            help='Load the FP16 model in smoothquant '
                            '8-bit format. Throughput at 0.7x of FP16.')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='Always use eager-mode PyTorch. If False, '
                            'will use eager mode and CUDA graph in hybrid '
                            'for maximal performance and flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        parser.add_argument('--disable-custom-all-reduce',
                            action='store_true',
                            default=EngineArgs.disable_custom_all_reduce,
                            help='See ParallelConfig')
        # LoRA related configs
        parser.add_argument('--enable-lora',
                            action='store_true',
                            help='If True, enable handling of LoRA adapters.')
        parser.add_argument('--max-loras',
                            type=int,
                            default=EngineArgs.max_loras,
                            help='Max number of LoRAs in a single batch.')
        parser.add_argument('--max-lora-rank',
                            type=int,
                            default=EngineArgs.max_lora_rank,
                            help='Max LoRA rank.')
        parser.add_argument(
            '--lora-extra-vocab-size',
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=('Maximum size of extra vocabulary that can be '
                  'present in a LoRA adapter (added to the base '
                  'model vocabulary).'))
        parser.add_argument(
            '--lora-dtype',
            type=str,
            default=EngineArgs.lora_dtype,
            choices=['auto', 'float16', 'bfloat16', 'float32'],
            help=('Data type for LoRA. If auto, will default to '
                  'base model dtype.'))
        parser.add_argument(
            '--max-cpu-loras',
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=('Maximum number of LoRAs to store in CPU memory. '
                  'Must be >= max_num_seqs. '
                  'Defaults to max_num_seqs.'))
        parser.add_argument('--device',
                            type=str,
                            default=EngineArgs.device,
                            choices=['cuda'],
                            help=('Device to use for model execution. '
                                  'Currently, only "cuda" is supported.'))
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
               DeviceConfig, Optional[LoRAConfig]]:
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(
            self.model, self.tokenizer, self.tokenizer_mode,
            self.trust_remote_code, self.download_dir, self.load_format,
            self.dtype, self.seed, self.revision, self.tokenizer_revision,
            self.max_model_len, self.quantization, self.load_in_4bit,
            self.load_in_8bit, self.load_in_smooth, self.enforce_eager,
            self.max_context_len_to_capture, self.max_log_probs)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
                                   self.kv_quant_params_path,
                                   model_config.get_sliding_window(),
                                   self.context_shift)
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers,
                                         self.disable_custom_all_reduce)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
        return (model_config, cache_config, parallel_config, scheduler_config,
                device_config, lora_config)
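

# Minimal usage sketch: building EngineArgs directly in Python and expanding
# it into the individual config objects. The helper name and the argument
# values below are illustrative only (the model name is just the parser's
# default); creating the configs requires the model to be resolvable.
def _example_engine_args_usage():
    """Illustrative helper showing EngineArgs -> config objects."""
    engine_args = EngineArgs(model='EleutherAI/pythia-70m-deduped',
                             max_num_seqs=64,
                             gpu_memory_utilization=0.85)
    # __post_init__ falls back to the model path for the tokenizer.
    assert engine_args.tokenizer == engine_args.model
    (model_config, cache_config, parallel_config, scheduler_config,
     device_config, lora_config) = engine_args.create_engine_configs()
    # lora_config is None unless enable_lora=True was passed.
    return (model_config, cache_config, parallel_config, scheduler_config,
            device_config, lora_config)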


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous Aphrodite engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
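

# CLI sketch, assuming this module is run directly (the classes above are
# normally imported by the engine entrypoints instead). It shows how
# add_cli_args, from_cli_args and create_engine_configs fit together; the
# printed summary is illustrative.
if __name__ == '__main__':
    cli_parser = argparse.ArgumentParser(
        description='Aphrodite engine arguments (usage sketch)')
    cli_parser = AsyncEngineArgs.add_cli_args(cli_parser)
    parsed_args = cli_parser.parse_args()
    async_engine_args = AsyncEngineArgs.from_cli_args(parsed_args)
    engine_configs = async_engine_args.create_engine_configs()
    print(f'model: {async_engine_args.model}')
    print(f'configs: {[type(cfg).__name__ for cfg in engine_configs]}')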