import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
                                     SchedulerConfig)


@dataclass
class EngineArgs:
    """Arguments for the Aphrodite engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    kv_cache_dtype: Optional[str] = None

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for the Aphrodite engine."""
        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='EleutherAI/pythia-70m-deduped',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the '
                            'weights, defaults to the default cache dir '
                            'of huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='the format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        parser.add_argument(
            '--max-parallel-loading-workers',
            '-mplw',
            type=int,
            help='load model sequentially in multiple batches, '
            'to avoid CPU OOM when using tensor parallel '
            'with large models.')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO: Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            '-gmu',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the fraction of GPU memory to be used for '
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            '-mnbt',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', 'squeezellm', 'gptq', None],
                            default=None,
                            help='method used to quantize the weights')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='always use eager-mode PyTorch. If not set, '
                            'eager mode and CUDA graphs will be used in '
                            'hybrid for maximal performance and '
                            'flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        parser.add_argument('--kv-cache-dtype',
                            type=str,
                            choices=['fp8', None],
                            default=None,
                            help='data type for the KV cache.')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode,
                                   self.trust_remote_code,
                                   self.download_dir, self.load_format,
                                   self.dtype, self.seed, self.revision,
                                   self.max_model_len, self.quantization,
                                   self.enforce_eager,
                                   self.max_context_len_to_capture)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        return model_config, cache_config, parallel_config, scheduler_config


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for the asynchronous Aphrodite engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
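

# Minimal usage sketch (not part of the original module): it assumes the
# `aphrodite.common.config` classes imported above are importable at runtime,
# and shows how the pieces are meant to be wired together -- register the
# flags on a parser, parse argv, rebuild the dataclass from the namespace,
# then derive the four engine config objects.
if __name__ == '__main__':
    # Collect all engine flags (sync + async) on a fresh parser.
    example_parser = argparse.ArgumentParser(
        description='Aphrodite engine arguments demo')
    AsyncEngineArgs.add_cli_args(example_parser)
    # e.g. `python args_tools.py --model EleutherAI/pythia-70m-deduped -tp 2`
    parsed = example_parser.parse_args()
    # from_cli_args() picks out exactly the namespace entries that match the
    # dataclass fields, so extra argparse entries would be ignored.
    engine_args = AsyncEngineArgs.from_cli_args(parsed)
    print(engine_args)
    # create_engine_configs() turns the flat argument set into the config
    # objects consumed by the engine.
    model_config, cache_config, parallel_config, scheduler_config = (
        engine_args.create_engine_configs())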