# args_tools.py — engine argument definitions.
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
                                     SchedulerConfig)
  7. @dataclass
  8. class EngineArgs:
  9. """Arguments for the Aphrodite engine."""
  10. model: str
  11. tokenizer: Optional[str] = None
  12. tokenizer_mode: str = 'auto'
  13. trust_remote_code: bool = False
  14. download_dir: Optional[str] = None
  15. load_format: str = 'auto'
  16. dtype: str = 'auto'
  17. seed: int = 0
  18. max_model_len: Optional[int] = None
  19. worker_use_ray: bool = False
  20. pipeline_parallel_size: int = 1
  21. tensor_parallel_size: int = 1
  22. block_size: int = 16
  23. swap_space: int = 4 # GiB
  24. gpu_memory_utilization: float = 0.90
  25. max_num_batched_tokens: Optional[int] = None
  26. max_num_seqs: int = 256
  27. max_paddings: int = 256
  28. disable_log_stats: bool = False
  29. revision: Optional[str] = None
  30. quantization: Optional[str] = None
  31. def __post_init__(self):
  32. if self.tokenizer is None:
  33. self.tokenizer = self.model
  34. @staticmethod
  35. def add_cli_args(
  36. parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
  37. """Shared CLI arguments for the Aphrodite engine."""
  38. # Model arguments
  39. parser.add_argument(
  40. '--model',
  41. type=str,
  42. default='facebook/opt-125m',
  43. help='name or path of the huggingface model to use')
  44. parser.add_argument(
  45. '--tokenizer',
  46. type=str,
  47. default=EngineArgs.tokenizer,
  48. help='name or path of the huggingface tokenizer to use')
  49. parser.add_argument(
  50. '--revision',
  51. type=str,
  52. default=None,
  53. help='the specific model version to use. It can be a branch '
  54. 'name, a tag name, or a commit id. If unspecified, will use '
  55. 'the default version.')
  56. parser.add_argument('--tokenizer-mode',
  57. type=str,
  58. default=EngineArgs.tokenizer_mode,
  59. choices=['auto', 'slow'],
  60. help='tokenizer mode. "auto" will use the fast '
  61. 'tokenizer if available, and "slow" will '
  62. 'always use the slow tokenizer.')
  63. parser.add_argument('--trust-remote-code',
  64. action='store_true',
  65. help='trust remote code from huggingface')
  66. parser.add_argument('--download-dir',
  67. type=str,
  68. default=EngineArgs.download_dir,
  69. help='directory to download and load the weights, '
  70. 'default to the default cache dir of '
  71. 'huggingface')
  72. parser.add_argument(
  73. '--load-format',
  74. type=str,
  75. default=EngineArgs.load_format,
  76. choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
  77. help='The format of the model weights to load. '
  78. '"auto" will try to load the weights in the safetensors format '
  79. 'and fall back to the pytorch bin format if safetensors format '
  80. 'is not available. '
  81. '"pt" will load the weights in the pytorch bin format. '
  82. '"safetensors" will load the weights in the safetensors format. '
  83. '"npcache" will load the weights in pytorch format and store '
  84. 'a numpy cache to speed up the loading. '
  85. '"dummy" will initialize the weights with random values, '
  86. 'which is mainly for profiling.')
  87. parser.add_argument(
  88. '--dtype',
  89. type=str,
  90. default=EngineArgs.dtype,
  91. choices=[
  92. 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
  93. ],
  94. help='data type for model weights and activations. '
  95. 'The "auto" option will use FP16 precision '
  96. 'for FP32 and FP16 models, and BF16 precision '
  97. 'for BF16 models.')
  98. parser.add_argument('--max-model-len',
  99. type=int,
  100. default=None,
  101. help='model context length. If unspecified, '
  102. 'will be automatically derived from the model.')
  103. # Parallel arguments
  104. parser.add_argument('--worker-use-ray',
  105. action='store_true',
  106. help='use Ray for distributed serving, will be '
  107. 'automatically set when using more than 1 GPU')
  108. parser.add_argument('--pipeline-parallel-size',
  109. '-pp',
  110. type=int,
  111. default=EngineArgs.pipeline_parallel_size,
  112. help='number of pipeline stages')
  113. parser.add_argument('--tensor-parallel-size',
  114. '-tp',
  115. type=int,
  116. default=EngineArgs.tensor_parallel_size,
  117. help='number of tensor parallel replicas')
  118. # KV cache arguments
  119. parser.add_argument('--block-size',
  120. type=int,
  121. default=EngineArgs.block_size,
  122. choices=[8, 16, 32],
  123. help='token block size')
  124. # TODO: Support fine-grained seeds (e.g., seed per request).
  125. parser.add_argument('--seed',
  126. type=int,
  127. default=EngineArgs.seed,
  128. help='random seed')
  129. parser.add_argument('--swap-space',
  130. type=int,
  131. default=EngineArgs.swap_space,
  132. help='CPU swap space size (GiB) per GPU')
  133. parser.add_argument('--gpu-memory-utilization',
  134. type=float,
  135. default=EngineArgs.gpu_memory_utilization,
  136. help='the percentage of GPU memory to be used for'
  137. 'the model executor')
  138. parser.add_argument('--max-num-batched-tokens',
  139. type=int,
  140. default=EngineArgs.max_num_batched_tokens,
  141. help='maximum number of batched tokens per '
  142. 'iteration')
  143. parser.add_argument('--max-num-seqs',
  144. type=int,
  145. default=EngineArgs.max_num_seqs,
  146. help='maximum number of sequences per iteration')
  147. parser.add_argument('--max-paddings',
  148. type=int,
  149. default=EngineArgs.max_paddings,
  150. help='maximum number of paddings in a batch')
  151. parser.add_argument('--disable-log-stats',
  152. action='store_true',
  153. help='disable logging statistics')
  154. # Quantization settings.
  155. parser.add_argument('--quantization',
  156. '-q',
  157. type=str,
  158. choices=['awq', 'gptq', None],
  159. default=None,
  160. help='Method used to quantize the weights')
  161. return parser
  162. @classmethod
  163. def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
  164. # Get the list of attributes of this dataclass.
  165. attrs = [attr.name for attr in dataclasses.fields(cls)]
  166. # Set the attributes from the parsed arguments.
  167. engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
  168. return engine_args
  169. def create_engine_configs(
  170. self,
  171. ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
  172. model_config = ModelConfig(self.model, self.tokenizer,
  173. self.tokenizer_mode, self.trust_remote_code,
  174. self.download_dir, self.load_format,
  175. self.dtype, self.seed, self.revision,
  176. self.max_model_len, self.quantization)
  177. cache_config = CacheConfig(
  178. self.block_size, self.gpu_memory_utilization, self.swap_space,
  179. getattr(model_config.hf_config, 'sliding_window', None))
  180. parallel_config = ParallelConfig(self.pipeline_parallel_size,
  181. self.tensor_parallel_size,
  182. self.worker_use_ray)
  183. scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
  184. self.max_num_seqs,
  185. model_config.max_model_len,
  186. self.max_paddings)
  187. return model_config, cache_config, parallel_config, scheduler_config
  188. @dataclass
  189. class AsyncEngineArgs(EngineArgs):
  190. """Arguments for asynchronous Aohrodite engine."""
  191. engine_use_ray: bool = False
  192. disable_log_requests: bool = False
  193. max_log_len: Optional[int] = None
  194. @staticmethod
  195. def add_cli_args(
  196. parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
  197. parser = EngineArgs.add_cli_args(parser)
  198. parser.add_argument('--engine-use-ray',
  199. action='store_true',
  200. help='use Ray to start the LLM engine in a '
  201. 'separate process as the server process.')
  202. parser.add_argument('--disable-log-requests',
  203. action='store_true',
  204. help='disable logging requests')
  205. parser.add_argument('--max-log-len',
  206. type=int,
  207. default=None,
  208. help='max number of prompt characters or prompt '
  209. 'ID numbers being printed in log. '
  210. 'Default: unlimited.')
  211. return parser