latency.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. """Benchmark the latency of processing a single batch of requests."""
  2. import argparse
  3. import json
  4. import time
  5. from pathlib import Path
  6. from typing import List, Optional
  7. import numpy as np
  8. import torch
  9. from tqdm import tqdm
  10. from aphrodite import LLM, SamplingParams
  11. from aphrodite.common.utils import FlexibleArgumentParser
  12. from aphrodite.engine.args_tools import EngineArgs
  13. from aphrodite.inputs import PromptInputs
  14. from aphrodite.quantization import QUANTIZATION_METHODS
  15. def main(args: argparse.Namespace):
  16. print(args)
  17. # NOTE: If the request cannot be processed in a single batch,
  18. # the engine will automatically process the request in multiple batches.
  19. llm = LLM(
  20. model=args.model,
  21. speculative_model=args.speculative_model,
  22. num_speculative_tokens=args.num_speculative_tokens,
  23. speculative_draft_tensor_parallel_size=\
  24. args.speculative_draft_tensor_parallel_size,
  25. ngram_prompt_lookup_max=args.ngram_prompt_lookup_max,
  26. ngram_prompt_lookup_min=args.ngram_prompt_lookup_min,
  27. tokenizer=args.tokenizer,
  28. quantization=args.quantization,
  29. tensor_parallel_size=args.tensor_parallel_size,
  30. trust_remote_code=args.trust_remote_code,
  31. dtype=args.dtype,
  32. max_model_len=args.max_model_len,
  33. enforce_eager=args.enforce_eager,
  34. kv_cache_dtype=args.kv_cache_dtype,
  35. quantization_param_path=args.quantization_param_path,
  36. device=args.device,
  37. ray_workers_use_nsight=args.ray_workers_use_nsight,
  38. use_v2_block_manager=args.use_v2_block_manager,
  39. enable_chunked_prefill=args.enable_chunked_prefill,
  40. download_dir=args.download_dir,
  41. block_size=args.block_size,
  42. gpu_memory_utilization=args.gpu_memory_utilization,
  43. load_format=args.load_format,
  44. distributed_executor_backend=args.distributed_executor_backend,
  45. enable_prefix_caching=args.enable_prefix_caching,
  46. )
  47. sampling_params = SamplingParams(
  48. n=args.n,
  49. temperature=0.0 if args.use_beam_search else 1.0,
  50. top_p=1.0,
  51. use_beam_search=args.use_beam_search,
  52. ignore_eos=True,
  53. max_tokens=args.output_len,
  54. )
  55. print(sampling_params)
  56. dummy_prompt_token_ids = np.random.randint(10000,
  57. size=(args.batch_size,
  58. args.input_len))
  59. dummy_inputs: List[PromptInputs] = [{
  60. "prompt_token_ids": batch
  61. } for batch in dummy_prompt_token_ids.tolist()]
  62. def run_to_completion(profile_dir: Optional[str] = None):
  63. if profile_dir:
  64. with torch.profiler.profile(
  65. activities=[
  66. torch.profiler.ProfilerActivity.CPU,
  67. torch.profiler.ProfilerActivity.CUDA,
  68. ],
  69. on_trace_ready=torch.profiler.tensorboard_trace_handler(
  70. str(profile_dir))) as p:
  71. llm.generate(dummy_inputs,
  72. sampling_params=sampling_params,
  73. use_tqdm=False)
  74. print(p.key_averages())
  75. else:
  76. start_time = time.perf_counter()
  77. llm.generate(dummy_inputs,
  78. sampling_params=sampling_params,
  79. use_tqdm=False)
  80. end_time = time.perf_counter()
  81. latency = end_time - start_time
  82. return latency
  83. print("Warming up...")
  84. for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
  85. run_to_completion(profile_dir=None)
  86. if args.profile:
  87. profile_dir = args.profile_result_dir
  88. if not profile_dir:
  89. profile_dir = Path(
  90. "."
  91. ) / "aphrodite_benchmark_result" / f"latency_result_{time.time()}"
  92. print(f"Profiling (results will be saved to '{profile_dir}')...")
  93. run_to_completion(profile_dir=profile_dir)
  94. return
  95. # Benchmark.
  96. latencies = []
  97. for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
  98. latencies.append(run_to_completion(profile_dir=None))
  99. latencies = np.array(latencies)
  100. percentages = [10, 25, 50, 75, 90, 99]
  101. percentiles = np.percentile(latencies, percentages)
  102. print(f'Avg latency: {np.mean(latencies)} seconds')
  103. for percentage, percentile in zip(percentages, percentiles):
  104. print(f'{percentage}% percentile latency: {percentile} seconds')
  105. # Output JSON results if specified
  106. if args.output_json:
  107. results = {
  108. "avg_latency": np.mean(latencies),
  109. "latencies": latencies.tolist(),
  110. "percentiles": dict(zip(percentages, percentiles.tolist())),
  111. }
  112. with open(args.output_json, "w") as f:
  113. json.dump(results, f, indent=4)
  114. if __name__ == '__main__':
  115. parser = FlexibleArgumentParser(
  116. description='Benchmark the latency of processing a single batch of '
  117. 'requests till completion.')
  118. parser.add_argument('--model', type=str, default='facebook/opt-125m')
  119. parser.add_argument('--speculative-model', type=str, default=None)
  120. parser.add_argument('--num-speculative-tokens', type=int, default=None)
  121. parser.add_argument('--speculative-draft-tensor-parallel-size',
  122. '-spec-draft-tp',
  123. type=int,
  124. default=None)
  125. parser.add_argument('--ngram-prompt-lookup-max', type=int, default=None)
  126. parser.add_argument('--ngram-prompt-lookup-min', type=int, default=None)
  127. parser.add_argument('--tokenizer', type=str, default=None)
  128. parser.add_argument('--quantization',
  129. '-q',
  130. choices=[*QUANTIZATION_METHODS, None],
  131. default=None)
  132. parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
  133. parser.add_argument('--input-len', type=int, default=32)
  134. parser.add_argument('--output-len', type=int, default=128)
  135. parser.add_argument('--batch-size', type=int, default=8)
  136. parser.add_argument('--n',
  137. type=int,
  138. default=1,
  139. help='Number of generated sequences per prompt.')
  140. parser.add_argument('--use-beam-search', action='store_true')
  141. parser.add_argument('--num-iters-warmup',
  142. type=int,
  143. default=10,
  144. help='Number of iterations to run for warmup.')
  145. parser.add_argument('--num-iters',
  146. type=int,
  147. default=30,
  148. help='Number of iterations to run.')
  149. parser.add_argument('--trust-remote-code',
  150. action='store_true',
  151. help='trust remote code from huggingface')
  152. parser.add_argument(
  153. '--max-model-len',
  154. type=int,
  155. default=None,
  156. help='Maximum length of a sequence (including prompt and output). '
  157. 'If None, will be derived from the model.')
  158. parser.add_argument(
  159. '--dtype',
  160. type=str,
  161. default='auto',
  162. choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
  163. help='data type for model weights and activations. '
  164. 'The "auto" option will use FP16 precision '
  165. 'for FP32 and FP16 models, and BF16 precision '
  166. 'for BF16 models.')
  167. parser.add_argument('--enforce-eager',
  168. action='store_true',
  169. help='enforce eager mode and disable CUDA graph')
  170. parser.add_argument(
  171. '--kv-cache-dtype',
  172. type=str,
  173. choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
  174. default="auto",
  175. help='Data type for kv cache storage. If "auto", will use model '
  176. 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
  177. 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
  178. parser.add_argument(
  179. '--quantization-param-path',
  180. type=str,
  181. default=None,
  182. help='Path to the JSON file containing the KV cache scaling factors. '
  183. 'This should generally be supplied, when KV cache dtype is FP8. '
  184. 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
  185. 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
  186. 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
  187. 'instead supported for common inference criteria.')
  188. parser.add_argument(
  189. '--profile',
  190. action='store_true',
  191. help='profile the generation process of a single batch')
  192. parser.add_argument(
  193. '--profile-result-dir',
  194. type=str,
  195. default=None,
  196. help=('path to save the pytorch profiler output. Can be visualized '
  197. 'with ui.perfetto.dev or Tensorboard.'))
  198. parser.add_argument(
  199. "--device",
  200. type=str,
  201. default="auto",
  202. choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
  203. help='device type for Aphrodite execution, supporting CUDA, OpenVINO '
  204. 'and CPU.')
  205. parser.add_argument('--block-size',
  206. type=int,
  207. default=16,
  208. help='block size of key/value cache')
  209. parser.add_argument(
  210. '--enable-chunked-prefill',
  211. action='store_true',
  212. help='If True, the prefill requests can be chunked based on the '
  213. 'max_num_batched_tokens')
  214. parser.add_argument("--enable-prefix-caching",
  215. action='store_true',
  216. help="Enable automatic prefix caching")
  217. parser.add_argument('--use-v2-block-manager', action='store_true')
  218. parser.add_argument(
  219. "--ray-workers-use-nsight",
  220. action='store_true',
  221. help="If specified, use nsight to profile ray workers",
  222. )
  223. parser.add_argument('--download-dir',
  224. type=str,
  225. default=None,
  226. help='directory to download and load the weights, '
  227. 'default to the default cache dir of huggingface')
  228. parser.add_argument(
  229. '--output-json',
  230. type=str,
  231. default=None,
  232. help='Path to save the latency results in JSON format.')
  233. parser.add_argument('--gpu-memory-utilization',
  234. type=float,
  235. default=0.9,
  236. help='the fraction of GPU memory to be used for '
  237. 'the model executor, which can range from 0 to 1.'
  238. 'If unspecified, will use the default value of 0.9.')
  239. parser.add_argument(
  240. '--load-format',
  241. type=str,
  242. default=EngineArgs.load_format,
  243. choices=[
  244. 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
  245. 'bitsandbytes'
  246. ],
  247. help='The format of the model weights to load.\n\n'
  248. '* "auto" will try to load the weights in the safetensors format '
  249. 'and fall back to the pytorch bin format if safetensors format '
  250. 'is not available.\n'
  251. '* "pt" will load the weights in the pytorch bin format.\n'
  252. '* "safetensors" will load the weights in the safetensors format.\n'
  253. '* "npcache" will load the weights in pytorch format and store '
  254. 'a numpy cache to speed up the loading.\n'
  255. '* "dummy" will initialize the weights with random values, '
  256. 'which is mainly for profiling.\n'
  257. '* "tensorizer" will load the weights using tensorizer from '
  258. 'CoreWeave. See the Tensorize Aphrodite Model script in the Examples'
  259. 'section for more information.\n'
  260. '* "bitsandbytes" will load the weights using bitsandbytes '
  261. 'quantization.\n')
  262. parser.add_argument(
  263. '--distributed-executor-backend',
  264. choices=['ray', 'mp'],
  265. default=None,
  266. help='Backend to use for distributed serving. When more than 1 GPU '
  267. 'is used, will be automatically set to "ray" if installed '
  268. 'or "mp" (multiprocessing) otherwise.')
  269. args = parser.parse_args()
  270. main(args)