1
0

latency.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. import argparse
  2. import time
  3. import numpy as np
  4. import torch
  5. from tqdm import tqdm
  6. from aphrodite import LLM, SamplingParams
  7. def main(args: argparse.Namespace): # pylint: disable=redefined-outer-name
  8. print(args)
  9. # Process all the requests in a single batch if possible.
  10. # NOTE: If the request cannot be processed in a single batch,
  11. # the engine will automatically process the request in multiple batches.
  12. llm = LLM(
  13. model=args.model,
  14. tokenizer=args.tokenizer,
  15. quantization=args.quantization,
  16. tensor_parallel_size=args.tensor_parallel_size,
  17. max_num_seqs=args.batch_size,
  18. max_num_batched_tokens=args.batch_size * args.input_len,
  19. trust_remote_code=args.trust_remote_code,
  20. dtype=args.dtype,
  21. )
  22. sampling_params = SamplingParams(
  23. n=args.n,
  24. temperature=0.0 if args.use_beam_search else 1.0,
  25. top_p=1.0,
  26. use_beam_search=args.use_beam_search,
  27. ignore_eos=True,
  28. max_tokens=args.output_len,
  29. )
  30. print(sampling_params)
  31. dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
  32. def run_to_completion(profile: bool = False):
  33. if profile:
  34. torch.cuda.cudart().cudaProfilerStart()
  35. start_time = time.perf_counter()
  36. llm.generate(prompt_token_ids=dummy_prompt_token_ids,
  37. sampling_params=sampling_params,
  38. use_tqdm=False)
  39. end_time = time.perf_counter()
  40. latency = end_time - start_time
  41. if profile:
  42. torch.cuda.cudart().cudaProfilerStop()
  43. return latency
  44. print('Warming up...')
  45. run_to_completion(profile=False)
  46. # Benchmark.
  47. latencies = []
  48. for _ in tqdm(range(args.num_iters), desc='Profiling iterations'):
  49. latencies.append(run_to_completion(profile=False))
  50. print(f'Avg latency: {np.mean(latencies)} seconds')
  51. if __name__ == '__main__':
  52. parser = argparse.ArgumentParser(
  53. description='Benchmark the latency of processing a single batch of '
  54. 'requests till completion.')
  55. parser.add_argument('--model', type=str, default='facebook/opt-125m')
  56. parser.add_argument('--tokenizer', type=str, default=None)
  57. parser.add_argument('--quantization',
  58. '-q',
  59. choices=['awq', None],
  60. default=None)
  61. parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
  62. parser.add_argument('--input-len', type=int, default=32)
  63. parser.add_argument('--output-len', type=int, default=128)
  64. parser.add_argument('--batch-size', type=int, default=8)
  65. parser.add_argument('--n',
  66. type=int,
  67. default=1,
  68. help='Number of generated sequences per prompt.')
  69. parser.add_argument('--use-beam-search', action='store_true')
  70. parser.add_argument('--num-iters',
  71. type=int,
  72. default=3,
  73. help='Number of iterations to run.')
  74. parser.add_argument('--trust-remote-code',
  75. action='store_true',
  76. help='trust remote code from huggingface')
  77. parser.add_argument(
  78. '--dtype',
  79. type=str,
  80. default='auto',
  81. choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
  82. help='data type for model weights and activations. '
  83. 'The "auto" option will use FP16 precision '
  84. 'for FP32 and FP16 models, and BF16 precision '
  85. 'for BF16 models.')
  86. args = parser.parse_args()
  87. main(args)