# latency.py (2.9 KB)
  1. import argparse
  2. import time
  3. import numpy as np
  4. import torch
  5. from tqdm import tqdm
  6. from aphrodite import LLM, SamplingParams
  7. def main(args: argparse.Namespace):
  8. print(args)
  9. # Process all the requests in a single batch if possible.
  10. # NOTE: If the request cannot be processed in a single batch,
  11. # the engine will automatically process the request in multiple batches.
  12. llm = LLM(
  13. model=args.model,
  14. tokenizer=args.tokenizer,
  15. tensor_parallel_size=args.tensor_parallel_size,
  16. swap_space=args.swap_space,
  17. max_num_seqs=args.batch_size,
  18. max_num_batched_tokens=args.batch_size * args.input_len,
  19. trust_remote_code=args.trust_remote_code,
  20. )
  21. sampling_params = SamplingParams(
  22. n=args.n,
  23. temperature=0.0 if args.use_beam_search else 1.0,
  24. top_p=1.0,
  25. use_beam_search=args.use_beam_search,
  26. ignore_eos=True,
  27. max_tokens=args.output_len,
  28. )
  29. print(sampling_params)
  30. dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
  31. def run_to_completion(profile: bool = False):
  32. if profile:
  33. torch.cuda.cudart().cudaProfilerStart()
  34. start_time = time.time()
  35. llm.generate(prompt_token_ids=dummy_prompt_token_ids,
  36. sampling_params=sampling_params,
  37. use_tqdm=False)
  38. end_time = time.time()
  39. latency = end_time - start_time
  40. if profile:
  41. torch.cuda.cudart().cudaProfilerStop()
  42. return latency
  43. print("Warming up...")
  44. run_to_completion(profile=False)
  45. # Benchmark.
  46. latencies = []
  47. for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
  48. latencies.append(run_to_completion(profile=False))
  49. print(f'Avg latency: {np.mean(latencies)} seconds')
  50. if __name__ == '__main__':
  51. parser = argparse.ArgumentParser(
  52. description='Benchmark the latency of processing a single batch of '
  53. 'requests till completion.')
  54. parser.add_argument('--model', type=str, default='facebook/opt-125m')
  55. parser.add_argument('--tokenizer', type=str, default=None)
  56. parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
  57. parser.add_argument('--input-len', type=int, default=32)
  58. parser.add_argument('--output-len', type=int, default=128)
  59. parser.add_argument('--batch-size', type=int, default=8)
  60. parser.add_argument('--n', '-n', type=int, default=1,
  61. help='Number of generated sequences per prompt.')
  62. parser.add_argument('--swap-space', type=int, default=4)
  63. parser.add_argument('--use-beam-search', action='store_true')
  64. parser.add_argument('--num-iters', type=int, default=3,
  65. help='Number of iterations to run.')
  66. parser.add_argument('--trust-remote-code', action='store_true',
  67. help='trust remote code from huggingface')
  68. args = parser.parse_args()
  69. main(args)