# hashing.py — benchmark the runtime of the block-hashing function used by
# automatic prefix caching.
  1. import argparse
  2. import cProfile
  3. import pstats
  4. from aphrodite import LLM, SamplingParams
  5. # A very long prompt, total number of tokens is about 15k.
  6. LONG_PROMPT = ["You are an expert in large language models, aren't you?"
  7. ] * 1000
  8. LONG_PROMPT = ' '.join(LONG_PROMPT)
  9. def main(args):
  10. llm = LLM(
  11. model=args.model,
  12. enforce_eager=True,
  13. enable_prefix_caching=True,
  14. tensor_parallel_size=args.tensor_parallel_size,
  15. use_v2_block_manager=args.use_v2_block_manager,
  16. max_model_len=args.max_model_len,
  17. )
  18. sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
  19. profiler = cProfile.Profile()
  20. print("------warm up------")
  21. for i in range(3):
  22. output = llm.generate(LONG_PROMPT, sampling_params)
  23. print(output[0].outputs[0].text)
  24. print("------start generating------")
  25. for i in range(3):
  26. profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
  27. globals(), locals())
  28. # analyze the runtime of hashing function
  29. stats = pstats.Stats(profiler)
  30. stats.sort_stats('cumulative')
  31. total_time = 0
  32. total_calls = 0
  33. for func in stats.stats:
  34. if 'hash_of_block' in func[2]:
  35. total_time = stats.stats[func][3]
  36. total_calls = stats.stats[func][0]
  37. percentage = (total_time / stats.total_tt) * 100
  38. print(f"Hashing took {total_time:.2f} seconds,"
  39. f"{percentage:.2f}% of the total runtime.")
  40. if __name__ == "__main__":
  41. parser = argparse.ArgumentParser(
  42. description='Benchmark the performance of hashing function in'
  43. 'automatic prefix caching.')
  44. parser.add_argument('--model',
  45. type=str,
  46. default='NousResearch/Meta-Llama-3-8B')
  47. parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
  48. parser.add_argument('--output-len', type=int, default=10)
  49. parser.add_argument('--enable-prefix-caching',
  50. action='store_true',
  51. help='enable prefix caching')
  52. parser.add_argument('--use-v2-block-manager',
  53. action='store_true',
  54. help='Use BlockSpaceMangerV2')
  55. parser.add_argument('--max-model-len',
  56. type=int,
  57. default=None,
  58. help='maximum length of the model')
  59. args = parser.parse_args()
  60. main(args)