prefix_caching.py

import time

from aphrodite import LLM, SamplingParams
from aphrodite.common.utils import FlexibleArgumentParser

# A long table-QA prompt that is shared verbatim by every request, so that
# automatic prefix caching can reuse the KV cache blocks of the common prefix.
PROMPT = "You are a helpful assistant that recognizes the content of tables in markdown format. Here is a table as follows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat's the content in the (1,1) cell?\n"  # noqa: E501


def test_prefix(llm=None, sampling_params=None, prompts=None):
    start_time = time.time()

    llm.generate(prompts, sampling_params=sampling_params)

    end_time = time.time()
    print(f"cost time {end_time - start_time}")


def main(args):
    llm = LLM(model=args.model,
              tokenizer_mode='auto',
              trust_remote_code=True,
              enforce_eager=True,
              use_v2_block_manager=args.use_v2_block_manager,
              tensor_parallel_size=args.tensor_parallel_size,
              enable_prefix_caching=args.enable_prefix_caching)

    # Every prompt is identical, so all requests share one long prefix.
    num_prompts = 100
    prompts = [PROMPT] * num_prompts
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

    # First pass warms up the engine (and, when enabled, populates the prefix
    # cache); the second pass measures steady-state generation time.
    print("------warm up------")
    test_prefix(
        llm=llm,
        prompts=prompts,
        sampling_params=sampling_params,
    )

    print("------start generating------")
    test_prefix(
        llm=llm,
        prompts=prompts,
        sampling_params=sampling_params,
    )


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Benchmark the performance with or without automatic '
        'prefix caching.')
    parser.add_argument('--model',
                        type=str,
                        default='baichuan-inc/Baichuan2-13B-Chat')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='Enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceManagerV2')
    args = parser.parse_args()
    main(args)
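
One possible way to compare the two configurations, assuming the script is saved locally as prefix_caching.py (all flags below come from the argument parser above; timings are printed by test_prefix):

    python prefix_caching.py --output-len 10                          # baseline, no prefix caching
    python prefix_caching.py --output-len 10 --enable-prefix-caching  # with automatic prefix caching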