12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- import argparse
- import cProfile
- import pstats
- from aphrodite import LLM, SamplingParams
# A very long prompt (roughly 15k tokens): one short sentence repeated
# 1000 times and joined with single spaces, used to stress block hashing.
_SENTENCE = "You are an expert in large language models, aren't you?"
LONG_PROMPT = " ".join([_SENTENCE] * 1000)
def main(args):
    """Benchmark the cost of the block-hashing function in prefix caching.

    Builds an ``LLM`` configured from the CLI args, runs three warm-up
    generations over ``LONG_PROMPT``, profiles three more generation calls
    with ``cProfile``, then reports the cumulative time spent in
    ``hash_of_block`` (call count and percentage of total profiled runtime).

    Args:
        args: parsed argparse namespace with ``model``,
            ``enable_prefix_caching``, ``tensor_parallel_size``,
            ``use_v2_block_manager``, ``max_model_len`` and ``output_len``.
    """
    llm = LLM(
        model=args.model,
        enforce_eager=True,
        # Honor the --enable-prefix-caching flag instead of hard-coding
        # True; previously the CLI flag was silently ignored.
        enable_prefix_caching=args.enable_prefix_caching,
        tensor_parallel_size=args.tensor_parallel_size,
        use_v2_block_manager=args.use_v2_block_manager,
        max_model_len=args.max_model_len,
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
    profiler = cProfile.Profile()

    print("------warm up------")
    for _ in range(3):
        output = llm.generate(LONG_PROMPT, sampling_params)
        print(output[0].outputs[0].text)

    print("------start generating------")
    for _ in range(3):
        profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
                        globals(), locals())

    # Analyze the runtime of the hashing function.  pstats entries are keyed
    # by (filename, lineno, funcname) tuples; each value is
    # (call_count, num_calls, total_time, cumulative_time, callers).
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative')
    total_time = 0
    total_calls = 0
    for func in stats.stats:
        if 'hash_of_block' in func[2]:
            total_time = stats.stats[func][3]
            total_calls = stats.stats[func][0]
    percentage = (total_time / stats.total_tt) * 100
    # NOTE(review): fixed the missing space before the percentage and now
    # report total_calls, which was computed but never printed.
    print(f"Hashing took {total_time:.2f} seconds over {total_calls} calls, "
          f"{percentage:.2f}% of the total runtime.")
if __name__ == "__main__":
    # Fixed implicit-concatenation bug: the description previously rendered
    # as "...function inautomatic prefix caching." (missing space).
    parser = argparse.ArgumentParser(
        description='Benchmark the performance of hashing function in '
        'automatic prefix caching.')
    parser.add_argument('--model',
                        type=str,
                        default='NousResearch/Meta-Llama-3-8B')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceManagerV2')  # typo fixed in help
    parser.add_argument('--max-model-len',
                        type=int,
                        default=None,
                        help='maximum length of the model')
    args = parser.parse_args()
    main(args)
|