import argparse
import time

import numpy as np
import torch
from tqdm import tqdm

from aphrodite import LLM, SamplingParams


def main(args: argparse.Namespace):  # pylint: disable=redefined-outer-name
    print(args)

    # Process all the requests in a single batch if possible.
    # NOTE: If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=args.batch_size,
        max_num_batched_tokens=args.batch_size * args.input_len,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
    )

    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)
    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size

    def run_to_completion(profile: bool = False):
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                     sampling_params=sampling_params,
                     use_tqdm=False)

        end_time = time.perf_counter()
        latency = end_time - start_time
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return latency

    print('Warming up...')
    run_to_completion(profile=False)

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc='Profiling iterations'):
        latencies.append(run_to_completion(profile=False))
    print(f'Avg latency: {np.mean(latencies)} seconds')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
                        choices=['awq', None],
                        default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--n',
                        type=int,
                        default=1,
                        help='Number of generated sequences per prompt.')
    parser.add_argument('--use-beam-search', action='store_true')
    parser.add_argument('--num-iters',
                        type=int,
                        default=3,
                        help='Number of iterations to run.')
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='Trust remote code from Hugging Face.')
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='Data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    args = parser.parse_args()
    main(args)
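
# Example invocation (a sketch; the script filename below is assumed, and an
# installed aphrodite build plus a CUDA-capable GPU are required for
# llm.generate to run):
#
#   python benchmark_latency.py --model facebook/opt-125m --batch-size 8 \
#       --input-len 32 --output-len 128 --num-iters 3
#
# Note that run_to_completion(profile=True) is not exposed as a CLI flag in
# this version; the cudaProfilerStart/Stop hooks only take effect if the call
# site is changed to pass profile=True.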