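"""Benchmark the latency of processing a single batch of requests till
completion."""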
import argparse
import time

import numpy as np
import torch
from tqdm import tqdm

from aphrodite import LLM, SamplingParams

def main(args: argparse.Namespace):  # pylint: disable=redefined-outer-name
    print(args)

    # Process all the requests in a single batch if possible.
    # NOTE: If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=args.batch_size,
        max_num_batched_tokens=args.batch_size * args.input_len,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
    )
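
    # ignore_eos=True below forces every sequence to run for exactly
    # `output_len` tokens, so each timed iteration does identical work.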
    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)
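
    # Dummy prompts: each request is `input_len` copies of token id 0, so the
    # measured latency depends only on sequence shape, not prompt content.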
    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
    def run_to_completion(profile: bool = False):
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                     sampling_params=sampling_params,
                     use_tqdm=False)
        end_time = time.perf_counter()
        latency = end_time - start_time
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return latency
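
    # Warm-up run: the first generate() call typically pays one-time costs
    # (kernel compilation, memory-pool allocation), so it is excluded from
    # the timed iterations.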
    print('Warming up...')
    run_to_completion(profile=False)

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc='Profiling iterations'):
        latencies.append(run_to_completion(profile=False))
    print(f'Avg latency: {np.mean(latencies)} seconds')
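
    # NOTE: With only a few iterations, the mean alone can be noisy; reporting
    # percentiles as well, e.g. np.percentile(latencies, (50, 90, 99)), gives
    # a fuller picture.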


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
                        choices=['awq', None],
                        default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--n',
                        type=int,
                        default=1,
                        help='Number of generated sequences per prompt.')
    parser.add_argument('--use-beam-search', action='store_true')
    parser.add_argument('--num-iters',
                        type=int,
                        default=3,
                        help='Number of iterations to run.')
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='Trust remote code from Hugging Face.')
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='Data type for model weights and activations. '
        'The "auto" option uses FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    args = parser.parse_args()
    main(args)
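
# Example invocation (filename and flag values are illustrative; the values
# shown match the defaults above, assuming this script is saved as
# benchmark_latency.py):
#   python benchmark_latency.py --model facebook/opt-125m \
#       --input-len 32 --output-len 128 --batch-size 8 --num-iters 3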