- """Benchmark offline inference throughput."""
- import argparse
- import json
- import random
- import time
- from typing import List, Optional, Tuple
- import torch
- from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
- from tqdm import tqdm
- from aphrodite import LLM, SamplingParams
- from aphrodite.transformers_utils.tokenizer import get_tokenizer


def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]

    # Tokenize the prompts and completions.
    prompts = [prompt for prompt, _ in dataset]
    prompt_token_ids = tokenizer(prompts).input_ids
    completions = [completion for _, completion in dataset]
    completion_token_ids = tokenizer(completions).input_ids
    tokenized_dataset = []
    for i in range(len(dataset)):
        output_len = len(completion_token_ids[i])
        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))

    # Filter out too long sequences.
    filtered_dataset: List[Tuple[str, int, int]] = []
    for prompt, prompt_token_ids, output_len in tokenized_dataset:
        prompt_len = len(prompt_token_ids)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    # Sample the requests.
    sampled_requests = random.sample(filtered_dataset, num_requests)
    return sampled_requests


def run_aphrodite(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
    quantization: Optional[str],
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    kv_cache_dtype: str,
    disable_custom_all_reduce: bool,
    context_shift: bool,
    enforce_eager: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
    speculative_model: Optional[str] = None,
    num_speculative_tokens: Optional[int] = None,
    use_v2_block_manager: bool = False,
) -> float:
    llm = LLM(
        model=model,
        tokenizer=tokenizer,
        quantization=quantization,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        kv_cache_dtype=kv_cache_dtype,
        disable_custom_all_reduce=disable_custom_all_reduce,
        context_shift=context_shift,
        enforce_eager=enforce_eager,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        speculative_model=speculative_model,
        num_speculative_tokens=num_speculative_tokens,
        use_v2_block_manager=use_v2_block_manager,
    )

    # Add the requests to the engine.
    for prompt, _, output_len in requests:
        sampling_params = SamplingParams(
            n=n,
            temperature=0.0 if use_beam_search else 1.0,
            top_p=1.0,
            use_beam_search=use_beam_search,
            ignore_eos=True,
            max_tokens=output_len,
        )
        # FIXME: Do not use internal method.
        llm._add_request(  # pylint: disable=protected-access
            prompt=prompt,
            prompt_token_ids=None,
            sampling_params=sampling_params,
        )

    start = time.perf_counter()
    # FIXME: Do not use internal method.
    llm._run_engine(use_tqdm=True)  # pylint: disable=protected-access
    end = time.perf_counter()
    return end - start


def run_hf(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    use_beam_search: bool,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    assert not use_beam_search
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt, prompt_len, output_len = requests[i]
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=not use_beam_search,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


def main(args: argparse.Namespace):  # pylint: disable=redefined-outer-name
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = get_tokenizer(args.tokenizer,
                              trust_remote_code=args.trust_remote_code)
    requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    if args.backend == "aphrodite":
        elapsed_time = run_aphrodite(
            requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.kv_cache_dtype,
            args.disable_custom_all_reduce, args.context_shift,
            args.enforce_eager, args.enable_chunked_prefill,
            args.max_num_batched_tokens, args.speculative_model,
            args.num_speculative_tokens, args.use_v2_block_manager)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.use_beam_search, args.hf_max_batch_size,
                              args.trust_remote_code)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_input_tokens = sum(prompt_len for _, prompt_len, _ in requests)
    total_output_tokens = sum(output_len for _, _, output_len in requests)
    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"Input tokens/s: {total_input_tokens / elapsed_time:.2f}, "
          f"Output tokens/s: {total_output_tokens / elapsed_time:.2f}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["aphrodite", "hf"],
                        default="aphrodite")
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="Path to the dataset.")
    parser.add_argument("--model",
                        type=str,
                        default="EleutherAI/pythia-70m-deduped")
    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument(
        "--quantization",
        "-q",
        choices=["awq", "gguf", "bnb", "gptq", "squeezellm", "marlin", None],
        default=None)
    parser.add_argument("--gpu-memory-utilization", type=float, default=0.88)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument("--trust-remote-code",
                        action="store_true",
                        help="trust remote code from huggingface")
    parser.add_argument(
        "--dtype",
        type=str,
        default="auto",
        choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
        help="data type for model weights and activations. "
        "The 'auto' option will use FP16 precision "
        "for FP32 and FP16 models, and BF16 precision "
        "for BF16 models.")
    parser.add_argument("--kv-cache-dtype",
                        type=str,
                        default="auto",
                        choices=["auto", "fp8_e5m2"],
                        help="The data type for the KV cache.")
    parser.add_argument(
        "--disable-custom-all-reduce",
        action="store_true",
        help="disable custom all reduce for the Aphrodite backend")
    parser.add_argument(
        "--context-shift",
        action="store_true",
        help="enable context shifting for the Aphrodite backend")
    parser.add_argument("--enforce-eager",
                        type=lambda x: (str(x).lower() == 'true'),
                        default=True,
                        help="enforce eager mode for the Aphrodite backend")
    parser.add_argument(
        "--enable-chunked-prefill",
        action="store_true",
        help="enable chunked prefill for the Aphrodite backend")
    parser.add_argument("--max-num-batched-tokens",
                        type=int,
                        help="maximum number of batched tokens for the "
                        "Aphrodite backend")
    parser.add_argument("--speculative-model",
                        type=str,
                        help="speculative model for the Aphrodite backend")
    parser.add_argument("--num-speculative-tokens",
                        type=int,
                        help="number of speculative tokens for the "
                        "Aphrodite backend")
    parser.add_argument("--use-v2-block-manager",
                        action="store_true",
                        help="use v2 block manager for the Aphrodite backend")
    args = parser.parse_args()

    if args.backend == "aphrodite":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for aphrodite backend.")
    if args.tokenizer is None:
        args.tokenizer = args.model

    main(args)
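
    # Example invocation (a sketch; the file name and dataset path below are
    # placeholders, and the dataset is assumed to be a ShareGPT-style JSON
    # matching the "conversations"/"value" layout read by sample_requests()):
    #
    #   python benchmark_throughput.py \
    #       --backend aphrodite \
    #       --dataset ./sharegpt.json \
    #       --model EleutherAI/pythia-70m-deduped \
    #       --num-prompts 1000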