
update all benchmarks (#597)

AlpinDale
commit 5dbfc200f2

+ 8 - 0
tests/benchmarks/README.md

@@ -0,0 +1,8 @@
+# Benchmarking Aphrodite
+
+## Downloading the ShareGPT dataset
+
+You can download the dataset by running:
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```

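Once the dataset is downloaded, a run typically looks like the sketch below. This is illustrative only, based on the usage docstring of `tests/benchmarks/engine/serving.py` added in this commit; the model name is a placeholder and exact flags or defaults may differ.

```bash
# Start the Aphrodite OpenAI-compatible server
# (command taken from the serving benchmark's docstring in this commit).
aphrodite run <your_model> --swap-space 16 --disable-log-requests

# In a second shell, run the serving benchmark against the downloaded file.
python tests/benchmarks/engine/serving.py \
    --backend aphrodite \
    --model <your_model> \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 1000
```
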
+ 45 - 13
tests/benchmarks/backend_request_func.py

@@ -4,10 +4,13 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import aiohttp
+import huggingface_hub.constants
 from tqdm.asyncio import tqdm
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
@@ -68,9 +71,13 @@ async def async_request_tgi(
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        #NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = remove_prefix(chunk_bytes, "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
@@ -218,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -258,6 +265,9 @@ async def async_request_openai_completions(
                         else:
                             data = json.loads(chunk)
 
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
                             if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
@@ -266,12 +276,8 @@ async def async_request_openai_completions(
                                     output.ttft = ttft
 
                                 # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
@@ -298,8 +304,8 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -384,12 +390,38 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text
 
 
+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv('APHRODITE_USE_MODELSCOPE', 'False').lower() == 'true':
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
+                                         trust_remote_code=trust_remote_code)
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "aphrodite": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
 }

+ 20 - 19
tests/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py

@@ -8,9 +8,10 @@ from typing import Callable, Iterable, List, Tuple
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
-from .weight_shapes import WEIGHT_SHAPES
+from weight_shapes import WEIGHT_SHAPES
 
 from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -19,18 +20,18 @@ DEFAULT_TP_SIZES = [1]
 # helpers
 
 
-def to_fp8(tensor: torch.tensor) -> torch.tensor:
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
     finfo = torch.finfo(torch.float8_e4m3fn)
     return torch.round(tensor.clamp(
         min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
 
 
-def to_int8(tensor: torch.tensor) -> torch.tensor:
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
 
 
 def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.tensor, torch.tensor]:
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
     a = torch.randn((m, k), device='cuda') * 5
     b = torch.randn((n, k), device='cuda').t() * 5
@@ -46,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
 # impl
 
 
-def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                    scale_b: torch.tensor,
-                    out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                    scale_b: torch.Tensor,
+                    out_dtype: torch.dtype) -> torch.Tensor:
     return torch.mm(a, b)
 
 
-def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                     scale_b: torch.tensor,
-                     out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                     scale_b: torch.Tensor,
+                     out_dtype: torch.dtype) -> torch.Tensor:
     return torch._scaled_mm(a,
                             b,
                             scale_a=scale_a,
@@ -62,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                             out_dtype=out_dtype)
 
 
-def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
-                                scale_a: torch.tensor, scale_b: torch.tensor,
-                                out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
+                                scale_a: torch.Tensor, scale_b: torch.Tensor,
+                                out_dtype: torch.dtype) -> torch.Tensor:
     return torch._scaled_mm(a,
                             b,
                             scale_a=scale_a,
@@ -73,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
                             use_fast_accum=True)
 
 
-def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                 scale_b: torch.tensor,
-                 out_dtype: torch.dtype) -> torch.tensor:
+def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                 scale_b: torch.Tensor,
+                 out_dtype: torch.dtype) -> torch.Tensor:
     return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
 
 
 # bench
-def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
+def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+             scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
              sub_label: str, fn: Callable, description: str) -> TMeasurement:
 
     min_run_time = 1
@@ -293,7 +294,7 @@ if __name__ == '__main__':
             return torch.float8_e4m3fn
         raise ValueError("unsupported dtype")
 
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="""
 Benchmark Cutlass GEMM.
 

+ 287 - 0
tests/benchmarks/engine/latency.py

@@ -0,0 +1,287 @@
+"""Benchmark the latency of processing a single batch of requests."""
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from aphrodite import LLM, SamplingParams
+from aphrodite.engine.args_tools import EngineArgs
+from aphrodite.inputs import PromptStrictInputs
+from aphrodite.quantization import QUANTIZATION_METHODS
+from aphrodite.common.utils import FlexibleArgumentParser
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    # NOTE: If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+            args.speculative_draft_tensor_parallel_size,
+        ngram_prompt_lookup_max=args.ngram_prompt_lookup_max,
+        ngram_prompt_lookup_min=args.ngram_prompt_lookup_min,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        enable_prefix_caching=args.enable_prefix_caching,
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_inputs: List[PromptStrictInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        if profile_dir:
+            with torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA,
+                    ],
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                        str(profile_dir))) as p:
+                llm.generate(dummy_inputs,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(dummy_inputs,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)
+
+    if args.profile:
+        profile_dir = args.profile_result_dir
+        if not profile_dir:
+            profile_dir = Path(
+                "."
+            ) / "aphrodite_benchmark_result" / f"latency_result_{time.time()}"
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        run_to_completion(profile_dir=profile_dir)
+        return
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion(profile_dir=None))
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90, 99]
+    percentiles = np.percentile(latencies, percentages)
+    print(f'Avg latency: {np.mean(latencies)} seconds')
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f'{percentage}% percentile latency: {percentile} seconds')
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--speculative-draft-tensor-parallel-size',
+                        '-spec-draft-tp',
+                        type=int,
+                        default=None)
+    parser.add_argument('--ngram-prompt-lookup-max',
+                        type=int,
+                        default=None)
+    parser.add_argument('--ngram-prompt-lookup-min',
+                        type=int,
+                        default=None)
+    parser.add_argument('--tokenizer', type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--enforce-eager',
+                        action='store_true',
+                        help='enforce eager mode and disable CUDA graph')
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        action='store_true',
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
+    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    args = parser.parse_args()
+    main(args)

+ 121 - 17
tests/benchmarks/serving.py → tests/benchmarks/engine/serving.py

@@ -2,8 +2,8 @@
 
 On the server side, run one of the following commands:
     Aphrodite OpenAI API server
-    python -m aphrodite.endpoints.openai.api_server \
-        --model <your_model> --swap-space 16 \
+    aphrodite run <your_model> \
+        --swap-space 16 \
         --disable-log-requests
 
     (TGI backend)
@@ -17,7 +17,7 @@ On the client side, run:
         --dataset-path <path to dataset> \
         --request-rate <request_rate> \ # By default <request_rate> is inf
         --num-prompts <num_prompts> # By default <num_prompts> is 1000
-        
+
     when using tgi backend, add
         --endpoint /generate_stream
     to the end of the command above.
@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from aphrodite.transformers_utils.tokenizer import get_tokenizer
+try:
+    from aphrodite.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from aphrodite.common.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
 
 
 @dataclass
@@ -52,10 +60,16 @@ class BenchmarkMetrics:
     output_throughput: float
     mean_ttft_ms: float
     median_ttft_ms: float
+    std_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
+    std_tpot_ms: float
     p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p99_itl_ms: float
 
 
 def sample_sharegpt_requests(
@@ -66,7 +80,6 @@ def sample_sharegpt_requests(
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
-
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -174,6 +187,31 @@ def sample_sonnet_requests(
     return sampled_requests
 
 
+def sample_random_requests(
+        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append(
+            (prompt, int(input_lens[i]), int(output_lens[i])))
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
@@ -185,6 +223,7 @@ async def get_request(
         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue
+
         # Sample the request interval from the exponential distribution.
         interval = np.random.exponential(1.0 / request_rate)
         # The next request will be sent after the interval.
@@ -197,19 +236,27 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note : this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append(
                     (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             completed += 1
         else:
@@ -230,10 +277,16 @@ def calculate_metrics(
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )
 
     return metrics, actual_output_lens
@@ -251,7 +304,7 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
@@ -278,7 +331,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(
@@ -296,7 +349,7 @@ async def benchmark(
                              pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -333,6 +386,10 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                     metrics.median_tpot_ms))
     print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)
 
     result = {
@@ -345,10 +402,16 @@
         "output_throughput": metrics.output_throughput,
         "mean_ttft_ms": metrics.mean_ttft_ms,
         "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
         "p99_ttft_ms": metrics.p99_ttft_ms,
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
@@ -427,6 +490,15 @@ def main(args: argparse.Namespace):
                               for prompt, prompt_formatted, prompt_len,
                               output_len in input_requests]
 
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
@@ -445,7 +517,7 @@ def main(args: argparse.Namespace):
 
     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}
 
         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -478,6 +550,8 @@ def main(args: argparse.Namespace):
         # Save to file
         base_model_id = model_id.split("/")[-1]
         file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_filename:
+            file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w") as outfile:
@@ -485,7 +559,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the online serving throughput.")
     parser.add_argument(
         "--backend",
         "--backend",
@@ -518,7 +592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet"],
+        choices=["sharegpt", "sonnet", "random"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",
@@ -535,7 +609,7 @@ if __name__ == "__main__":
         "--tokenizer",
         "--tokenizer",
         type=str,
         help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--best-of",
         "--best-of",
@@ -578,6 +652,27 @@ if __name__ == "__main__":
         help=
         "Number of prefix tokens per request, used only for sonnet dataset.",
         "Number of prefix tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
     parser.add_argument(
         "--request-rate",
         "--request-rate",
         type=float,
@@ -618,6 +713,15 @@ if __name__ == "__main__":
         help="Specify directory to save benchmark json results."
         "If not specified, results are saved in the current directory.",
         "If not specified, results are saved in the current directory.",
     )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
 
     args = parser.parse_args()
     main(args)

+ 13 - 44
tests/benchmarks/throughput.py → tests/benchmarks/engine/throughput.py

@@ -10,6 +10,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from aphrodite.common.utils import FlexibleArgumentParser
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.quantization import QUANTIZATION_METHODS
 
@@ -76,15 +77,10 @@ def run_aphrodite(
     kv_cache_dtype: str,
     quantization_param_path: Optional[str],
     device: str,
-    speculative_model: str,
-    num_speculative_tokens: int,
-    use_v2_block_manager: bool,
-    ngram_prompt_lookup_min: int,
-    ngram_prompt_lookup_max: int,
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
@@ -110,11 +106,6 @@ def run_aphrodite(
         max_num_batched_tokens=max_num_batched_tokens,
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
-        speculative_model=speculative_model,
-        num_speculative_tokens=num_speculative_tokens,
-        use_v2_block_manager=use_v2_block_manager,
-        ngram_prompt_lookup_min=ngram_prompt_lookup_min,
-        ngram_prompt_lookup_max=ngram_prompt_lookup_max,
     )
 
     # Add the requests to the engine.
@@ -238,9 +229,7 @@ def main(args: argparse.Namespace):
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device, args.speculative_model,
-            args.num_speculative_tokens, args.use_v2_block_manager,
-            args.ngram_prompt_lookup_min, args.ngram_prompt_lookup_max,
+            args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.download_dir, args.load_format)
@@ -257,7 +246,7 @@ def main(args: argparse.Namespace):
     total_num_tokens = sum(prompt_len + output_len
                            for _, prompt_len, output_len in requests)
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-          f"{total_num_tokens / elapsed_time:.2f} total tokens/s")
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
     # Output JSON results if specified
     if args.output_json:
@@ -273,7 +262,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
     parser.add_argument("--backend",
                         type=str,
                         choices=["aphrodite", "hf", "mii"],
@@ -337,8 +326,7 @@ if __name__ == "__main__":
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
     parser.add_argument("--enforce-eager",
-                        type=lambda x: (str(x).lower() == 'true'),
-                        default=True,
+                        action="store_true",
                         help="enforce eager execution")
     parser.add_argument(
         '--kv-cache-dtype',
@@ -363,15 +351,15 @@ if __name__ == "__main__":
         type=str,
         default="auto",
         choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for Aphrodite execution, supporting CUDA, OpenVINO, '
-        'CPU, TPU, and XPU.')
+        help='device type for Aphrodite execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
         "--enable-prefix-caching",
         action='store_true',
-        help="enable automatic prefix caching for aphrodite backend.")
+        help="enable automatic prefix caching for Aphrodite backend.")
     parser.add_argument("--enable-chunked-prefill",
                         action='store_true',
-                        help="enable chunked prefill for aphrodite backend.")
+                        help="enable chunked prefill for Aphrodite backend.")
     parser.add_argument('--max-num-batched-tokens',
                         type=int,
                         default=None,
@@ -413,29 +401,10 @@ if __name__ == "__main__":
         '* "dummy" will initialize the weights with random values, '
         'which is mainly for profiling.\n'
         '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize aphrodite Model script in the Examples'
+        'CoreWeave. See the Tensorize Aphrodite Model script in the Examples'
         'section for more information.\n'
         '* "bitsandbytes" will load the weights using bitsandbytes '
         'quantization.\n')
-    parser.add_argument('--speculative-model',
-                        type=str,
-                        default=None,
-                        help='speculative model for speculative decoding')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='use v2 block manage.')
-    parser.add_argument('--num-speculative-tokens',
-                        type=int,
-                        default=None,
-                        help='number of speculative tokens.')
-    parser.add_argument('--ngram-prompt-lookup-min',
-                        type=int,
-                        default=None,
-                        help='minimum ngram prompt lookup size')
-    parser.add_argument('--ngram-prompt-lookup-max',
-                        type=int,
-                        default=None,
-                        help='maximum ngram prompt lookup size')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
@@ -452,7 +421,7 @@ if __name__ == "__main__":
         if args.hf_max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
-            raise ValueError("Quantization is only for aphrodite backend.")
+            raise ValueError("Quantization is only for Aphrodite backend.")
     elif args.backend == "mii":
         if args.dtype != "auto":
             raise ValueError("dtype must be auto for MII backend.")
@@ -461,7 +430,7 @@ if __name__ == "__main__":
         if args.use_beam_search:
             raise ValueError("Beam search is not supported for MII backend.")
         if args.quantization is not None:
-            raise ValueError("Quantization is only for aphrodite backend.")
+            raise ValueError("Quantization is only for Aphrodite backend.")
         if args.hf_max_batch_size is not None:
             raise ValueError("HF max batch size is only for HF backend.")
         if args.tokenizer != args.model:

+ 303 - 0
tests/benchmarks/kernels/aqlm.py

@@ -0,0 +1,303 @@
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.quantization.aqlm import (dequantize_weight,
+                                         generic_dequantize_gemm,
+                                         get_int_dtype,
+                                         optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+        input: torch.Tensor,  #  [..., in_features]
+        weights: torch.Tensor,
+        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    output = F.linear(input, weights)
+    return output
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+        -1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+
+    n = int(parts.sum().item())
+
+    device = torch.device('cuda:0')
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    with open(filename, "w") as f:
+        sys.stdout = f
+
+        print('m | k | n | n parts', end='')
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+        print('')
+
+        # These are reasonable prefill sizes.
+        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                         (4096, (11008, 11008)), (11008, (4096, )))
+
+        # reasonable ranges for m.
+        for m in [
+                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+                128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]:
+            print(f'{m}', file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                         methods)
+
+        sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
+             methods):
+
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        print(f' | {best_time_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
+               nbooks: int, bits: int, method) -> float:
+
+    n = int(parts.sum().item())
+
+    device = torch.device('cuda:0')
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
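
For reference, `run_grid`/`run_timing` follow the usual CUDA-event timing recipe: record a start event, launch the kernel `num_calls` times, record an end event, synchronize, and report the mean per-call time. A self-contained sketch of that pattern (the `time_kernel_us` helper and the matmul stand-in are illustrative only, not part of the benchmark; requires a CUDA device):

```python
import torch


def time_kernel_us(fn, num_calls: int = 100) -> float:
    """Average per-call time of `fn` in microseconds, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_calls):
        fn()
    end.record()
    end.synchronize()  # wait for all queued kernels before reading the timer
    return 1000.0 * start.elapsed_time(end) / num_calls  # elapsed_time() is in ms


if __name__ == "__main__":
    a = torch.randn(1024, 4096, dtype=torch.float16, device="cuda")
    w = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
    print(f"{time_kernel_us(lambda: a @ w):.1f} us per matmul")
```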

+ 75 - 0
tests/benchmarks/kernels/benchmark_shapes.py

@@ -0,0 +1,75 @@
+WEIGHT_SHAPES = {
+    "ideal": [[4 * 256 * 32, 256 * 32]],
+    "mistralai/Mistral-7B-v0.1/TP1": [
+        [4096, 6144],
+        [4096, 4096],
+        [4096, 28672],
+        [14336, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP2": [
+        [4096, 3072],
+        [2048, 4096],
+        [4096, 14336],
+        [7168, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP4": [
+        [4096, 1536],
+        [1024, 4096],
+        [4096, 7168],
+        [3584, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP1": [
+        [4096, 12288],
+        [4096, 4096],
+        [4096, 22016],
+        [11008, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP2": [
+        [4096, 6144],
+        [2048, 4096],
+        [4096, 11008],
+        [5504, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP4": [
+        [4096, 3072],
+        [1024, 4096],
+        [4096, 5504],
+        [2752, 4096],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP1": [
+        [5120, 15360],
+        [5120, 5120],
+        [5120, 27648],
+        [13824, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP2": [
+        [5120, 7680],
+        [2560, 5120],
+        [5120, 13824],
+        [6912, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP4": [
+        [5120, 3840],
+        [1280, 5120],
+        [5120, 6912],
+        [3456, 5120],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP1": [
+        [8192, 10240],
+        [8192, 8192],
+        [8192, 57344],
+        [28672, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP2": [
+        [8192, 5120],
+        [4096, 8192],
+        [8192, 28672],
+        [14336, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP4": [
+        [8192, 2560],
+        [2048, 8192],
+        [8192, 14336],
+        [7168, 8192],
+    ],
+}
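
The TP2/TP4 entries above are derived from the TP1 shapes: the output dimension is sharded for the column-parallel layers (rows 0 and 2, which appear to correspond to the QKV and gate/up projections) and the input dimension for the row-parallel layers (rows 1 and 3, o_proj and down_proj). A small sketch that reproduces the tables (the `shard_shapes` helper is illustrative only and assumes `benchmark_shapes.py` is importable from the working directory):

```python
from typing import List

from benchmark_shapes import WEIGHT_SHAPES


def shard_shapes(tp1_shapes: List[List[int]], tp: int) -> List[List[int]]:
    """Derive tensor-parallel [K, N] shapes from the TP1 entries."""
    sharded = []
    for i, (k, n) in enumerate(tp1_shapes):
        if i % 2 == 0:
            sharded.append([k, n // tp])  # column-parallel: shard the output dim
        else:
            sharded.append([k // tp, n])  # row-parallel: shard the input dim
    return sharded


tp1 = WEIGHT_SHAPES["mistralai/Mistral-7B-v0.1/TP1"]
assert shard_shapes(tp1, 2) == WEIGHT_SHAPES["mistralai/Mistral-7B-v0.1/TP2"]
assert shard_shapes(tp1, 4) == WEIGHT_SHAPES["mistralai/Mistral-7B-v0.1/TP4"]
```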

+ 238 - 0
tests/benchmarks/kernels/marlin.py

@@ -0,0 +1,238 @@
+from typing import List
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from aphrodite.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from aphrodite.quantization.utils.marlin_utils_test import (MarlinWorkspace,
+                                                            marlin_quantize)
+from aphrodite.quantization.utils.marlin_utils_test_24 import \
+    marlin_24_quantize
+from aphrodite.quantization.utils.quant_utils import (gptq_pack,
+                                                      quantize_weights,
+                                                      sort_weights)
+
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+def bench_run(results: List[benchmark.Measurement], model: str,
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
+    label = "Quant Matmul"
+
+    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+                                         group_size, size_m, size_k, size_n))
+
+    print(f"Testing: {sub_label}")
+
+    a = torch.randn(size_m, size_k).to(torch.half).cuda()
+    b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+    a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+    # Marlin quant
+    (
+        marlin_w_ref,
+        marlin_q_w,
+        marlin_s,
+        marlin_g_idx,
+        marlin_sort_indices,
+        marlin_rand_perm,
+    ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+    # Marlin_24 quant
+    (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+    # GPTQ quant
+    (w_ref, q_w, s, g_idx,
+     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+    # For act_order, sort the "weights" and "g_idx"
+    # so that group ids are increasing
+    repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+    if act_order:
+        (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+    # Prepare
+    marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                       GPTQ_MARLIN_MAX_PARALLEL)
+
+    marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+                                          GPTQ_MARLIN_24_MAX_PARALLEL)
+
+    globals = {
+        # Gen params
+        "num_bits": num_bits,
+        "group_size": group_size,
+        "size_m": size_m,
+        "size_n": size_n,
+        "size_k": size_k,
+        "a": a,
+        "a_tmp": a_tmp,
+        # Marlin params
+        "marlin_w_ref": marlin_w_ref,
+        "marlin_q_w": marlin_q_w,
+        "marlin_s": marlin_s,
+        "marlin_g_idx": marlin_g_idx,
+        "marlin_sort_indices": marlin_sort_indices,
+        "marlin_rand_perm": marlin_rand_perm,
+        "marlin_workspace": marlin_workspace,
+        "is_k_full": is_k_full,
+        # Marlin_24 params
+        "marlin_24_w_ref": marlin_24_w_ref,
+        "marlin_24_q_w_comp": marlin_24_q_w_comp,
+        "marlin_24_meta": marlin_24_meta,
+        "marlin_24_s": marlin_24_s,
+        "marlin_24_workspace": marlin_24_workspace,
+        # GPTQ params
+        "q_w_gptq": q_w_gptq,
+        "repack_sort_indices": repack_sort_indices,
+        # Kernels
+        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+        "gptq_marlin_repack": ops.gptq_marlin_repack,
+    }
+
+    min_run_time = 1
+
+    # Warmup pytorch
+    for i in range(5):
+        torch.matmul(a, marlin_w_ref)
+
+    results.append(
+        benchmark.Timer(
+            stmt="torch.matmul(a, marlin_w_ref)",
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="pytorch_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+            and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+        results.append(
+            benchmark.Timer(
+                stmt=
+                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="gptq_marlin_24_gemm",
+            ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_repack",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+
+def main(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    results: List[benchmark.Measurement] = []
+
+    for model in args.models:
+        for layer in WEIGHT_SHAPES[model]:
+            size_k = layer[0]
+            size_n = layer[1]
+
+            if len(args.limit_k) > 0 and size_k not in args.limit_k:
+                continue
+
+            if len(args.limit_n) > 0 and size_n not in args.limit_n:
+                continue
+
+            for act_order in ACT_ORDER_OPTS:
+                if len(args.limit_act_order
+                       ) > 0 and act_order not in args.limit_act_order:
+                    continue
+
+                for is_k_full in K_FULL_OPTS:
+                    if len(args.limit_k_full
+                           ) > 0 and is_k_full not in args.limit_k_full:
+                        continue
+
+                    for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+                        if len(args.limit_num_bits
+                               ) > 0 and num_bits not in args.limit_num_bits:
+                            continue
+
+                        for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+                            if len(
+                                    args.limit_group_size
+                            ) > 0 and group_size not in args.limit_group_size:
+                                continue
+
+                            # For act_order, the group_size must be less than
+                            # size_k
+                            if act_order and (group_size == size_k
+                                              or group_size == -1):
+                                continue
+
+                            for size_m in args.batch_sizes:
+                                bench_run(results, model, act_order, is_k_full,
+                                          num_bits, group_size, size_m, size_k,
+                                          size_n)
+
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+# For quick benchmarking use:
+#   python marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark Marlin across specified models/shapes/batches")
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    parser.add_argument("--batch-sizes",
+                        nargs="+",
+                        type=int,
+                        default=DEFAULT_BATCH_SIZES)
+    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+    args = parser.parse_args()
+    main(args)
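
All of the measurements above go through `torch.utils.benchmark`: the kernel call is given as a statement string evaluated against an explicit `globals` dict, and `blocked_autorange` keeps sampling until at least `min_run_time` seconds of data are collected, returning a `Measurement` that `benchmark.Compare` can tabulate. A minimal sketch of that pattern with a plain matmul (requires a CUDA device):

```python
import torch
import torch.utils.benchmark as benchmark

a = torch.randn(16, 4096, dtype=torch.half, device="cuda")
w = torch.randn(4096, 4096, dtype=torch.half, device="cuda")

# Tensors are passed to the timed statement via `globals`.
m = benchmark.Timer(
    stmt="torch.matmul(a, w)",
    globals={"a": a, "w": w, "torch": torch},
    label="Quant Matmul",
    sub_label="example shape",
    description="pytorch_gemm",
).blocked_autorange(min_run_time=1)

print(m)  # collect several Measurements and pass them to benchmark.Compare
```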

+ 22 - 10
tests/benchmarks/benchmark_moe.py → tests/benchmarks/kernels/moe.py

@@ -1,7 +1,7 @@
 import argparse
 import argparse
 import time
 import time
 from datetime import datetime
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, TypedDict
 
 
 import ray
 import ray
 import torch
 import torch
@@ -9,11 +9,21 @@ import triton
 from ray.experimental.tqdm_ray import tqdm
 from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig
 from transformers import AutoConfig
 
 
+from aphrodite.common.utils import FlexibleArgumentParser
 from aphrodite.modeling.layers.fused_moe.fused_moe import *
 from aphrodite.modeling.layers.fused_moe.fused_moe import *
 
 
 
 
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
 def benchmark_config(
 def benchmark_config(
-    config: Dict[str, int],
+    config: BenchmarkConfig,
     num_tokens: int,
     num_tokens: int,
     num_experts: int,
     num_experts: int,
     shard_intermediate_size: int,
     shard_intermediate_size: int,
@@ -92,7 +102,7 @@ def benchmark_config(
     start_event = torch.cuda.Event(enable_timing=True)
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
 
 
-    latencies = []
+    latencies: List[float] = []
     for i in range(num_iters):
     for i in range(num_iters):
         prepare(i)
         prepare(i)
         torch.cuda.synchronize()
         torch.cuda.synchronize()
@@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
     # Reduced search space for faster tuning.
     # Reduced search space for faster tuning.
     # TODO(woosuk): Increase the search space and use a performance model to
     # TODO(woosuk): Increase the search space and use a performance model to
     # prune the search space.
     # prune the search space.
-    configs = []
+    configs: List[BenchmarkConfig] = []
     for num_stages in [2, 3, 4, 5]:
     for num_stages in [2, 3, 4, 5]:
         for block_m in [16, 32, 64, 128, 256]:
         for block_m in [16, 32, 64, 128, 256]:
             for block_k in [64, 128, 256]:
             for block_k in [64, 128, 256]:
@@ -175,8 +185,8 @@ class BenchmarkWorker:
         topk: int,
         topk: int,
         dtype: torch.dtype,
         dtype: torch.dtype,
         use_fp8: bool,
         use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
         best_config = None
         best_config = None
         best_time = float("inf")
         best_time = float("inf")
         for config in tqdm(search_space):
         for config in tqdm(search_space):
@@ -199,10 +209,11 @@ class BenchmarkWorker:
                 best_config = config
                 best_config = config
         now = datetime.now()
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
         return best_config
         return best_config
 
 
 
 
-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:
 
 
 
 
 def save_configs(
 def save_configs(
-    configs: Dict[int, Dict[str, int]],
+    configs: Dict[int, BenchmarkConfig],
     num_experts: int,
     num_experts: int,
     shard_intermediate_size: int,
     shard_intermediate_size: int,
     hidden_size: int,
     hidden_size: int,
@@ -255,7 +266,8 @@ def main(args: argparse.Namespace):
 
 
     if args.batch_size is None:
     if args.batch_size is None:
         batch_sizes = [
         batch_sizes = [
-            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
         ]
         ]
     else:
     else:
         batch_sizes = [args.batch_size]
         batch_sizes = [args.batch_size]
@@ -304,7 +316,7 @@ def main(args: argparse.Namespace):
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
     parser.add_argument("--model",
     parser.add_argument("--model",
                         type=str,
                         type=str,
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")

+ 215 - 0
tests/benchmarks/kernels/paged_attention.py

@@ -0,0 +1,215 @@
+import random
+import time
+from typing import List, Optional
+
+import torch
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import (STR_DTYPE_TO_TORCH_DTYPE,
+                                    FlexibleArgumentParser,
+                                    create_kv_caches_with_random)
+
+NUM_BLOCKS = 1024
+PARTITION_SIZE = 512
+
+
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    seq_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+    device: str = "cuda",
+    kv_cache_dtype: Optional[str] = None,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(num_seqs,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype,
+                        device=device)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads,
+                                   dtype=torch.float,
+                                   device=device)
+
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables_lst: List[List[int]] = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst,
+                                dtype=torch.int,
+                                device=device)
+
+    # Create the KV cache.
+    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+                                                            block_size,
+                                                            1,
+                                                            num_kv_heads,
+                                                            head_size,
+                                                            kv_cache_dtype,
+                                                            dtype,
+                                                            device=device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v2":
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        # Using default kv_scale
+        k_scale = v_scale = 1.0
+
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            elif version == "v2":
+                ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=3, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version",
+                        type=str,
+                        choices=["v1", "v2"],
+                        default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--seq-len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["half", "bfloat16", "float"],
+                        default="half")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
+        default="auto",
+        help="Data type for kv cache storage. If 'auto', will use model "
+        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
+    args = parser.parse_args()
+    print(args)
+
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        seq_len=args.seq_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        kv_cache_dtype=args.kv_cache_dtype,
+    )
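
The setup above reduces to two pieces of integer arithmetic: each sequence gets a block-table row with one entry per `block_size`-token block, and the v2 kernel additionally splits the sequence into `PARTITION_SIZE`-token partitions for its intermediate buffers. A small sketch with the script's defaults (`cache_layout` is an illustrative helper):

```python
NUM_BLOCKS = 1024
PARTITION_SIZE = 512


def cache_layout(seq_len: int, block_size: int = 16):
    """Block-table entries per sequence and v2 partitions, using ceil division."""
    blocks_per_seq = (seq_len + block_size - 1) // block_size
    num_partitions = (seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
    return blocks_per_seq, num_partitions


# With the defaults (--seq-len 4096, --block-size 16): 256 block-table entries per
# sequence (each indexing one of NUM_BLOCKS physical blocks) and 8 v2 partitions.
print(cache_layout(4096))  # (256, 8)
```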

+ 122 - 0
tests/benchmarks/kernels/rope.py

@@ -0,0 +1,122 @@
+from itertools import accumulate
+from typing import List, Optional
+
+import nvtx
+import torch
+
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.modeling.layers.rotary_embedding import (RotaryEmbedding,
+                                                        get_rope)
+
+
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # simulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes: List[RotaryEmbedding] = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # create query offsets for batched RoPE, we concat multiple kv cache
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+
+    # batch queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
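
The batched kernel receives one flattened offset per query telling it where that query's scaling-factor region begins in the concatenated cache; `accumulate` turns the per-factor region sizes (`max_position * factor * 2`, as in the expression above) into those starting offsets. A standalone sketch with the defaults used here:

```python
from itertools import accumulate

max_position = 8192
scaling_factors = [1, 2, 4, 8]

# The start offset of factor i is the total size of the regions before it,
# so the last factor's size is never needed.
region_sizes = [max_position * factor * 2 for factor in scaling_factors[:-1]]
offsets = list(accumulate([0] + region_sizes))
print(offsets)  # [0, 16384, 49152, 114688]
```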

+ 0 - 100
tests/benchmarks/latency.py

@@ -1,100 +0,0 @@
-import argparse
-import time
-
-import numpy as np
-import torch
-from tqdm import tqdm
-
-from aphrodite import LLM, SamplingParams
-
-
-def main(args: argparse.Namespace):  # pylint: disable=redefined-outer-name
-    print(args)
-
-    # Process all the requests in a single batch if possible.
-    # NOTE: If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        max_num_seqs=args.batch_size,
-        max_num_batched_tokens=args.batch_size * args.input_len,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
-        top_p=1.0,
-        use_beam_search=args.use_beam_search,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
-
-    def run_to_completion(profile: bool = False):
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
-
-        end_time = time.perf_counter()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
-
-    print('Warming up...')
-    run_to_completion(profile=False)
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc='Profiling iterations'):
-        latencies.append(run_to_completion(profile=False))
-    print(f'Avg latency: {np.mean(latencies)} seconds')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--model', type=str, default='facebook/opt-125m')
-    parser.add_argument('--tokenizer', type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=['awq', None],
-                        default=None)
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=3,
-                        help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    args = parser.parse_args()
-    main(args)

+ 3 - 10
tests/benchmarks/hashing.py → tests/benchmarks/overheads/hashing.py

@@ -1,8 +1,8 @@
-import argparse
 import cProfile
 import cProfile
 import pstats
 import pstats
 
 
 from aphrodite import LLM, SamplingParams
 from aphrodite import LLM, SamplingParams
+from aphrodite.common.utils import FlexibleArgumentParser
 
 
 # A very long prompt, total number of tokens is about 15k.
 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
@@ -17,7 +17,6 @@ def main(args):
         enable_prefix_caching=True,
         enable_prefix_caching=True,
         tensor_parallel_size=args.tensor_parallel_size,
         tensor_parallel_size=args.tensor_parallel_size,
         use_v2_block_manager=args.use_v2_block_manager,
         use_v2_block_manager=args.use_v2_block_manager,
-        max_model_len=args.max_model_len,
     )
     )
 
 
     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -48,12 +47,10 @@ def main(args):
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the performance of hashing function in'
         description='Benchmark the performance of hashing function in'
         'automatic prefix caching.')
         'automatic prefix caching.')
-    parser.add_argument('--model',
-                        type=str,
-                        default='NousResearch/Meta-Llama-3-8B')
+    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--enable-prefix-caching',
     parser.add_argument('--enable-prefix-caching',
@@ -62,9 +59,5 @@ if __name__ == "__main__":
     parser.add_argument('--use-v2-block-manager',
     parser.add_argument('--use-v2-block-manager',
                         action='store_true',
                         action='store_true',
                         help='Use BlockSpaceMangerV2')
                         help='Use BlockSpaceMangerV2')
-    parser.add_argument('--max-model-len',
-                        type=int,
-                        default=None,
-                        help='maximum length of the model')
     args = parser.parse_args()
     args = parser.parse_args()
     main(args)
     main(args)
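
The script relies on `cProfile`/`pstats`; the profiling calls themselves sit outside the hunks shown above, but the standard pattern looks like the following sketch (the `hash_blocks` function is a stand-in for the hashing work being profiled):

```python
import cProfile
import pstats


def hash_blocks(num_blocks: int = 1_000_000) -> int:
    # Stand-in workload: hash one tuple per "block", roughly what automatic
    # prefix caching does when it fingerprints token blocks.
    return sum(hash((i, "block")) for i in range(num_blocks))


profiler = cProfile.Profile()
profiler.enable()
hash_blocks()
profiler.disable()

# Print the ten most expensive calls by cumulative time.
pstats.Stats(profiler).sort_stats("cumulative").print_stats(10)
```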

+ 62 - 0
tests/benchmarks/overheads/prefix_caching.py

@@ -0,0 +1,62 @@
+import time
+
+from aphrodite import LLM, SamplingParams
+from aphrodite.common.utils import FlexibleArgumentParser
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    start_time = time.time()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    end_time = time.time()
+    print(f"cost time {end_time - start_time}")
+
+
+def main(args):
+    llm = LLM(model=args.model,
+              tokenizer_mode='auto',
+              trust_remote_code=True,
+              enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
+              enable_prefix_caching=args.enable_prefix_caching)
+
+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceManagerV2')
+    args = parser.parse_args()
+    main(args)
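
The two `test_prefix` calls above are the whole experiment: the first pass populates the prefix-cache block pool with the shared table prompt, so with `--enable-prefix-caching` the second pass can reuse those KV blocks and should report a noticeably lower time. A tiny illustrative timer in the same spirit (`stopwatch` is a hypothetical helper, not part of the script):

```python
import time
from contextlib import contextmanager


@contextmanager
def stopwatch(label: str):
    start = time.time()
    yield
    print(f"{label}: {time.time() - start:.2f} s")


# Usage sketch, assuming an `llm` and `sampling_params` set up as in main():
#     with stopwatch("warmup (fills prefix cache)"):
#         llm.generate(prompts, sampling_params=sampling_params)
#     with stopwatch("second pass (cache hits)"):
#         llm.generate(prompts, sampling_params=sampling_params)
```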