
update all benchmarks (#597)

AlpinDale, 7 months ago
parent commit 5dbfc200f2

+ 8 - 0
tests/benchmarks/README.md

@@ -0,0 +1,8 @@
+# Benchmarking Aphrodite
+
+## Downloading the ShareGPT dataset
+
+You can download the dataset by running:
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
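
Once downloaded, the file can be fed to the serving benchmark added later in this commit. A minimal sketch; `--model` is a placeholder value, the remaining flags appear in `tests/benchmarks/engine/serving.py` below:
```bash
python tests/benchmarks/engine/serving.py \
    --backend aphrodite \
    --model <your_model> \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 1000
```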

+ 45 - 13
tests/benchmarks/backend_request_func.py

@@ -4,10 +4,13 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import aiohttp
+import huggingface_hub.constants
 from tqdm.asyncio import tqdm
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
@@ -68,9 +71,13 @@ async def async_request_tgi(
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        #NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = remove_prefix(chunk_bytes, "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
@@ -218,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -258,6 +265,9 @@ async def async_request_openai_completions(
                         else:
                             data = json.loads(chunk)
 
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
                             if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
@@ -266,12 +276,8 @@ async def async_request_openai_completions(
                                     output.ttft = ttft
 
                                 # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
@@ -298,8 +304,8 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -384,12 +390,38 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text
 
 
+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv('APHRODITE_USE_MODELSCOPE', 'False').lower() == 'true':
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
+                                         trust_remote_code=trust_remote_code)
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "aphrodite": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
 }
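
The new `get_model`/`get_tokenizer` helpers are only exercised by the serving benchmark when Aphrodite's own tokenizer utilities cannot be imported (see the try/except fallback in `serving.py` below), and the ModelScope path deliberately skips weight files so only the tokenizer/config is fetched. A hedged sketch of that path; the `--model` value is a placeholder:
```bash
# APHRODITE_USE_MODELSCOPE is the env var read by get_model() above.
APHRODITE_USE_MODELSCOPE=true python tests/benchmarks/engine/serving.py \
    --backend openai \
    --model <your_model> \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json
```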

+ 20 - 19
tests/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py

@@ -8,9 +8,10 @@ from typing import Callable, Iterable, List, Tuple
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
-from .weight_shapes import WEIGHT_SHAPES
+from weight_shapes import WEIGHT_SHAPES
 
 from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -19,18 +20,18 @@ DEFAULT_TP_SIZES = [1]
 # helpers
 
 
-def to_fp8(tensor: torch.tensor) -> torch.tensor:
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
     finfo = torch.finfo(torch.float8_e4m3fn)
     return torch.round(tensor.clamp(
         min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
 
 
-def to_int8(tensor: torch.tensor) -> torch.tensor:
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
 
 
 def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.tensor, torch.tensor]:
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
     a = torch.randn((m, k), device='cuda') * 5
     b = torch.randn((n, k), device='cuda').t() * 5
@@ -46,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
 # impl
 
 
-def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                    scale_b: torch.tensor,
-                    out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                    scale_b: torch.Tensor,
+                    out_dtype: torch.dtype) -> torch.Tensor:
     return torch.mm(a, b)
 
 
-def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                     scale_b: torch.tensor,
-                     out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                     scale_b: torch.Tensor,
+                     out_dtype: torch.dtype) -> torch.Tensor:
     return torch._scaled_mm(a,
                             b,
                             scale_a=scale_a,
@@ -62,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                             out_dtype=out_dtype)
 
 
-def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
-                                scale_a: torch.tensor, scale_b: torch.tensor,
-                                out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
+                                scale_a: torch.Tensor, scale_b: torch.Tensor,
+                                out_dtype: torch.dtype) -> torch.Tensor:
     return torch._scaled_mm(a,
                             b,
                             scale_a=scale_a,
@@ -73,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
                             use_fast_accum=True)
 
 
-def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                 scale_b: torch.tensor,
-                 out_dtype: torch.dtype) -> torch.tensor:
+def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                 scale_b: torch.Tensor,
+                 out_dtype: torch.dtype) -> torch.Tensor:
     return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
 
 
 # bench
-def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
+def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+             scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
              sub_label: str, fn: Callable, description: str) -> TMeasurement:
 
     min_run_time = 1
@@ -293,7 +294,7 @@ if __name__ == '__main__':
             return torch.float8_e4m3fn
         raise ValueError("unsupported dtype")
 
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="""
 Benchmark Cutlass GEMM.
 

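Switching from the relative `.weight_shapes` import to a plain `weight_shapes` import assumes the script is now run directly from its own directory rather than as a package module. A minimal sketch; only `--help` is shown here because the rest of the CLI lies outside this hunk:
```bash
cd tests/benchmarks/cutlass_benchmarks
python w8a8_benchmarks.py --help   # lists the "Benchmark Cutlass GEMM" options
```
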
+ 287 - 0
tests/benchmarks/engine/latency.py

@@ -0,0 +1,287 @@
+"""Benchmark the latency of processing a single batch of requests."""
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from aphrodite import LLM, SamplingParams
+from aphrodite.engine.args_tools import EngineArgs
+from aphrodite.inputs import PromptStrictInputs
+from aphrodite.quantization import QUANTIZATION_METHODS
+from aphrodite.common.utils import FlexibleArgumentParser
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    # NOTE: If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+            args.speculative_draft_tensor_parallel_size,
+        ngram_prompt_lookup_max=args.ngram_prompt_lookup_max,
+        ngram_prompt_lookup_min=args.ngram_prompt_lookup_min,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        enable_prefix_caching=args.enable_prefix_caching,
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_inputs: List[PromptStrictInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        if profile_dir:
+            with torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA,
+                    ],
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                        str(profile_dir))) as p:
+                llm.generate(dummy_inputs,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(dummy_inputs,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)
+
+    if args.profile:
+        profile_dir = args.profile_result_dir
+        if not profile_dir:
+            profile_dir = Path(
+                "."
+            ) / "aphrodite_benchmark_result" / f"latency_result_{time.time()}"
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        run_to_completion(profile_dir=profile_dir)
+        return
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion(profile_dir=None))
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90, 99]
+    percentiles = np.percentile(latencies, percentages)
+    print(f'Avg latency: {np.mean(latencies)} seconds')
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f'{percentage}% percentile latency: {percentile} seconds')
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--speculative-draft-tensor-parallel-size',
+                        '-spec-draft-tp',
+                        type=int,
+                        default=None)
+    parser.add_argument('--ngram-prompt-lookup-max',
+                        type=int,
+                        default=None)
+    parser.add_argument('--ngram-prompt-lookup-min',
+                        type=int,
+                        default=None)
+    parser.add_argument('--tokenizer', type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--enforce-eager',
+                        action='store_true',
+                        help='enforce eager mode and disable CUDA graph')
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        action='store_true',
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
+    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    args = parser.parse_args()
+    main(args)
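
A minimal latency run with the parser's defaults spelled out explicitly; every flag below is defined in the argument parser above:
```bash
python tests/benchmarks/engine/latency.py \
    --model facebook/opt-125m \
    --input-len 32 \
    --output-len 128 \
    --batch-size 8 \
    --num-iters-warmup 10 \
    --num-iters 30 \
    --output-json latency.json
```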

+ 121 - 17
tests/benchmarks/serving.py → tests/benchmarks/engine/serving.py

@@ -2,8 +2,8 @@
 
 On the server side, run one of the following commands:
     Aphrodite OpenAI API server
-    python -m aphrodite.endpoints.openai.api_server \
-        --model <your_model> --swap-space 16 \
+    aphrodite run <your_model> \
+        --swap-space 16 \
         --disable-log-requests
 
     (TGI backend)
@@ -17,7 +17,7 @@ On the client side, run:
         --dataset-path <path to dataset> \
         --request-rate <request_rate> \ # By default <request_rate> is inf
         --num-prompts <num_prompts> # By default <num_prompts> is 1000
-        
+
     when using tgi backend, add
         --endpoint /generate_stream
     to the end of the command above.
@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from aphrodite.transformers_utils.tokenizer import get_tokenizer
+try:
+    from aphrodite.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from aphrodite.common.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
 
 
 @dataclass
@@ -52,10 +60,16 @@ class BenchmarkMetrics:
     output_throughput: float
     mean_ttft_ms: float
     median_ttft_ms: float
+    std_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
+    std_tpot_ms: float
     p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p99_itl_ms: float
 
 
 def sample_sharegpt_requests(
@@ -66,7 +80,6 @@ def sample_sharegpt_requests(
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
-
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -174,6 +187,31 @@ def sample_sonnet_requests(
     return sampled_requests
 
 
+def sample_random_requests(
+        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append(
+            (prompt, int(input_lens[i]), int(output_lens[i])))
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
@@ -185,6 +223,7 @@ async def get_request(
         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue
+
         # Sample the request interval from the exponential distribution.
         interval = np.random.exponential(1.0 / request_rate)
         # The next request will be sent after the interval.
@@ -197,19 +236,27 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note : this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append(
                     (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             completed += 1
         else:
@@ -230,10 +277,16 @@ def calculate_metrics(
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )
 
     return metrics, actual_output_lens
@@ -251,7 +304,7 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
@@ -278,7 +331,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(
@@ -296,7 +349,7 @@ async def benchmark(
                              pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -333,6 +386,10 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                     metrics.median_tpot_ms))
     print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)
 
     result = {
@@ -345,10 +402,16 @@ async def benchmark(
         "output_throughput": metrics.output_throughput,
         "mean_ttft_ms": metrics.mean_ttft_ms,
         "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
         "p99_ttft_ms": metrics.p99_ttft_ms,
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
@@ -427,6 +490,15 @@ def main(args: argparse.Namespace):
                               for prompt, prompt_formatted, prompt_len,
                               output_len in input_requests]
 
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
@@ -445,7 +517,7 @@ def main(args: argparse.Namespace):
 
     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}
 
         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -478,6 +550,8 @@ def main(args: argparse.Namespace):
         # Save to file
         base_model_id = model_id.split("/")[-1]
         file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_filename:
+            file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w") as outfile:
@@ -485,7 +559,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the online serving throughput.")
     parser.add_argument(
         "--backend",
@@ -518,7 +592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet"],
+        choices=["sharegpt", "sonnet", "random"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",
@@ -535,7 +609,7 @@ if __name__ == "__main__":
         "--tokenizer",
         type=str,
         help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--best-of",
@@ -578,6 +652,27 @@ if __name__ == "__main__":
         help=
         "Number of prefix tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
@@ -618,6 +713,15 @@ if __name__ == "__main__":
         help="Specify directory to save benchmark json results."
         "If not specified, results are saved in the current directory.",
     )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
 
     args = parser.parse_args()
     main(args)
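
The new `random` dataset needs no file on disk. A sketch using the flags added above; `--model` is assumed to exist as in the other benchmark scripts and is a placeholder here:
```bash
python tests/benchmarks/engine/serving.py \
    --backend aphrodite \
    --model <your_model> \
    --dataset-name random \
    --random-input-len 1024 \
    --random-output-len 128 \
    --random-range-ratio 0.5 \
    --request-rate 10
```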

+ 13 - 44
tests/benchmarks/throughput.py → tests/benchmarks/engine/throughput.py

@@ -10,6 +10,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from aphrodite.common.utils import FlexibleArgumentParser
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.quantization import QUANTIZATION_METHODS
 
@@ -76,15 +77,10 @@ def run_aphrodite(
     kv_cache_dtype: str,
     quantization_param_path: Optional[str],
     device: str,
-    speculative_model: str,
-    num_speculative_tokens: int,
-    use_v2_block_manager: bool,
-    ngram_prompt_lookup_min: int,
-    ngram_prompt_lookup_max: int,
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
@@ -110,11 +106,6 @@ def run_aphrodite(
         max_num_batched_tokens=max_num_batched_tokens,
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
-        speculative_model=speculative_model,
-        num_speculative_tokens=num_speculative_tokens,
-        use_v2_block_manager=use_v2_block_manager,
-        ngram_prompt_lookup_min=ngram_prompt_lookup_min,
-        ngram_prompt_lookup_max=ngram_prompt_lookup_max,
     )
 
     # Add the requests to the engine.
@@ -238,9 +229,7 @@ def main(args: argparse.Namespace):
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device, args.speculative_model,
-            args.num_speculative_tokens, args.use_v2_block_manager,
-            args.ngram_prompt_lookup_min, args.ngram_prompt_lookup_max,
+            args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.download_dir, args.load_format)
@@ -257,7 +246,7 @@ def main(args: argparse.Namespace):
     total_num_tokens = sum(prompt_len + output_len
                            for _, prompt_len, output_len in requests)
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-          f"{total_num_tokens / elapsed_time:.2f} total tokens/s")
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
     # Output JSON results if specified
     if args.output_json:
@@ -273,7 +262,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
     parser.add_argument("--backend",
                         type=str,
                         choices=["aphrodite", "hf", "mii"],
@@ -337,8 +326,7 @@ if __name__ == "__main__":
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
     parser.add_argument("--enforce-eager",
-                        type=lambda x: (str(x).lower() == 'true'),
-                        default=True,
+                        action="store_true",
                         help="enforce eager execution")
     parser.add_argument(
         '--kv-cache-dtype',
@@ -363,15 +351,15 @@ if __name__ == "__main__":
         type=str,
         default="auto",
         choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for Aphrodite execution, supporting CUDA, OpenVINO, '
-        'CPU, TPU, and XPU.')
+        help='device type for Aphrodite execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
         action='store_true',
-        help="enable automatic prefix caching for aphrodite backend.")
+        help="enable automatic prefix caching for Aphrodite backend.")
     parser.add_argument("--enable-chunked-prefill",
                         action='store_true',
-                        help="enable chunked prefill for aphrodite backend.")
+                        help="enable chunked prefill for Aphrodite backend.")
     parser.add_argument('--max-num-batched-tokens',
                         type=int,
                         default=None,
@@ -413,29 +401,10 @@ if __name__ == "__main__":
         '* "dummy" will initialize the weights with random values, '
         'which is mainly for profiling.\n'
         '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize aphrodite Model script in the Examples'
+        'CoreWeave. See the Tensorize Aphrodite Model script in the Examples'
         'section for more information.\n'
         '* "bitsandbytes" will load the weights using bitsandbytes '
         'quantization.\n')
-    parser.add_argument('--speculative-model',
-                        type=str,
-                        default=None,
-                        help='speculative model for speculative decoding')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='use v2 block manage.')
-    parser.add_argument('--num-speculative-tokens',
-                        type=int,
-                        default=None,
-                        help='number of speculative tokens.')
-    parser.add_argument('--ngram-prompt-lookup-min',
-                        type=int,
-                        default=None,
-                        help='minimum ngram prompt lookup size')
-    parser.add_argument('--ngram-prompt-lookup-max',
-                        type=int,
-                        default=None,
-                        help='maximum ngram prompt lookup size')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
@@ -452,7 +421,7 @@ if __name__ == "__main__":
         if args.hf_max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
-            raise ValueError("Quantization is only for aphrodite backend.")
+            raise ValueError("Quantization is only for Aphrodite backend.")
     elif args.backend == "mii":
         if args.dtype != "auto":
             raise ValueError("dtype must be auto for MII backend.")
@@ -461,7 +430,7 @@ if __name__ == "__main__":
         if args.use_beam_search:
             raise ValueError("Beam search is not supported for MII backend.")
         if args.quantization is not None:
-            raise ValueError("Quantization is only for aphrodite backend.")
+            raise ValueError("Quantization is only for Aphrodite backend.")
         if args.hf_max_batch_size is not None:
             raise ValueError("HF max batch size is only for HF backend.")
         if args.tokenizer != args.model:
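
Note that `--enforce-eager` changes from a string-parsed option that defaulted to `True` into a plain `store_true` flag, so existing invocations need updating; a sketch (other flags elided):
```bash
# before this commit: eager mode was the default and had to be disabled explicitly
python tests/benchmarks/engine/throughput.py ... --enforce-eager False
# after this commit: CUDA graphs are the default; pass the bare flag to enforce eager mode
python tests/benchmarks/engine/throughput.py ... --enforce-eager
```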

+ 303 - 0
tests/benchmarks/kernels/aqlm.py

@@ -0,0 +1,303 @@
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.quantization.aqlm import (dequantize_weight,
+                                         generic_dequantize_gemm,
+                                         get_int_dtype,
+                                         optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+        input: torch.Tensor,  #  [..., in_features]
+        weights: torch.Tensor,
+        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    output = F.linear(input, weights)
+    return output
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+        -1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+
+    n = int(parts.sum().item())
+
+    device = torch.device('cuda:0')
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    parser.add_argument(
+        "--test",
+        type=bool,
+        default=False,
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    with open(filename, "w") as f:
+        sys.stdout = f
+
+        print('m | k | n | n parts', end='')
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+        print('')
+
+        # These are reasonable prefill sizes.
+        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                         (4096, (11008, 11008)), (11008, (4096, )))
+
+        # reasonable ranges for m.
+        for m in [
+                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+                128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]:
+            print(f'{m}', file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                         methods)
+
+        sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
+             methods):
+
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        print(f' | {kernel_dur_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
+               nbooks: int, bits: int, method) -> float:
+
+    n = int(parts.sum().item())
+
+    device = torch.device('cuda:0')
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
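
A sketch of running the new AQLM benchmark with the flags defined in `main()`. Note that `--test` is declared with `type=bool`, so argparse treats any non-empty value (even `False`) as true; omit the flag entirely to benchmark rather than test:
```bash
cd tests/benchmarks/kernels
# benchmark mode: writes ./aqlm_benchmark_1x16.csv
python aqlm.py --nbooks 1 --bits 16
# visual dequant test of the 2x8 kernel instead of benchmarking
python aqlm.py --nbooks 2 --bits 8 --test 1
```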

+ 75 - 0
tests/benchmarks/kernels/benchmark_shapes.py

@@ -0,0 +1,75 @@
+WEIGHT_SHAPES = {
+    "ideal": [[4 * 256 * 32, 256 * 32]],
+    "mistralai/Mistral-7B-v0.1/TP1": [
+        [4096, 6144],
+        [4096, 4096],
+        [4096, 28672],
+        [14336, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP2": [
+        [4096, 3072],
+        [2048, 4096],
+        [4096, 14336],
+        [7168, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP4": [
+        [4096, 1536],
+        [1024, 4096],
+        [4096, 7168],
+        [3584, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP1": [
+        [4096, 12288],
+        [4096, 4096],
+        [4096, 22016],
+        [11008, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP2": [
+        [4096, 6144],
+        [2048, 4096],
+        [4096, 11008],
+        [5504, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP4": [
+        [4096, 3072],
+        [1024, 4096],
+        [4096, 5504],
+        [2752, 4096],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP1": [
+        [5120, 15360],
+        [5120, 5120],
+        [5120, 27648],
+        [13824, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP2": [
+        [5120, 7680],
+        [2560, 5120],
+        [5120, 13824],
+        [6912, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP4": [
+        [5120, 3840],
+        [1280, 5120],
+        [5120, 6912],
+        [3456, 5120],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP1": [
+        [8192, 10240],
+        [8192, 8192],
+        [8192, 57344],
+        [28672, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP2": [
+        [8192, 5120],
+        [4096, 8192],
+        [8192, 28672],
+        [14336, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP4": [
+        [8192, 2560],
+        [2048, 8192],
+        [8192, 14336],
+        [7168, 8192],
+    ],
+}

+ 238 - 0
tests/benchmarks/kernels/marlin.py

@@ -0,0 +1,238 @@
+from typing import List
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from aphrodite.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from aphrodite.quantization.utils.marlin_utils_test import (MarlinWorkspace,
+                                                            marlin_quantize)
+from aphrodite.quantization.utils.marlin_utils_test_24 import \
+    marlin_24_quantize
+from aphrodite.quantization.utils.quant_utils import (gptq_pack,
+                                                      quantize_weights,
+                                                      sort_weights)
+
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+def bench_run(results: List[benchmark.Measurement], model: str,
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
+    label = "Quant Matmul"
+
+    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+                                         group_size, size_m, size_k, size_n))
+
+    print(f"Testing: {sub_label}")
+
+    a = torch.randn(size_m, size_k).to(torch.half).cuda()
+    b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+    a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+    # Marlin quant
+    (
+        marlin_w_ref,
+        marlin_q_w,
+        marlin_s,
+        marlin_g_idx,
+        marlin_sort_indices,
+        marlin_rand_perm,
+    ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+    # Marlin_24 quant
+    (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+    # GPTQ quant
+    (w_ref, q_w, s, g_idx,
+     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+    # For act_order, sort the "weights" and "g_idx"
+    # so that group ids are increasing
+    repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+    if act_order:
+        (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+    # Prepare
+    marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                       GPTQ_MARLIN_MAX_PARALLEL)
+
+    marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+                                          GPTQ_MARLIN_24_MAX_PARALLEL)
+
+    globals = {
+        # Gen params
+        "num_bits": num_bits,
+        "group_size": group_size,
+        "size_m": size_m,
+        "size_n": size_n,
+        "size_k": size_k,
+        "a": a,
+        "a_tmp": a_tmp,
+        # Marlin params
+        "marlin_w_ref": marlin_w_ref,
+        "marlin_q_w": marlin_q_w,
+        "marlin_s": marlin_s,
+        "marlin_g_idx": marlin_g_idx,
+        "marlin_sort_indices": marlin_sort_indices,
+        "marlin_rand_perm": marlin_rand_perm,
+        "marlin_workspace": marlin_workspace,
+        "is_k_full": is_k_full,
+        # Marlin_24 params
+        "marlin_24_w_ref": marlin_24_w_ref,
+        "marlin_24_q_w_comp": marlin_24_q_w_comp,
+        "marlin_24_meta": marlin_24_meta,
+        "marlin_24_s": marlin_24_s,
+        "marlin_24_workspace": marlin_24_workspace,
+        # GPTQ params
+        "q_w_gptq": q_w_gptq,
+        "repack_sort_indices": repack_sort_indices,
+        # Kernels
+        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+        "gptq_marlin_repack": ops.gptq_marlin_repack,
+    }
+
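+    # Minimum measurement time, in seconds, for each blocked_autorange() call.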
+    min_run_time = 1
+
+    # Warmup pytorch
+    for _ in range(5):
+        torch.matmul(a, marlin_w_ref)
+
+    results.append(
+        benchmark.Timer(
+            stmt="torch.matmul(a, marlin_w_ref)",
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="pytorch_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+            and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+        results.append(
+            benchmark.Timer(
+                stmt=
+                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="gptq_marlin_24_gemm",
+            ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_repack",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+
+def main(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    results: List[benchmark.Measurement] = []
+
+    for model in args.models:
+        for layer in WEIGHT_SHAPES[model]:
+            size_k = layer[0]
+            size_n = layer[1]
+
+            if len(args.limit_k) > 0 and size_k not in args.limit_k:
+                continue
+
+            if len(args.limit_n) > 0 and size_n not in args.limit_n:
+                continue
+
+            for act_order in ACT_ORDER_OPTS:
+                if len(args.limit_act_order
+                       ) > 0 and act_order not in args.limit_act_order:
+                    continue
+
+                for is_k_full in K_FULL_OPTS:
+                    if len(args.limit_k_full
+                           ) > 0 and is_k_full not in args.limit_k_full:
+                        continue
+
+                    for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+                        if len(args.limit_num_bits
+                               ) > 0 and num_bits not in args.limit_num_bits:
+                            continue
+
+                        for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+                            if len(
+                                    args.limit_group_size
+                            ) > 0 and group_size not in args.limit_group_size:
+                                continue
+
+                            # For act_order, the group_size must be less than
+                            # size_k
+                            if act_order and (group_size == size_k
+                                              or group_size == -1):
+                                continue
+
+                            for size_m in args.batch_sizes:
+                                bench_run(results, model, act_order, is_k_full,
+                                          num_bits, group_size, size_m, size_k,
+                                          size_n)
+
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+# For quick benchmarking use:
+#   python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark Marlin across specified models/shapes/batches")
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    parser.add_argument("--batch-sizes",
+                        nargs="+",
+                        type=int,
+                        default=DEFAULT_BATCH_SIZES)
+    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+    args = parser.parse_args()
+    main(args)

+ 22 - 10
tests/benchmarks/benchmark_moe.py → tests/benchmarks/kernels/moe.py

@@ -1,7 +1,7 @@
 import argparse
 import time
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, TypedDict
 
 import ray
 import torch
@@ -9,11 +9,21 @@ import triton
 from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig
 
+from aphrodite.common.utils import FlexibleArgumentParser
 from aphrodite.modeling.layers.fused_moe.fused_moe import *
 
 
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
 def benchmark_config(
-    config: Dict[str, int],
+    config: BenchmarkConfig,
     num_tokens: int,
     num_experts: int,
     shard_intermediate_size: int,
@@ -92,7 +102,7 @@ def benchmark_config(
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
 
-    latencies = []
+    latencies: List[float] = []
     for i in range(num_iters):
         prepare(i)
         torch.cuda.synchronize()
@@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
     # Reduced search space for faster tuning.
     # TODO(woosuk): Increase the search space and use a performance model to
     # prune the search space.
-    configs = []
+    configs: List[BenchmarkConfig] = []
     for num_stages in [2, 3, 4, 5]:
         for block_m in [16, 32, 64, 128, 256]:
             for block_k in [64, 128, 256]:
@@ -175,8 +185,8 @@ class BenchmarkWorker:
         topk: int,
         dtype: torch.dtype,
         use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
         best_config = None
         best_time = float("inf")
         for config in tqdm(search_space):
@@ -199,10 +209,11 @@ class BenchmarkWorker:
                 best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
         return best_config
 
 
-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:
 
 
 def save_configs(
-    configs: Dict[int, Dict[str, int]],
+    configs: Dict[int, BenchmarkConfig],
     num_experts: int,
     shard_intermediate_size: int,
     hidden_size: int,
@@ -255,7 +266,8 @@ def main(args: argparse.Namespace):
 
     if args.batch_size is None:
         batch_sizes = [
-            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
         ]
     else:
         batch_sizes = [args.batch_size]
@@ -304,7 +316,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
     parser.add_argument("--model",
                         type=str,
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")

+ 215 - 0
tests/benchmarks/kernels/paged_attention.py

@@ -0,0 +1,215 @@
+import random
+import time
+from typing import List, Optional
+
+import torch
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import (STR_DTYPE_TO_TORCH_DTYPE,
+                                    FlexibleArgumentParser,
+                                    create_kv_caches_with_random)
+
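+# Total number of KV-cache blocks allocated for the benchmark, and the number
+# of tokens each partition covers in the v2 kernel.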
+NUM_BLOCKS = 1024
+PARTITION_SIZE = 512
+
+
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    seq_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+    device: str = "cuda",
+    kv_cache_dtype: Optional[str] = None,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(num_seqs,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype,
+                        device=device)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads,
+                                   dtype=torch.float,
+                                   device=device)
+
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables_lst: List[List[int]] = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst,
+                                dtype=torch.int,
+                                device=device)
+
+    # Create the KV cache.
+    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+                                                            block_size,
+                                                            1,
+                                                            num_kv_heads,
+                                                            head_size,
+                                                            kv_cache_dtype,
+                                                            dtype,
+                                                            device=device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
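+    # v2 splits each sequence into PARTITION_SIZE-token partitions and needs
+    # scratch buffers for the per-partition outputs and softmax statistics.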
+    if version == "v2":
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        # Using default kv_scale
+        k_scale = v_scale = 1.0
+
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            elif version == "v2":
+                ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=3, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version",
+                        type=str,
+                        choices=["v1", "v2"],
+                        default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--seq-len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["half", "bfloat16", "float"],
+                        default="half")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
+        default="auto",
+        help="Data type for kv cache storage. If 'auto', will use model "
+        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
+    args = parser.parse_args()
+    print(args)
+
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        seq_len=args.seq_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        kv_cache_dtype=args.kv_cache_dtype,
+    )

+ 122 - 0
tests/benchmarks/kernels/rope.py

@@ -0,0 +1,122 @@
+from itertools import accumulate
+from typing import List, Optional
+
+import nvtx
+import torch
+
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.modeling.layers.rotary_embedding import (RotaryEmbedding,
+                                                        get_rope)
+
+
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # simulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes: List[RotaryEmbedding] = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # create query offsets for batched RoPE: multiple kv caches are
+    # concatenated, and each query must find the right kv cache for its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+
+    # group queries of the same type together for the non-batched RoPE path
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
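+    # the nvtx ranges below label the two paths so they are easy to locate
+    # in a profiler timeline (e.g. Nsight Systems)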
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )

+ 0 - 100
tests/benchmarks/latency.py

@@ -1,100 +0,0 @@
-import argparse
-import time
-
-import numpy as np
-import torch
-from tqdm import tqdm
-
-from aphrodite import LLM, SamplingParams
-
-
-def main(args: argparse.Namespace):  # pylint: disable=redefined-outer-name
-    print(args)
-
-    # Process all the requests in a single batch if possible.
-    # NOTE: If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        max_num_seqs=args.batch_size,
-        max_num_batched_tokens=args.batch_size * args.input_len,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
-        top_p=1.0,
-        use_beam_search=args.use_beam_search,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
-
-    def run_to_completion(profile: bool = False):
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
-
-        end_time = time.perf_counter()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
-
-    print('Warming up...')
-    run_to_completion(profile=False)
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc='Profiling iterations'):
-        latencies.append(run_to_completion(profile=False))
-    print(f'Avg latency: {np.mean(latencies)} seconds')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--model', type=str, default='facebook/opt-125m')
-    parser.add_argument('--tokenizer', type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=['awq', None],
-                        default=None)
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=3,
-                        help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    args = parser.parse_args()
-    main(args)

+ 3 - 10
tests/benchmarks/hashing.py → tests/benchmarks/overheads/hashing.py

@@ -1,8 +1,8 @@
-import argparse
 import cProfile
 import pstats
 
 from aphrodite import LLM, SamplingParams
+from aphrodite.common.utils import FlexibleArgumentParser
 
 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
@@ -17,7 +17,6 @@ def main(args):
         enable_prefix_caching=True,
         tensor_parallel_size=args.tensor_parallel_size,
         use_v2_block_manager=args.use_v2_block_manager,
-        max_model_len=args.max_model_len,
     )
 
     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -48,12 +47,10 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the performance of hashing function in'
         'automatic prefix caching.')
-    parser.add_argument('--model',
-                        type=str,
-                        default='NousResearch/Meta-Llama-3-8B')
+    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--enable-prefix-caching',
@@ -62,9 +59,5 @@ if __name__ == "__main__":
     parser.add_argument('--use-v2-block-manager',
                         action='store_true',
                         help='Use BlockSpaceMangerV2')
-    parser.add_argument('--max-model-len',
-                        type=int,
-                        default=None,
-                        help='maximum length of the model')
     args = parser.parse_args()
     main(args)

+ 62 - 0
tests/benchmarks/overheads/prefix_caching.py

@@ -0,0 +1,62 @@
+import time
+
+from aphrodite import LLM, SamplingParams
+from aphrodite.common.utils import FlexibleArgumentParser
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    start_time = time.time()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    end_time = time.time()
+    print(f"cost time {end_time - start_time}")
+
+
+def main(args):
+    llm = LLM(model=args.model,
+              tokenizer_mode='auto',
+              trust_remote_code=True,
+              enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
+              enable_prefix_caching=args.enable_prefix_caching)
+
+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
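+    # With --enable-prefix-caching, the warm-up pass above populates the
+    # prefix cache, so the second run measures generation with cache hits.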
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceManagerV2')
+    args = parser.parse_args()
+    main(args)