
update all benchmarks (#597)

AlpinDale
commit 5dbfc200f2

+ 8 - 0
tests/benchmarks/README.md

@@ -0,0 +1,8 @@
+# Benchmarking Aphrodite
+
+## Downloading the ShareGPT dataset
+
+You can download the dataset by running:
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```

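Once the dataset is downloaded, a run typically looks like the sketch below. This is illustrative only, based on the usage docstring of `tests/benchmarks/engine/serving.py` added in this commit; the model name is a placeholder and exact flags or defaults may differ.

```bash
# Start the Aphrodite OpenAI-compatible server
# (command taken from the serving benchmark's docstring in this commit).
aphrodite run <your_model> --swap-space 16 --disable-log-requests

# In a second shell, run the serving benchmark against the downloaded file.
python tests/benchmarks/engine/serving.py \
    --backend aphrodite \
    --model <your_model> \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 1000
```
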
+ 45 - 13
tests/benchmarks/backend_request_func.py

@@ -4,10 +4,13 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import aiohttp
+import huggingface_hub.constants
 from tqdm.asyncio import tqdm
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
@@ -68,9 +71,13 @@ async def async_request_tgi(
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        #NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = remove_prefix(chunk_bytes, "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
@@ -218,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -258,6 +265,9 @@ async def async_request_openai_completions(
                         else:
                             data = json.loads(chunk)
 
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
                             if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
@@ -266,12 +276,8 @@ async def async_request_openai_completions(
                                     output.ttft = ttft
 
                                 # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)
 
                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
@@ -298,8 +304,8 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -384,12 +390,38 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text
 
 
+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv('APHRODITE_USE_MODELSCOPE', 'False').lower() == 'true':
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
+                                         trust_remote_code=trust_remote_code)
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "aphrodite": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
 }

+ 20 - 19
tests/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py

@@ -8,9 +8,10 @@ from typing import Callable, Iterable, List, Tuple
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
-from .weight_shapes import WEIGHT_SHAPES
+from weight_shapes import WEIGHT_SHAPES
 
 from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -19,18 +20,18 @@ DEFAULT_TP_SIZES = [1]
 # helpers
 
 
-def to_fp8(tensor: torch.tensor) -> torch.tensor:
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
     finfo = torch.finfo(torch.float8_e4m3fn)
     return torch.round(tensor.clamp(
         min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
 
 
-def to_int8(tensor: torch.tensor) -> torch.tensor:
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
     return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
 
 
 def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.tensor, torch.tensor]:
+                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
     a = torch.randn((m, k), device='cuda') * 5
     b = torch.randn((n, k), device='cuda').t() * 5
@@ -46,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
 # impl
 
 
-def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                    scale_b: torch.tensor,
-                    out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                    scale_b: torch.Tensor,
+                    out_dtype: torch.dtype) -> torch.Tensor:
     return torch.mm(a, b)
 
 
-def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                     scale_b: torch.tensor,
-                     out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                     scale_b: torch.Tensor,
+                     out_dtype: torch.dtype) -> torch.Tensor:
     return torch._scaled_mm(a,
                             b,
                             scale_a=scale_a,
@@ -62,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                             out_dtype=out_dtype)
 
 
-def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
-                                scale_a: torch.tensor, scale_b: torch.tensor,
-                                out_dtype: torch.dtype) -> torch.tensor:
+def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
+                                scale_a: torch.Tensor, scale_b: torch.Tensor,
+                                out_dtype: torch.dtype) -> torch.Tensor:
     return torch._scaled_mm(a,
                             b,
                             scale_a=scale_a,
@@ -73,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
                             use_fast_accum=True)
 
 
-def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                 scale_b: torch.tensor,
-                 out_dtype: torch.dtype) -> torch.tensor:
+def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                 scale_b: torch.Tensor,
+                 out_dtype: torch.dtype) -> torch.Tensor:
     return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
 
 
 # bench
-def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
+def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+             scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
              sub_label: str, fn: Callable, description: str) -> TMeasurement:
 
     min_run_time = 1
@@ -293,7 +294,7 @@ if __name__ == '__main__':
             return torch.float8_e4m3fn
         raise ValueError("unsupported dtype")
 
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="""
 Benchmark Cutlass GEMM.
 

+ 287 - 0
tests/benchmarks/engine/latency.py

@@ -0,0 +1,287 @@
+"""Benchmark the latency of processing a single batch of requests."""
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from aphrodite import LLM, SamplingParams
+from aphrodite.engine.args_tools import EngineArgs
+from aphrodite.inputs import PromptStrictInputs
+from aphrodite.quantization import QUANTIZATION_METHODS
+from aphrodite.common.utils import FlexibleArgumentParser
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    # NOTE: If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+            args.speculative_draft_tensor_parallel_size,
+        ngram_prompt_lookup_max=args.ngram_prompt_lookup_max,
+        ngram_prompt_lookup_min=args.ngram_prompt_lookup_min,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        enable_prefix_caching=args.enable_prefix_caching,
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    print(sampling_params)
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_inputs: List[PromptStrictInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        if profile_dir:
+            with torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA,
+                    ],
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                        str(profile_dir))) as p:
+                llm.generate(dummy_inputs,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(dummy_inputs,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)
+
+    if args.profile:
+        profile_dir = args.profile_result_dir
+        if not profile_dir:
+            profile_dir = Path(
+                "."
+            ) / "aphrodite_benchmark_result" / f"latency_result_{time.time()}"
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        run_to_completion(profile_dir=profile_dir)
+        return
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion(profile_dir=None))
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90, 99]
+    percentiles = np.percentile(latencies, percentages)
+    print(f'Avg latency: {np.mean(latencies)} seconds')
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f'{percentage}% percentile latency: {percentile} seconds')
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--speculative-draft-tensor-parallel-size',
+                        '-spec-draft-tp',
+                        type=int,
+                        default=None)
+    parser.add_argument('--ngram-prompt-lookup-max',
+                        type=int,
+                        default=None)
+    parser.add_argument('--ngram-prompt-lookup-min',
+                        type=int,
+                        default=None)
+    parser.add_argument('--tokenizer', type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--enforce-eager',
+                        action='store_true',
+                        help='enforce eager mode and disable CUDA graph')
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        action='store_true',
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
+    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    args = parser.parse_args()
+    main(args)

+ 121 - 17
tests/benchmarks/serving.py → tests/benchmarks/engine/serving.py

@@ -2,8 +2,8 @@
 
 On the server side, run one of the following commands:
     Aphrodite OpenAI API server
-    python -m aphrodite.endpoints.openai.api_server \
-        --model <your_model> --swap-space 16 \
+    aphrodite run <your_model> \
+        --swap-space 16 \
         --disable-log-requests
 
     (TGI backend)
@@ -17,7 +17,7 @@ On the client side, run:
         --dataset-path <path to dataset> \
         --request-rate <request_rate> \ # By default <request_rate> is inf
         --num-prompts <num_prompts> # By default <num_prompts> is 1000
-        
+
     when using tgi backend, add
         --endpoint /generate_stream
     to the end of the command above.
@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 
 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from aphrodite.transformers_utils.tokenizer import get_tokenizer
+try:
+    from aphrodite.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from aphrodite.common.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
 
 
 @dataclass
@@ -52,10 +60,16 @@ class BenchmarkMetrics:
     output_throughput: float
     mean_ttft_ms: float
     median_ttft_ms: float
+    std_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
+    std_tpot_ms: float
     p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p99_itl_ms: float
 
 
 def sample_sharegpt_requests(
@@ -66,7 +80,6 @@ def sample_sharegpt_requests(
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
-
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -174,6 +187,31 @@ def sample_sonnet_requests(
     return sampled_requests
 
 
+def sample_random_requests(
+        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append(
+            (prompt, int(input_lens[i]), int(output_lens[i])))
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
@@ -185,6 +223,7 @@ async def get_request(
         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue
+
         # Sample the request interval from the exponential distribution.
         interval = np.random.exponential(1.0 / request_rate)
         # The next request will be sent after the interval.
@@ -197,19 +236,27 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note : this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append(
                     (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             completed += 1
         else:
@@ -230,10 +277,16 @@ def calculate_metrics(
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )
 
     return metrics, actual_output_lens
@@ -251,7 +304,7 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
@@ -278,7 +331,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(
@@ -296,7 +349,7 @@ async def benchmark(
                              pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -333,6 +386,10 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                     metrics.median_tpot_ms))
     print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)
 
     result = {
@@ -345,10 +402,16 @@
         "output_throughput": metrics.output_throughput,
         "mean_ttft_ms": metrics.mean_ttft_ms,
         "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
         "p99_ttft_ms": metrics.p99_ttft_ms,
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
@@ -427,6 +490,15 @@ def main(args: argparse.Namespace):
                               for prompt, prompt_formatted, prompt_len,
                               output_len in input_requests]
 
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
@@ -445,7 +517,7 @@ def main(args: argparse.Namespace):
 
     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}
 
         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -478,6 +550,8 @@ def main(args: argparse.Namespace):
         # Save to file
         base_model_id = model_id.split("/")[-1]
         file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_filename:
+            file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w") as outfile:
@@ -485,7 +559,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the online serving throughput.")
     parser.add_argument(
         "--backend",
         "--backend",
@@ -518,7 +592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet"],
+        choices=["sharegpt", "sonnet", "random"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",
@@ -535,7 +609,7 @@ if __name__ == "__main__":
         "--tokenizer",
         "--tokenizer",
         type=str,
         help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--best-of",
         "--best-of",
@@ -578,6 +652,27 @@ if __name__ == "__main__":
         help=
         "Number of prefix tokens per request, used only for sonnet dataset.",
         "Number of prefix tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
     parser.add_argument(
         "--request-rate",
         "--request-rate",
         type=float,
@@ -618,6 +713,15 @@ if __name__ == "__main__":
         help="Specify directory to save benchmark json results."
         "If not specified, results are saved in the current directory.",
         "If not specified, results are saved in the current directory.",
     )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
 
     args = parser.parse_args()
     main(args)

+ 13 - 44
tests/benchmarks/throughput.py → tests/benchmarks/engine/throughput.py

@@ -10,6 +10,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from aphrodite.common.utils import FlexibleArgumentParser
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.quantization import QUANTIZATION_METHODS
 
@@ -76,15 +77,10 @@ def run_aphrodite(
     kv_cache_dtype: str,
     quantization_param_path: Optional[str],
     device: str,
-    speculative_model: str,
-    num_speculative_tokens: int,
-    use_v2_block_manager: bool,
-    ngram_prompt_lookup_min: int,
-    ngram_prompt_lookup_max: int,
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
@@ -110,11 +106,6 @@ def run_aphrodite(
         max_num_batched_tokens=max_num_batched_tokens,
         distributed_executor_backend=distributed_executor_backend,
         load_format=load_format,
-        speculative_model=speculative_model,
-        num_speculative_tokens=num_speculative_tokens,
-        use_v2_block_manager=use_v2_block_manager,
-        ngram_prompt_lookup_min=ngram_prompt_lookup_min,
-        ngram_prompt_lookup_max=ngram_prompt_lookup_max,
     )
 
     # Add the requests to the engine.
@@ -238,9 +229,7 @@ def main(args: argparse.Namespace):
             args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device, args.speculative_model,
-            args.num_speculative_tokens, args.use_v2_block_manager,
-            args.ngram_prompt_lookup_min, args.ngram_prompt_lookup_max,
+            args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
             args.max_num_batched_tokens, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.download_dir, args.load_format)
@@ -257,7 +246,7 @@ def main(args: argparse.Namespace):
     total_num_tokens = sum(prompt_len + output_len
                            for _, prompt_len, output_len in requests)
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-          f"{total_num_tokens / elapsed_time:.2f} total tokens/s")
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
     # Output JSON results if specified
     if args.output_json:
@@ -273,7 +262,7 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
     parser.add_argument("--backend",
                         type=str,
                         choices=["aphrodite", "hf", "mii"],
@@ -337,8 +326,7 @@ if __name__ == "__main__":
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
     parser.add_argument("--enforce-eager",
-                        type=lambda x: (str(x).lower() == 'true'),
-                        default=True,
+                        action="store_true",
                         help="enforce eager execution")
     parser.add_argument(
         '--kv-cache-dtype',
@@ -363,15 +351,15 @@ if __name__ == "__main__":
         type=str,
         default="auto",
         choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for Aphrodite execution, supporting CUDA, OpenVINO, '
-        'CPU, TPU, and XPU.')
+        help='device type for Aphrodite execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
         "--enable-prefix-caching",
         action='store_true',
-        help="enable automatic prefix caching for aphrodite backend.")
+        help="enable automatic prefix caching for Aphrodite backend.")
     parser.add_argument("--enable-chunked-prefill",
                         action='store_true',
-                        help="enable chunked prefill for aphrodite backend.")
+                        help="enable chunked prefill for Aphrodite backend.")
     parser.add_argument('--max-num-batched-tokens',
                         type=int,
                         default=None,
@@ -413,29 +401,10 @@ if __name__ == "__main__":
         '* "dummy" will initialize the weights with random values, '
         'which is mainly for profiling.\n'
         '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize aphrodite Model script in the Examples'
+        'CoreWeave. See the Tensorize Aphrodite Model script in the Examples'
         'section for more information.\n'
         '* "bitsandbytes" will load the weights using bitsandbytes '
         'quantization.\n')
-    parser.add_argument('--speculative-model',
-                        type=str,
-                        default=None,
-                        help='speculative model for speculative decoding')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='use v2 block manage.')
-    parser.add_argument('--num-speculative-tokens',
-                        type=int,
-                        default=None,
-                        help='number of speculative tokens.')
-    parser.add_argument('--ngram-prompt-lookup-min',
-                        type=int,
-                        default=None,
-                        help='minimum ngram prompt lookup size')
-    parser.add_argument('--ngram-prompt-lookup-max',
-                        type=int,
-                        default=None,
-                        help='maximum ngram prompt lookup size')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
@@ -452,7 +421,7 @@ if __name__ == "__main__":
         if args.hf_max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
-            raise ValueError("Quantization is only for aphrodite backend.")
+            raise ValueError("Quantization is only for Aphrodite backend.")
     elif args.backend == "mii":
         if args.dtype != "auto":
             raise ValueError("dtype must be auto for MII backend.")
@@ -461,7 +430,7 @@ if __name__ == "__main__":
         if args.use_beam_search:
             raise ValueError("Beam search is not supported for MII backend.")
         if args.quantization is not None:
-            raise ValueError("Quantization is only for aphrodite backend.")
+            raise ValueError("Quantization is only for Aphrodite backend.")
         if args.hf_max_batch_size is not None:
             raise ValueError("HF max batch size is only for HF backend.")
         if args.tokenizer != args.model:

+ 303 - 0
tests/benchmarks/kernels/aqlm.py

@@ -0,0 +1,303 @@
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.quantization.aqlm import (dequantize_weight,
+                                         generic_dequantize_gemm,
+                                         get_int_dtype,
+                                         optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+        input: torch.Tensor,  #  [..., in_features]
+        weights: torch.Tensor,
+        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    output = F.linear(input, weights)
+    return output
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is None:
+        output = F.linear(input, weights, bias)
+        orig_shape = output.shape
+        flattened_output = output.view(-1, output.size(-1))
+        f_scales = scales.view(-1, scales.shape[0])
+        b_scales = f_scales.expand(flattened_output.shape[0], -1)
+        flattened_output *= b_scales
+        return flattened_output.view(orig_shape)
+    else:
+        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= b_scales
+        return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+        -1, weights.shape[1])
+    weights *= b_scales
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  #  [..., in_features]
+    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+
+    n = int(parts.sum().item())
+
+    device = torch.device('cuda:0')
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    count = 0
+    for index in range(16):
+        for i in range(8):
+            for book in range(nbooks):
+                codebooks[book, index, 0, i] = count * (10**book)
+            count += 1
+
+    print("codes shape", codes.shape)
+
+    for i in range(16):
+        for book in range(nbooks):
+            codes[0, i, book] = i
+            codes[0, -i, book] = i
+
+    weights = dequantize_weight(codes, codebooks, None)
+    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+    print("weights shape:", weights.shape)
+    print("weights2 shape:", weights2.shape)
+
+    print("weights are:", weights)
+    print("weights2 are:", weights2)
+
+    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+    print("last 128 weights are", weights[0, -128:])
+    print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    with open(filename, "w") as f:
+        sys.stdout = f
+
+        print('m | k | n | n parts', end='')
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+        print('')
+
+        # These are reasonable prefill sizes.
+        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                         (4096, (11008, 11008)), (11008, (4096, )))
+
+        # reasonable ranges for m.
+        for m in [
+                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+                128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]:
+            print(f'{m}', file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                         methods)
+
+        sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
+             methods):
+
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        print(f' | {best_time_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
+               nbooks: int, bits: int, method) -> float:
+
+    n = int(parts.sum().item())
+
+    device = torch.device('cuda:0')
+
+    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+
+    if method is torch_mult:
+        for i in range(num_calls):
+            torch_mult(input, weights, scales)
+    else:
+        for i in range(num_calls):
+            method(input, codes, codebooks, scales, parts, None)
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    sys.exit(main())
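
For reference, `run_grid`/`run_timing` follow the usual CUDA-event timing recipe: record a start event, launch the kernel `num_calls` times, record an end event, synchronize, and report the mean per-call time. A self-contained sketch of that pattern (the `time_kernel_us` helper and the matmul stand-in are illustrative only, not part of the benchmark; requires a CUDA device):

```python
import torch


def time_kernel_us(fn, num_calls: int = 100) -> float:
    """Average per-call time of `fn` in microseconds, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_calls):
        fn()
    end.record()
    end.synchronize()  # wait for all queued kernels before reading the timer
    return 1000.0 * start.elapsed_time(end) / num_calls  # elapsed_time() is in ms


if __name__ == "__main__":
    a = torch.randn(1024, 4096, dtype=torch.float16, device="cuda")
    w = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
    print(f"{time_kernel_us(lambda: a @ w):.1f} us per matmul")
```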

+ 75 - 0
tests/benchmarks/kernels/benchmark_shapes.py

@@ -0,0 +1,75 @@
+WEIGHT_SHAPES = {
+    "ideal": [[4 * 256 * 32, 256 * 32]],
+    "mistralai/Mistral-7B-v0.1/TP1": [
+        [4096, 6144],
+        [4096, 4096],
+        [4096, 28672],
+        [14336, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP2": [
+        [4096, 3072],
+        [2048, 4096],
+        [4096, 14336],
+        [7168, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP4": [
+        [4096, 1536],
+        [1024, 4096],
+        [4096, 7168],
+        [3584, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP1": [
+        [4096, 12288],
+        [4096, 4096],
+        [4096, 22016],
+        [11008, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP2": [
+        [4096, 6144],
+        [2048, 4096],
+        [4096, 11008],
+        [5504, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP4": [
+        [4096, 3072],
+        [1024, 4096],
+        [4096, 5504],
+        [2752, 4096],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP1": [
+        [5120, 15360],
+        [5120, 5120],
+        [5120, 27648],
+        [13824, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP2": [
+        [5120, 7680],
+        [2560, 5120],
+        [5120, 13824],
+        [6912, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP4": [
+        [5120, 3840],
+        [1280, 5120],
+        [5120, 6912],
+        [3456, 5120],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP1": [
+        [8192, 10240],
+        [8192, 8192],
+        [8192, 57344],
+        [28672, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP2": [
+        [8192, 5120],
+        [4096, 8192],
+        [8192, 28672],
+        [14336, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP4": [
+        [8192, 2560],
+        [2048, 8192],
+        [8192, 14336],
+        [7168, 8192],
+    ],
+}
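
The TP2/TP4 entries above are derived from the TP1 shapes: the output dimension is sharded for the column-parallel layers (rows 0 and 2, which appear to correspond to the QKV and gate/up projections) and the input dimension for the row-parallel layers (rows 1 and 3, o_proj and down_proj). A small sketch that reproduces the tables (the `shard_shapes` helper is illustrative only and assumes `benchmark_shapes.py` is importable from the working directory):

```python
from typing import List

from benchmark_shapes import WEIGHT_SHAPES


def shard_shapes(tp1_shapes: List[List[int]], tp: int) -> List[List[int]]:
    """Derive tensor-parallel [K, N] shapes from the TP1 entries."""
    sharded = []
    for i, (k, n) in enumerate(tp1_shapes):
        if i % 2 == 0:
            sharded.append([k, n // tp])  # column-parallel: shard the output dim
        else:
            sharded.append([k // tp, n])  # row-parallel: shard the input dim
    return sharded


tp1 = WEIGHT_SHAPES["mistralai/Mistral-7B-v0.1/TP1"]
assert shard_shapes(tp1, 2) == WEIGHT_SHAPES["mistralai/Mistral-7B-v0.1/TP2"]
assert shard_shapes(tp1, 4) == WEIGHT_SHAPES["mistralai/Mistral-7B-v0.1/TP4"]
```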

+ 238 - 0
tests/benchmarks/kernels/marlin.py

@@ -0,0 +1,238 @@
+from typing import List
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from aphrodite.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from aphrodite.quantization.utils.marlin_utils_test import (MarlinWorkspace,
+                                                            marlin_quantize)
+from aphrodite.quantization.utils.marlin_utils_test_24 import \
+    marlin_24_quantize
+from aphrodite.quantization.utils.quant_utils import (gptq_pack,
+                                                      quantize_weights,
+                                                      sort_weights)
+
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+def bench_run(results: List[benchmark.Measurement], model: str,
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
+    label = "Quant Matmul"
+
+    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+                                         group_size, size_m, size_k, size_n))
+
+    print(f"Testing: {sub_label}")
+
+    a = torch.randn(size_m, size_k).to(torch.half).cuda()
+    b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+    a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+    # Marlin quant
+    (
+        marlin_w_ref,
+        marlin_q_w,
+        marlin_s,
+        marlin_g_idx,
+        marlin_sort_indices,
+        marlin_rand_perm,
+    ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+    # Marlin_24 quant
+    (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+    # GPTQ quant
+    (w_ref, q_w, s, g_idx,
+     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+    # For act_order, sort the "weights" and "g_idx"
+    # so that group ids are increasing
+    repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+    if act_order:
+        (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+    # Prepare
+    marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                       GPTQ_MARLIN_MAX_PARALLEL)
+
+    marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+                                          GPTQ_MARLIN_24_MAX_PARALLEL)
+
+    globals = {
+        # Gen params
+        "num_bits": num_bits,
+        "group_size": group_size,
+        "size_m": size_m,
+        "size_n": size_n,
+        "size_k": size_k,
+        "a": a,
+        "a_tmp": a_tmp,
+        # Marlin params
+        "marlin_w_ref": marlin_w_ref,
+        "marlin_q_w": marlin_q_w,
+        "marlin_s": marlin_s,
+        "marlin_g_idx": marlin_g_idx,
+        "marlin_sort_indices": marlin_sort_indices,
+        "marlin_rand_perm": marlin_rand_perm,
+        "marlin_workspace": marlin_workspace,
+        "is_k_full": is_k_full,
+        # Marlin_24 params
+        "marlin_24_w_ref": marlin_24_w_ref,
+        "marlin_24_q_w_comp": marlin_24_q_w_comp,
+        "marlin_24_meta": marlin_24_meta,
+        "marlin_24_s": marlin_24_s,
+        "marlin_24_workspace": marlin_24_workspace,
+        # GPTQ params
+        "q_w_gptq": q_w_gptq,
+        "repack_sort_indices": repack_sort_indices,
+        # Kernels
+        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+        "gptq_marlin_repack": ops.gptq_marlin_repack,
+    }
+
+    min_run_time = 1
+
+    # Warmup pytorch
+    for i in range(5):
+        torch.matmul(a, marlin_w_ref)
+
+    results.append(
+        benchmark.Timer(
+            stmt="torch.matmul(a, marlin_w_ref)",
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="pytorch_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+            and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+        results.append(
+            benchmark.Timer(
+                stmt=
+                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="gptq_marlin_24_gemm",
+            ).blocked_autorange(min_run_time=min_run_time))
+
+    results.append(
+        benchmark.Timer(
+            stmt=
+            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_repack",
+        ).blocked_autorange(min_run_time=min_run_time))
+
+
+def main(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    results: List[benchmark.Measurement] = []
+
+    for model in args.models:
+        for layer in WEIGHT_SHAPES[model]:
+            size_k = layer[0]
+            size_n = layer[1]
+
+            if len(args.limit_k) > 0 and size_k not in args.limit_k:
+                continue
+
+            if len(args.limit_n) > 0 and size_n not in args.limit_n:
+                continue
+
+            for act_order in ACT_ORDER_OPTS:
+                if len(args.limit_act_order
+                       ) > 0 and act_order not in args.limit_act_order:
+                    continue
+
+                for is_k_full in K_FULL_OPTS:
+                    if len(args.limit_k_full
+                           ) > 0 and is_k_full not in args.limit_k_full:
+                        continue
+
+                    for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+                        if len(args.limit_num_bits
+                               ) > 0 and num_bits not in args.limit_num_bits:
+                            continue
+
+                        for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+                            if len(
+                                    args.limit_group_size
+                            ) > 0 and group_size not in args.limit_group_size:
+                                continue
+
+                            # For act_order, the group_size must be less than
+                            # size_k
+                            if act_order and (group_size == size_k
+                                              or group_size == -1):
+                                continue
+
+                            for size_m in args.batch_sizes:
+                                bench_run(results, model, act_order, is_k_full,
+                                          num_bits, group_size, size_m, size_k,
+                                          size_n)
+
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+# For quick benchmarking use:
+#   python marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark Marlin across specified models/shapes/batches")
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    parser.add_argument("--batch-sizes",
+                        nargs="+",
+                        type=int,
+                        default=DEFAULT_BATCH_SIZES)
+    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+    args = parser.parse_args()
+    main(args)
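
All of the measurements above go through `torch.utils.benchmark`: the kernel call is given as a statement string evaluated against an explicit `globals` dict, and `blocked_autorange` keeps sampling until at least `min_run_time` seconds of data are collected, returning a `Measurement` that `benchmark.Compare` can tabulate. A minimal sketch of that pattern with a plain matmul (requires a CUDA device):

```python
import torch
import torch.utils.benchmark as benchmark

a = torch.randn(16, 4096, dtype=torch.half, device="cuda")
w = torch.randn(4096, 4096, dtype=torch.half, device="cuda")

# Tensors are passed to the timed statement via `globals`.
m = benchmark.Timer(
    stmt="torch.matmul(a, w)",
    globals={"a": a, "w": w, "torch": torch},
    label="Quant Matmul",
    sub_label="example shape",
    description="pytorch_gemm",
).blocked_autorange(min_run_time=1)

print(m)  # collect several Measurements and pass them to benchmark.Compare
```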

+ 22 - 10
tests/benchmarks/benchmark_moe.py → tests/benchmarks/kernels/moe.py

@@ -1,7 +1,7 @@
 import argparse
 import argparse
 import time
 import time
 from datetime import datetime
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, TypedDict
 
 
 import ray
 import ray
 import torch
 import torch
@@ -9,11 +9,21 @@ import triton
 from ray.experimental.tqdm_ray import tqdm
 from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig
 from transformers import AutoConfig
 
 
+from aphrodite.common.utils import FlexibleArgumentParser
 from aphrodite.modeling.layers.fused_moe.fused_moe import *
 from aphrodite.modeling.layers.fused_moe.fused_moe import *
 
 
 
 
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
 def benchmark_config(
 def benchmark_config(
-    config: Dict[str, int],
+    config: BenchmarkConfig,
     num_tokens: int,
     num_tokens: int,
     num_experts: int,
     num_experts: int,
     shard_intermediate_size: int,
     shard_intermediate_size: int,
@@ -92,7 +102,7 @@ def benchmark_config(
     start_event = torch.cuda.Event(enable_timing=True)
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
 
 
-    latencies = []
+    latencies: List[float] = []
     for i in range(num_iters):
     for i in range(num_iters):
         prepare(i)
         prepare(i)
         torch.cuda.synchronize()
         torch.cuda.synchronize()
@@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
     # Reduced search space for faster tuning.
     # Reduced search space for faster tuning.
     # TODO(woosuk): Increase the search space and use a performance model to
     # TODO(woosuk): Increase the search space and use a performance model to
     # prune the search space.
     # prune the search space.
-    configs = []
+    configs: List[BenchmarkConfig] = []
     for num_stages in [2, 3, 4, 5]:
     for num_stages in [2, 3, 4, 5]:
         for block_m in [16, 32, 64, 128, 256]:
         for block_m in [16, 32, 64, 128, 256]:
             for block_k in [64, 128, 256]:
             for block_k in [64, 128, 256]:
@@ -175,8 +185,8 @@ class BenchmarkWorker:
         topk: int,
         topk: int,
         dtype: torch.dtype,
         dtype: torch.dtype,
         use_fp8: bool,
         use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
         best_config = None
         best_config = None
         best_time = float("inf")
         best_time = float("inf")
         for config in tqdm(search_space):
         for config in tqdm(search_space):
@@ -199,10 +209,11 @@ class BenchmarkWorker:
                 best_config = config
                 best_config = config
         now = datetime.now()
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
         return best_config
         return best_config
 
 
 
 
-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:
 
 
 
 
 def save_configs(
 def save_configs(
-    configs: Dict[int, Dict[str, int]],
+    configs: Dict[int, BenchmarkConfig],
     num_experts: int,
     num_experts: int,
     shard_intermediate_size: int,
     shard_intermediate_size: int,
     hidden_size: int,
     hidden_size: int,
@@ -255,7 +266,8 @@ def main(args: argparse.Namespace):
 
 
     if args.batch_size is None:
     if args.batch_size is None:
         batch_sizes = [
         batch_sizes = [
-            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
         ]
         ]
     else:
     else:
         batch_sizes = [args.batch_size]
         batch_sizes = [args.batch_size]
@@ -304,7 +316,7 @@ def main(args: argparse.Namespace):
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = FlexibleArgumentParser()
     parser.add_argument("--model",
     parser.add_argument("--model",
                         type=str,
                         type=str,
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")
                         default="mistralai/Mixtral-8x7B-Instruct-v0.1")

+ 215 - 0
tests/benchmarks/kernels/paged_attention.py

@@ -0,0 +1,215 @@
+import random
+import time
+from typing import List, Optional
+
+import torch
+
+from aphrodite import _custom_ops as ops
+from aphrodite.common.utils import (STR_DTYPE_TO_TORCH_DTYPE,
+                                    FlexibleArgumentParser,
+                                    create_kv_caches_with_random)
+
+NUM_BLOCKS = 1024
+PARTITION_SIZE = 512
+
+
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    seq_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+    device: str = "cuda",
+    kv_cache_dtype: Optional[str] = None,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(num_seqs,
+                        num_query_heads,
+                        head_size,
+                        dtype=dtype,
+                        device=device)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads,
+                                   dtype=torch.float,
+                                   device=device)
+
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables_lst: List[List[int]] = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1)
+            for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables_lst.append(block_table)
+
+    block_tables = torch.tensor(block_tables_lst,
+                                dtype=torch.int,
+                                device=device)
+
+    # Create the KV cache.
+    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+                                                            block_size,
+                                                            1,
+                                                            num_kv_heads,
+                                                            head_size,
+                                                            kv_cache_dtype,
+                                                            dtype,
+                                                            device=device)
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v2":
+        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        # Using default kv_scale
+        k_scale = v_scale = 1.0
+
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            elif version == "v2":
+                ops.paged_attention_v2(
+                    output,
+                    exp_sums,
+                    max_logits,
+                    tmp_output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=3, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version",
+                        type=str,
+                        choices=["v1", "v2"],
+                        default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--seq-len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["half", "bfloat16", "float"],
+                        default="half")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
+        default="auto",
+        help="Data type for kv cache storage. If 'auto', will use model "
+        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
+    args = parser.parse_args()
+    print(args)
+
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        seq_len=args.seq_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        kv_cache_dtype=args.kv_cache_dtype,
+    )
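
The setup above reduces to two pieces of integer arithmetic: each sequence gets a block-table row with one entry per `block_size`-token block, and the v2 kernel additionally splits the sequence into `PARTITION_SIZE`-token partitions for its intermediate buffers. A small sketch with the script's defaults (`cache_layout` is an illustrative helper):

```python
NUM_BLOCKS = 1024
PARTITION_SIZE = 512


def cache_layout(seq_len: int, block_size: int = 16):
    """Block-table entries per sequence and v2 partitions, using ceil division."""
    blocks_per_seq = (seq_len + block_size - 1) // block_size
    num_partitions = (seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
    return blocks_per_seq, num_partitions


# With the defaults (--seq-len 4096, --block-size 16): 256 block-table entries per
# sequence (each indexing one of NUM_BLOCKS physical blocks) and 8 v2 partitions.
print(cache_layout(4096))  # (256, 8)
```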

+ 122 - 0
tests/benchmarks/kernels/rope.py

@@ -0,0 +1,122 @@
+from itertools import accumulate
+from typing import List, Optional
+
+import nvtx
+import torch
+
+from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.modeling.layers.rotary_embedding import (RotaryEmbedding,
+                                                        get_rope)
+
+
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # simulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes: List[RotaryEmbedding] = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # create query offsets for batched RoPE, we concat multiple kv cache
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+
+    # batch queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 192, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
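
The batched kernel receives one flattened offset per query telling it where that query's scaling-factor region begins in the concatenated cache; `accumulate` turns the per-factor region sizes (`max_position * factor * 2`, as in the expression above) into those starting offsets. A standalone sketch with the defaults used here:

```python
from itertools import accumulate

max_position = 8192
scaling_factors = [1, 2, 4, 8]

# The start offset of factor i is the total size of the regions before it,
# so the last factor's size is never needed.
region_sizes = [max_position * factor * 2 for factor in scaling_factors[:-1]]
offsets = list(accumulate([0] + region_sizes))
print(offsets)  # [0, 16384, 49152, 114688]
```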

+ 0 - 100
tests/benchmarks/latency.py

@@ -1,100 +0,0 @@
-import argparse
-import time
-
-import numpy as np
-import torch
-from tqdm import tqdm
-
-from aphrodite import LLM, SamplingParams
-
-
-def main(args: argparse.Namespace):  # pylint: disable=redefined-outer-name
-    print(args)
-
-    # Process all the requests in a single batch if possible.
-    # NOTE: If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        max_num_seqs=args.batch_size,
-        max_num_batched_tokens=args.batch_size * args.input_len,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
-        top_p=1.0,
-        use_beam_search=args.use_beam_search,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
-
-    def run_to_completion(profile: bool = False):
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
-
-        end_time = time.perf_counter()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
-
-    print('Warming up...')
-    run_to_completion(profile=False)
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc='Profiling iterations'):
-        latencies.append(run_to_completion(profile=False))
-    print(f'Avg latency: {np.mean(latencies)} seconds')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--model', type=str, default='facebook/opt-125m')
-    parser.add_argument('--tokenizer', type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=['awq', None],
-                        default=None)
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=3,
-                        help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    args = parser.parse_args()
-    main(args)

+ 3 - 10
tests/benchmarks/hashing.py → tests/benchmarks/overheads/hashing.py

@@ -1,8 +1,8 @@
-import argparse
 import cProfile
 import cProfile
 import pstats
 import pstats
 
 
 from aphrodite import LLM, SamplingParams
 from aphrodite import LLM, SamplingParams
+from aphrodite.common.utils import FlexibleArgumentParser
 
 
 # A very long prompt, total number of tokens is about 15k.
 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
@@ -17,7 +17,6 @@ def main(args):
         enable_prefix_caching=True,
         enable_prefix_caching=True,
         tensor_parallel_size=args.tensor_parallel_size,
         tensor_parallel_size=args.tensor_parallel_size,
         use_v2_block_manager=args.use_v2_block_manager,
         use_v2_block_manager=args.use_v2_block_manager,
-        max_model_len=args.max_model_len,
     )
     )
 
 
     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -48,12 +47,10 @@ def main(args):
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the performance of hashing function in'
         description='Benchmark the performance of hashing function in'
         'automatic prefix caching.')
         'automatic prefix caching.')
-    parser.add_argument('--model',
-                        type=str,
-                        default='NousResearch/Meta-Llama-3-8B')
+    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--enable-prefix-caching',
     parser.add_argument('--enable-prefix-caching',
@@ -62,9 +59,5 @@ if __name__ == "__main__":
     parser.add_argument('--use-v2-block-manager',
     parser.add_argument('--use-v2-block-manager',
                         action='store_true',
                         action='store_true',
                         help='Use BlockSpaceMangerV2')
                         help='Use BlockSpaceMangerV2')
-    parser.add_argument('--max-model-len',
-                        type=int,
-                        default=None,
-                        help='maximum length of the model')
     args = parser.parse_args()
     args = parser.parse_args()
     main(args)
     main(args)
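
The script relies on `cProfile`/`pstats`; the profiling calls themselves sit outside the hunks shown above, but the standard pattern looks like the following sketch (the `hash_blocks` function is a stand-in for the hashing work being profiled):

```python
import cProfile
import pstats


def hash_blocks(num_blocks: int = 1_000_000) -> int:
    # Stand-in workload: hash one tuple per "block", roughly what automatic
    # prefix caching does when it fingerprints token blocks.
    return sum(hash((i, "block")) for i in range(num_blocks))


profiler = cProfile.Profile()
profiler.enable()
hash_blocks()
profiler.disable()

# Print the ten most expensive calls by cumulative time.
pstats.Stats(profiler).sort_stats("cumulative").print_stats(10)
```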

+ 62 - 0
tests/benchmarks/overheads/prefix_caching.py

@@ -0,0 +1,62 @@
+import time
+
+from aphrodite import LLM, SamplingParams
+from aphrodite.common.utils import FlexibleArgumentParser
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    start_time = time.time()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    end_time = time.time()
+    print(f"cost time {end_time - start_time}")
+
+
+def main(args):
+    llm = LLM(model=args.model,
+              tokenizer_mode='auto',
+              trust_remote_code=True,
+              enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
+              enable_prefix_caching=args.enable_prefix_caching)
+
+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceManagerV2')
+    args = parser.parse_args()
+    main(args)
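
The two `test_prefix` calls above are the whole experiment: the first pass populates the prefix-cache block pool with the shared table prompt, so with `--enable-prefix-caching` the second pass can reuse those KV blocks and should report a noticeably lower time. A tiny illustrative timer in the same spirit (`stopwatch` is a hypothetical helper, not part of the script):

```python
import time
from contextlib import contextmanager


@contextmanager
def stopwatch(label: str):
    start = time.time()
    yield
    print(f"{label}: {time.time() - start:.2f} s")


# Usage sketch, assuming an `llm` and `sampling_params` set up as in main():
#     with stopwatch("warmup (fills prefix cache)"):
#         llm.generate(prompts, sampling_params=sampling_params)
#     with stopwatch("second pass (cache hits)"):
#         llm.generate(prompts, sampling_params=sampling_params)
```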