
chore: attention refactor and upstream sync apr01 (#365)

* add speculative workers

* sync core processor logic with upstream

* add compiled dag api, sync logic

* do not log prompt token IDs

* add cache config to metrics

* add compiled dag in ray utils

* do not import cache ops if neuron backend is detected

* hope i didn't break anything

* modify worker

* add neuron worker

* add file lock

* remove lora loading from loader.py

* add neuronx model loader

* pick the correct loader based on device

* add separate attention backends

* migrate all models to the new attention backend

* neuron patch for llama

* modify rejection sampling

* add optimized moe configs

* add script for benchmarking local GPU for optimal moe config

* fix sampler output return

* logits as hidden state for lora layer

* health check in api server

* correct torch version in pyproject

* move config to where it belongs

* missing imports

* temporarily remove int8 kv cache

* logger fixes, set max log len to 0

* gracefully exit with keyboard interrupt

* explicitly abort a request when a cancellederror is raised

* remove unfinished configs

* update rocm's requirements

* formatting

* yapf

* parse version correctly for ray

* yapf again

* set enforce eager to true by default

* do not enable memory pinning when neuron
AlpinDale committed 11 months ago (commit f8dfac6372)

100 changed files with 10036 additions and 2571 deletions
  1. .gitignore (+0 -1)
  2. aphrodite/common/config.py (+98 -33)
  3. aphrodite/common/logger.py (+10 -8)
  4. aphrodite/common/outputs.py (+47 -30)
  5. aphrodite/common/sampling_params.py (+13 -0)
  6. aphrodite/common/sequence.py (+71 -21)
  7. aphrodite/common/utils.py (+67 -27)
  8. aphrodite/endpoints/llm.py (+1 -1)
  9. aphrodite/endpoints/openai/api_server.py (+104 -95)
  10. aphrodite/engine/aphrodite_engine.py (+275 -97)
  11. aphrodite/engine/args_tools.py (+373 -240)
  12. aphrodite/engine/async_aphrodite.py (+45 -28)
  13. aphrodite/engine/metrics.py (+79 -21)
  14. aphrodite/engine/ray_tools.py (+21 -5)
  15. aphrodite/executor/__init__.py (+0 -0)
  16. aphrodite/executor/executor_base.py (+76 -0)
  17. aphrodite/executor/gpu_executor.py (+153 -0)
  18. aphrodite/executor/neuron_executor.py (+78 -0)
  19. aphrodite/executor/ray_gpu_executor.py (+452 -0)
  20. aphrodite/executor/utils.py (+13 -0)
  21. aphrodite/lora/layers.py (+4 -0)
  22. aphrodite/modeling/hf_downloader.py (+8 -3)
  23. aphrodite/modeling/layers/attention.py (+0 -354)
  24. aphrodite/modeling/layers/attention/__init__.py (+93 -0)
  25. aphrodite/modeling/layers/attention/backends/__init__.py (+0 -0)
  26. aphrodite/modeling/layers/attention/backends/flash_attn.py (+121 -0)
  27. aphrodite/modeling/layers/attention/backends/xformers.py (+255 -0)
  28. aphrodite/modeling/layers/attention/ops/__init__.py (+0 -0)
  29. aphrodite/modeling/layers/attention/ops/paged_attn.py (+138 -0)
  30. aphrodite/modeling/layers/attention/ops/prefix_prefill.py (+28 -12)
  31. aphrodite/modeling/layers/fused_moe/__init__.py (+8 -0)
  32. aphrodite/modeling/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json (+146 -0)
  33. aphrodite/modeling/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json (+146 -0)
  34. aphrodite/modeling/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json (+146 -0)
  35. aphrodite/modeling/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json (+146 -0)
  36. aphrodite/modeling/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json (+146 -0)
  37. aphrodite/modeling/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json (+146 -0)
  38. aphrodite/modeling/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json (+146 -0)
  39. aphrodite/modeling/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json (+146 -0)
  40. aphrodite/modeling/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json (+146 -0)
  41. aphrodite/modeling/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json (+146 -0)
  42. aphrodite/modeling/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json (+146 -0)
  43. aphrodite/modeling/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json (+146 -0)
  44. aphrodite/modeling/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json (+146 -0)
  45. aphrodite/modeling/layers/fused_moe/configs/README (+9 -0)
  46. aphrodite/modeling/layers/fused_moe/fused_moe.py (+120 -58)
  47. aphrodite/modeling/layers/rejection.py (+57 -18)
  48. aphrodite/modeling/layers/sampler.py (+1 -2)
  49. aphrodite/modeling/loader.py (+62 -24)
  50. aphrodite/modeling/metadata.py (+5 -4)
  51. aphrodite/modeling/models/baichuan.py (+3 -4)
  52. aphrodite/modeling/models/bloom.py (+5 -5)
  53. aphrodite/modeling/models/chatglm.py (+2 -2)
  54. aphrodite/modeling/models/cohere.py (+1 -1)
  55. aphrodite/modeling/models/deepseek.py (+3 -3)
  56. aphrodite/modeling/models/falcon.py (+14 -14)
  57. aphrodite/modeling/models/gemma.py (+2 -2)
  58. aphrodite/modeling/models/gpt2.py (+2 -4)
  59. aphrodite/modeling/models/gpt_bigcode.py (+5 -5)
  60. aphrodite/modeling/models/gpt_j.py (+2 -2)
  61. aphrodite/modeling/models/gpt_neox.py (+2 -2)
  62. aphrodite/modeling/models/internlm2.py (+2 -2)
  63. aphrodite/modeling/models/llama.py (+16 -9)
  64. aphrodite/modeling/models/mixtral.py (+3 -3)
  65. aphrodite/modeling/models/mixtral_quant.py (+2 -2)
  66. aphrodite/modeling/models/mpt.py (+6 -6)
  67. aphrodite/modeling/models/neuron/llama.py (+79 -0)
  68. aphrodite/modeling/models/olmo.py (+4 -4)
  69. aphrodite/modeling/models/opt.py (+4 -4)
  70. aphrodite/modeling/models/phi.py (+2 -2)
  71. aphrodite/modeling/models/qwen.py (+2 -2)
  72. aphrodite/modeling/models/qwen2.py (+2 -2)
  73. aphrodite/modeling/models/stablelm.py (+2 -2)
  74. aphrodite/modeling/neuron_loader.py (+70 -0)
  75. aphrodite/modeling/sampling_metadata.py (+2 -2)
  76. aphrodite/modeling/utils.py (+17 -0)
  77. aphrodite/processing/block_manager.py (+39 -48)
  78. aphrodite/processing/evictor.py (+2 -3)
  79. aphrodite/processing/scheduler.py (+1 -4)
  80. aphrodite/spec_decode/batch_expansion.py (+398 -0)
  81. aphrodite/spec_decode/interfaces.py (+77 -0)
  82. aphrodite/spec_decode/metrics.py (+175 -0)
  83. aphrodite/spec_decode/multi_step_worker.py (+392 -0)
  84. aphrodite/spec_decode/spec_decode_worker.py (+394 -0)
  85. aphrodite/spec_decode/util.py (+101 -0)
  86. aphrodite/task_handler/cache_engine.py (+9 -2)
  87. aphrodite/task_handler/model_runner.py (+74 -56)
  88. aphrodite/task_handler/neuron_worker.py (+204 -0)
  89. aphrodite/task_handler/worker.py (+30 -14)
  90. kernels/attention/attention_dtypes.h (+1 -2)
  91. kernels/attention/attention_kernels.cu (+936 -1014)
  92. kernels/attention/dtype_float32.cuh (+255 -262)
  93. kernels/backup/README (+1 -0)
  94. kernels/backup/attention_dtypes.h (+8 -0)
  95. kernels/backup/attention_kernels.cu (+1032 -0)
  96. kernels/backup/cache.h (+39 -0)
  97. kernels/backup/cache_kernels.cu (+512 -0)
  98. kernels/backup/dispatch_utils.h (+39 -0)
  99. kernels/backup/dtype_float32.cuh (+280 -0)
  100. kernels/backup/dtype_int8.cuh (+0 -0)

+ 0 - 1
.gitignore

@@ -6,7 +6,6 @@ repos
 *.so
 .conda
 build
-*.json
 dist*
 .VSCodeCounter
 conda/

+ 98 - 33
aphrodite/common/config.py

@@ -8,7 +8,7 @@ import torch
 from transformers import PretrainedConfig
 
 from aphrodite.transformers_utils.config import get_config
-from aphrodite.common.utils import (get_cpu_memory, is_hip,
+from aphrodite.common.utils import (get_cpu_memory, is_hip, is_neuron,
                                     get_nvcc_cuda_version)
 
 _GB = 1 << 30
@@ -43,6 +43,9 @@ class ModelConfig:
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id. If unspecified, will use the default
             version.
+        code_revision: The specific revision to use for the model code on
+            Hugging Face Hub. It can be a branch name, a tag name, or a 
+            commit id. If unspecified, will use the default version.
         tokenizer_revision: The specific tokenizer version to use. It can be a
             branch name, a tag name, or a commit id. If unspecified, will use
             the default version.
@@ -71,16 +74,18 @@ class ModelConfig:
         trust_remote_code: bool,
         download_dir: Optional[str],
         load_format: str,
-        dtype: str,
+        # dtype: str,
+        dtype: Union[str, torch.dtype],
         seed: int,
         revision: Optional[str] = None,
+        code_revision: Optional[str] = None,
         tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
         load_in_4bit: bool = False,
         load_in_8bit: bool = False,
         load_in_smooth: bool = False,
-        enforce_eager: bool = False,
+        enforce_eager: bool = True,
         max_context_len_to_capture: Optional[int] = None,
         max_log_probs: int = 10,
     ) -> None:
@@ -92,6 +97,7 @@ class ModelConfig:
         self.load_format = load_format
         self.seed = seed
         self.revision = revision
+        self.code_revision = code_revision
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.load_in_4bit = load_in_4bit
@@ -106,14 +112,18 @@ class ModelConfig:
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
             from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
-            model_path = snapshot_download(model_id=model,
-                                           cache_dir=download_dir,
-                                           revision=revision)
+            if not os.path.exists(model):
+                model_path = snapshot_download(model_id=model,
+                                               cache_dir=download_dir,
+                                               revision=revision)
+            else:
+                model_path = model
             self.model = model_path
             self.download_dir = model_path
             self.tokenizer = model_path
 
-        self.hf_config = get_config(self.model, trust_remote_code, revision)
+        self.hf_config = get_config(self.model, trust_remote_code, revision,
+                                    code_revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_config,
                                                      max_model_len)
@@ -177,6 +187,7 @@ class ModelConfig:
         # Parse quantization method from the HF model config, if available.
         hf_quant_config = getattr(self.hf_config, "quantization_config", None)
         if hf_quant_config is not None:
+
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()
             # If the GPTQ model is serialized in marlin format, use marlin.
             # If the GPTQ model is serialized in marlin format, use marlin.
             if (hf_quant_method == "gptq"
             if (hf_quant_method == "gptq"
@@ -375,7 +386,7 @@ class CacheConfig:
         gpu_memory_utilization: float,
         swap_space: int,
         cache_dtype: str,
-        cache_quant_params_path: Optional[str] = None,
+        # cache_quant_params_path: Optional[str] = None,
         sliding_window: Optional[int] = None,
         context_shift: bool = False,
     ) -> None:
@@ -384,7 +395,7 @@ class CacheConfig:
         self.swap_space_bytes = swap_space * _GB
         self.cache_dtype = cache_dtype
         self.sliding_window = sliding_window
-        self.cache_quant_params_path = cache_quant_params_path
+        # self.cache_quant_params_path = cache_quant_params_path
         self.context_shift = context_shift
         self._verify_args()
         self._verify_cache_dtype()
@@ -393,6 +404,11 @@ class CacheConfig:
         self.num_gpu_blocks = None
         self.num_cpu_blocks = None
 
+    def metrics_info(self):
+        # convert cache_config to dict(key: str, value: str) for prometheus
+        # metrics info
+        return {key: str(value) for key, value in self.__dict__.items()}
+
     def _verify_args(self) -> None:
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
@@ -400,25 +416,24 @@ class CacheConfig:
                 f"{self.gpu_memory_utilization}.")
                 f"{self.gpu_memory_utilization}.")
 
 
     def _verify_cache_dtype(self) -> None:
     def _verify_cache_dtype(self) -> None:
-        if self.cache_dtype in ["auto", "int8"]:
+        if self.cache_dtype == "auto":
+            # if self.cache_dtype in ["auto", "int8"]:
             pass
         elif self.cache_dtype == "fp8_e5m2":
-            nvcc_cuda_version = get_nvcc_cuda_version()
-            if nvcc_cuda_version < Version("11.8"):
-                raise ValueError(
-                    "FP8 is not supported when cuda version is lower than "
-                    "11.8. If you think you have the correct cuda version, "
-                    "please make sure you've properly exported CUDA_HOME.")
-            device_name = torch.cuda.get_device_name()
-            if "AMD" in device_name:
+            if is_hip():
                 raise NotImplementedError(
                     "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.")
+            nvcc_cuda_version = get_nvcc_cuda_version()
+            if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"):
+                raise ValueError(
+                    "FP8 is not supported when cuda version is lower than 11.8."
+                )
             logger.info(
                 "Using fp8_e5m2 data type to store kv cache. It reduces "
                 "the GPU memory footprint and boosts the performance. "
                 "But it may cause slight accuracy drop. "
                 "Currently we only support fp8 without scaling factors and "
-                "make e5m2 as a default format.")
+                "use e5m2 as a default format.")
         else:
             raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
 
@@ -450,8 +465,13 @@ class ParallelConfig:
         worker_use_ray: Whether to use Ray for model workers. Will be set to
             True if either pipeline_parallel_size or tensor_parallel_size is
             greater than 1.
+        max_parallel_loading_workers: Maximum number of multiple batches
+            when load model sequentially. To avoid RAM OOM when using tensor
+            parallel and large models.
         disable_custom_all_reduce: Disable the custom all-reduce kernel and
             fall back to NCCL.
+        ray_workers_use_nsight: Whether to profile Ray workers with nsight, see
+            https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
     """
     """
 
 
     def __init__(
     def __init__(
@@ -461,15 +481,26 @@ class ParallelConfig:
         worker_use_ray: bool,
         max_parallel_loading_workers: Optional[int] = None,
         disable_custom_all_reduce: bool = False,
+        ray_workers_use_nsight: bool = False,
     ) -> None:
         self.pipeline_parallel_size = pipeline_parallel_size
-        self.tensor_parallel_size = tensor_parallel_size
+        if is_neuron():
+            # For Neuron device support, here we assign TP=1 to avoid sharding
+            # within Aphrodite directly.
+            # Transformer-neuronx would take neuron_tp_degree attribute, and
+            # distribute the workload to multiple NeuronCores.
+            self.tensor_parallel_size = 1
+            self.neuron_tp_degree = tensor_parallel_size
+        else:
+            self.tensor_parallel_size = tensor_parallel_size
         self.worker_use_ray = worker_use_ray
         self.max_parallel_loading_workers = max_parallel_loading_workers
         self.disable_custom_all_reduce = disable_custom_all_reduce
+        self.ray_workers_use_nsight = ray_workers_use_nsight
 
-        self.world_size = pipeline_parallel_size * tensor_parallel_size
-        if self.world_size > 1:
+        self.world_size = pipeline_parallel_size * self.tensor_parallel_size
+        # Ray worker is not supported for Neuron backend.
+        if self.world_size > 1 and not is_neuron():
             self.worker_use_ray = True
         self._verify_args()
 
@@ -477,16 +508,29 @@ class ParallelConfig:
         if self.pipeline_parallel_size > 1:
             raise NotImplementedError(
                 "Pipeline parallelism is not supported yet.")
-        if is_hip():
+        if not self.disable_custom_all_reduce and self.world_size > 1:
+            if is_hip():
+                self.disable_custom_all_reduce = True
+                logger.info(
+                    "Disabled the custom all-reduce kernel because it is not "
+                    "supported on AMD GPUs.")
+            elif self.pipeline_parallel_size > 1:
+                self.disable_custom_all_reduce = True
+                logger.info(
+                    "Disabled the custom all-reduce kernel because it is not "
+                    "supported with pipeline parallelism.")
+        if self.ray_workers_use_nsight and not self.worker_use_ray:
+            raise ValueError("Unable to use nsight profiling unless workers "
+                             "run with Ray.")
+
+        # FIXME: Fix the stability issues and re-enable the custom
+        # all-reduce kernel.
+        if not self.disable_custom_all_reduce and self.world_size > 1:
             self.disable_custom_all_reduce = True
             logger.info(
-                "Disabled the custom all-reduce kernel because it is not "
-                "supported on AMD GPUs.")
-        elif self.pipeline_parallel_size > 1:
-            self.disable_custom_all_reduce = True
-            logger.info(
-                "Disabled the custom all-reduce kernel because it is not "
-                "supported with pipeline parallelism.")
+                "Custom all-reduce kernels are temporarily disabled due to "
+                "stability issues. We will re-enable them once the issues are "
+                "resolved.")
 
 
 class SchedulerConfig:
@@ -538,8 +582,29 @@ class SchedulerConfig:
 
 class DeviceConfig:
 
-    def __init__(self, device: str = "cuda") -> None:
-        self.device = torch.device(device)
+    def __init__(self, device: str = "auto") -> None:
+        if device == "auto":
+            # Automated device type detection
+            if torch.cuda.is_available():
+                self.device_type = "cuda"
+            elif is_neuron():
+                self.device_type = "neuron"
+            else:
+                raise RuntimeError("No supported device detected.")
+        else:
+            # Device type is assigned explicitly
+            self.device_type = device
+
+        # Some device types require processing inputs on CPU
+        if self.device_type in ["neuron"]:
+            self.device = torch.device("cpu")
+        else:
+            # Set device with device type
+            self.device = torch.device(self.device_type)
+
+    @property
+    def is_neuron(self):
+        return self.device_type == "neuron"
 
 
 @dataclass
@@ -571,7 +636,7 @@ class LoRAConfig:
         elif self.max_cpu_loras < self.max_loras:
             raise ValueError(
                 f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
-                f"max_num_seqs ({self.max_loras})")
+                f"max_loras ({self.max_loras})")
 
     def verify_with_model_config(self, model_config: ModelConfig):
         if self.lora_dtype in (None, "auto"):

+ 10 - 8
aphrodite/common/logger.py

@@ -1,9 +1,10 @@
 """
 """
-Internal logging utility. Adapted from
-https://github.com/theroyallab/tabbyAPI/blob/4cc0b59bdc94e6342b6d1d7acadbadc63c740ed9/common/logger.py
+Internal logging utility.
 """
 """
 
 
 import logging
 import logging
+import os
+
 from loguru import logger
 from rich.console import Console
 from rich.markup import escape
@@ -17,6 +18,7 @@ from rich.progress import (
 )
 
 RICH_CONSOLE = Console()
+LOG_LEVEL = os.getenv("APHRODITE_LOG_LEVEL", "INFO").upper()
 
 
 def unwrap(wrapped, default=None):
@@ -60,9 +62,9 @@ def _log_formatter(record: dict):
     message = unwrap(record.get("message"), "")
     message = unwrap(record.get("message"), "")
 
 
     # Replace once loguru allows for turning off str.format
     # Replace once loguru allows for turning off str.format
-    message = message.replace("{{", "{{").replace("}}", "}}")
-    # Manually escape < and > characters
-    message = message.replace("<", "\\<").replace(">", "\\>")
+    message = message.replace("{", "{{").replace("}", "}}").replace("<", "\<")
+
+    # Escape markup tags from Rich
     message = escape(message)
     lines = message.splitlines()
 
@@ -86,7 +88,7 @@ class UvicornLoggingHandler(logging.Handler):
 
 
 # Uvicorn config for logging. Passed into run when creating all loggers in
-#server
+# server
 UVICORN_LOG_CONFIG = {
     "version": 1,
     "disable_existing_loggers": False,
@@ -99,7 +101,7 @@ UVICORN_LOG_CONFIG = {
     "root": {
     "root": {
         "handlers": ["uvicorn"],
         "handlers": ["uvicorn"],
         "propagate": False,
         "propagate": False,
-        "level": "INFO"
+        "level": LOG_LEVEL
     },
 }
 
@@ -111,7 +113,7 @@ def setup_logger():
 
     logger.add(
         RICH_CONSOLE.print,
-        level="INFO",
+        level=LOG_LEVEL,
         format=_log_formatter,
         colorize=True,
     )

+ 47 - 30
aphrodite/common/outputs.py

@@ -1,7 +1,13 @@
 from typing import List, Optional
+import time
 
-from aphrodite.common.sequence import (PromptLogprobs, SampleLogprobs,
-                                       SequenceGroup, SequenceStatus)
+from aphrodite.common.sequence import (
+    PromptLogprobs,
+    SampleLogprobs,
+    SequenceGroup,
+    SequenceStatus,
+    RequestMetrics,
+)
 from aphrodite.lora.request import LoRARequest
 
 
@@ -60,6 +66,7 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
     """
 
@@ -71,6 +78,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        metrics: Optional[RequestMetrics] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id
@@ -79,6 +87,7 @@ class RequestOutput:
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.metrics = metrics
         self.lora_request = lora_request
 
     @classmethod
@@ -86,43 +95,50 @@ class RequestOutput:
         # Get the top-n sequences.
         n = seq_group.sampling_params.n
         seqs = seq_group.get_seqs()
-        if seq_group.sampling_params.use_beam_search:
-            sorting_key = lambda seq: seq.get_beam_search_score(
-                seq_group.sampling_params.length_penalty)
+        if n == 1:
+            top_n_seqs = seqs
         else:
-            # ruff: noqa: E731
-            sorting_key = lambda seq: seq.get_cumulative_logprob()
-        sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
-        top_n_seqs = sorted_seqs[:n]
+            if seq_group.sampling_params.use_beam_search:
+                sorting_key = lambda seq: seq.get_beam_search_score(
+                    seq_group.sampling_params.length_penalty)
+            else:
+                sorting_key = lambda seq: seq.get_cumulative_logprob()
+            sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
+            top_n_seqs = sorted_seqs[:n]
 
         # Create the outputs.
-        outputs: List[CompletionOutput] = []
-        for seq in top_n_seqs:
-            logprobs = seq.output_logprobs
-            if seq_group.sampling_params.logprobs is None:
-                # NOTE: We need to take care of this case because the sequence
-                # always has the logprobs of the sampled tokens even if the
-                # logprobs are not requested.
-                logprobs = None
-            finshed_reason = SequenceStatus.get_finished_reason(seq.status)
-            output = CompletionOutput(seqs.index(seq), seq.output_text,
-                                      seq.get_output_token_ids(),
-                                      seq.get_cumulative_logprob(), logprobs,
-                                      finshed_reason)
-            outputs.append(output)
+        # NOTE: We need omit logprobs here explicitly because the sequence
+        # always has the logprobs of the sampled tokens even if the
+        # logprobs are not requested.
+        include_logprobs = seq_group.sampling_params.logprobs
+        outputs = [
+            CompletionOutput(
+                seqs.index(seq),
+                seq.output_text,
+                seq.get_output_token_ids(),
+                seq.get_cumulative_logprob(),
+                seq.output_logprobs if include_logprobs else None,
+                SequenceStatus.get_finished_reason(seq.status),
+            ) for seq in top_n_seqs
+        ]
 
         # Every sequence in the sequence group should have the same prompt.
         prompt = seq_group.prompt
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
-        return cls(seq_group.request_id,
-                   prompt,
-                   prompt_token_ids,
-                   prompt_logprobs,
-                   outputs,
-                   finished,
-                   lora_request=seq_group.lora_request)
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
+        return cls(
+            seq_group.request_id,
+            prompt,
+            prompt_token_ids,
+            prompt_logprobs,
+            outputs,
+            finished,
+            seq_group.metrics,
+            lora_request=seq_group.lora_request,
+        )
 
     def __repr__(self) -> str:
         return (f"RequestOutput(request_id={self.request_id}, "
@@ -131,4 +147,5 @@ class RequestOutput:
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
                 f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
                 f"lora_request={self.lora_request})")
                 f"lora_request={self.lora_request})")

+ 13 - 0
aphrodite/common/sampling_params.py

@@ -1,4 +1,5 @@
 """Sampling parameters for text generation."""
 """Sampling parameters for text generation."""
+import copy
 from enum import IntEnum
 from functools import cached_property
 from typing import Callable, List, Optional, Union
@@ -375,6 +376,18 @@ class SamplingParams:
             return SamplingType.RANDOM_SEED
         return SamplingType.RANDOM
 
+    def clone(self) -> "SamplingParams":
+        """Deep copy excluding LogitsProcessor objects.
+        LogitsProcessor objects are excluded because they may contain an
+        arbitrary, nontrivial amount of data.
+        """
+
+        logit_processor_refs = None if self.logits_processors is None else {
+            id(lp): lp
+            for lp in self.logits_processors
+        }
+        return copy.deepcopy(self, memo=logit_processor_refs)
+
     def __repr__(self) -> str:
         repr_str = "SamplingParams("
         for param, default_value in self.default_values.items():

+ 71 - 21
aphrodite/common/sequence.py

@@ -2,16 +2,21 @@
 import copy
 import enum
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union, TYPE_CHECKING
 
 from aphrodite.common.block import LogicalTokenBlock
 from aphrodite.common.sampling_params import SamplingParams
 from aphrodite.lora.request import LoRARequest
 
+if TYPE_CHECKING:
+    import torch
+    from aphrodite.spec_decode.metrics import SpecDecodeWorkerMetrics
+
 
 @dataclass
 class Logprob:
     """Infos for supporting OpenAI compatible logprobs."""
+
     logprob: float
     decoded_token: Optional[str] = None
 
@@ -22,6 +27,7 @@ SampleLogprobs = List[Dict[int, Logprob]]
 
 class SequenceStatus(enum.Enum):
     """Status of a sequence."""
+
     WAITING = enum.auto()
     RUNNING = enum.auto()
     SWAPPED = enum.auto()
@@ -68,6 +74,7 @@ class RequestMetrics:
         time_in_queue: The time the request spent in the queue.
         finished_time: The time when the request was finished.
     """
+
     arrival_time: float
     last_token_time: float
     first_scheduled_time: Optional[float]
@@ -81,6 +88,8 @@ class SequenceData:
 
     Args:
         prompt_token_ids: The token IDs of the prompt.
+        output_token_ids: The token IDs of the output. Set to an empty list if
+            None.
 
     Attributes:
         prompt_token_ids: The token IDs of the prompt.
@@ -91,9 +100,13 @@ class SequenceData:
     def __init__(
         self,
         prompt_token_ids: List[int],
+        output_token_ids: Optional[List[int]] = None,
     ) -> None:
+        if output_token_ids is None:
+            output_token_ids = []
+
         self.prompt_token_ids = prompt_token_ids
-        self.output_token_ids: List[int] = []
+        self.output_token_ids = output_token_ids
         self.cumulative_logprob = 0.0
 
     def append_token_id(self, token_id: int, logprob: float) -> None:
@@ -117,6 +130,12 @@ class SequenceData:
             return self.prompt_token_ids[-1]
         return self.output_token_ids[-1]
 
+    def get_prompt_token_ids(self) -> int:
+        return self.prompt_token_ids
+
+    def get_output_token_ids(self) -> int:
+        return self.output_token_ids
+
     def __repr__(self) -> str:
         return (f"SequenceData("
                 f"prompt_token_ids={self.prompt_token_ids}, "
@@ -142,11 +161,13 @@ class Sequence:
         prompt: str,
         prompt_token_ids: List[int],
         block_size: int,
+        eos_token_id: Optional[int] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.seq_id = seq_id
         self.prompt = prompt
         self.block_size = block_size
+        self.eos_token_id = eos_token_id
         self.lora_request = lora_request
 
         self.data = SequenceData(prompt_token_ids)
@@ -164,7 +185,6 @@ class Sequence:
         # Input + output tokens
         self.tokens: Optional[List[str]] = None
         self.persistent_data = {}
-        self.persistent_data = {}
 
     @property
     def lora_int_id(self) -> int:
@@ -235,10 +255,12 @@ class Sequence:
     def get_cumulative_logprob(self) -> float:
         return self.data.cumulative_logprob
 
-    def get_beam_search_score(self,
-                              length_penalty: float = 0.0,
-                              seq_len: Optional[int] = None,
-                              eos_token_id: Optional[int] = None) -> float:
+    def get_beam_search_score(
+        self,
+        length_penalty: float = 1.0,
+        seq_len: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+    ) -> float:
         """Calculate the beam search score with length penalty.
         """Calculate the beam search score with length penalty.
 
 
         Adapted from
         Adapted from
@@ -298,11 +320,13 @@ class SequenceGroup:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.metrics = RequestMetrics(arrival_time=arrival_time,
-                                      last_token_time=arrival_time,
-                                      first_scheduled_time=None,
-                                      first_token_time=None,
-                                      time_in_queue=None)
+        self.metrics = RequestMetrics(
+            arrival_time=arrival_time,
+            last_token_time=arrival_time,
+            first_scheduled_time=None,
+            first_token_time=None,
+            time_in_queue=None,
+        )
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -366,12 +390,9 @@ class SequenceGroup:
         self,
         status: Optional[SequenceStatus] = None,
     ) -> List[Sequence]:
-        if status is None:
-            return list(self.seqs_dict.values())
-        else:
-            return [
-                seq for seq in self.seqs_dict.values() if seq.status == status
-            ]
+        return (list(self.seqs_dict.values()) if status is None else [
+            seq for seq in self.seqs_dict.values() if seq.status == status
+        ])
 
     def get_unfinished_seqs(self) -> List[Sequence]:
         return [
@@ -517,6 +538,35 @@ class SequenceGroupOutput:
                 and self.prompt_logprobs == other.prompt_logprobs)
 
 
-# For each sequence group, we generate a list of SequenceOutput object,
-# each of which contains one possible candidate for the next token.
-SamplerOutput = List[SequenceGroupOutput]
+@dataclass
+class SamplerOutput:
+    """For each sequence group, we generate a list of SequenceOutput object,
+    each of which contains one possible candidate for the next token.
+
+    This datastructure implements methods so it can be used like a list, but
+    also has optional fields for device tensors.
+    """
+
+    outputs: List[SequenceGroupOutput]
+
+    # On-device tensor containing probabilities of each token.
+    sampled_token_probs: Optional["torch.Tensor"] = None
+
+    # On-device tensor containing the sampled token ids.
+    sampled_token_ids: Optional["torch.Tensor"] = None
+
+    # Spec decode metrics populated by workers.
+    spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+
+    def __getitem__(self, idx: int):
+        return self.outputs[idx]
+
+    def __setitem__(self, idx: int, value):
+        self.outputs[idx] = value
+
+    def __len__(self):
+        return len(self.outputs)
+
+    def __eq__(self, other: object):
+        return (isinstance(other, self.__class__)
+                and self.outputs == other.outputs)
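
`SamplerOutput` keeps the list-like behaviour of the old `List[SequenceGroupOutput]` alias while adding the optional device tensors used by the speculative-decoding workers. A small sketch; `step_outputs` stands in for whatever the sampler produced:

```python
from typing import List

from aphrodite.common.sequence import SamplerOutput, SequenceGroupOutput

step_outputs: List[SequenceGroupOutput] = []  # normally filled by the sampler

sampler_output = SamplerOutput(outputs=step_outputs)
assert len(sampler_output) == len(step_outputs)   # __len__ delegates to .outputs
for seq_group_output in sampler_output:           # iteration via __getitem__
    print(seq_group_output)

# The extra fields stay None unless a worker populates them.
assert sampler_output.sampled_token_ids is None
```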

+ 67 - 27
aphrodite/common/utils.py

@@ -5,16 +5,21 @@ import subprocess
 import uuid
 import gc
 from platform import uname
-from loguru import logger
+from typing import List, Tuple, Union
+from packaging.version import parse, Version
 
 import psutil
 import torch
 import asyncio
 from functools import partial
-from typing import (Any, Awaitable, Callable, Hashable, Optional, TypeVar,
-                    List, Tuple, Union)
+from typing import (
+    Awaitable,
+    Callable,
+    TypeVar,
+)
 from collections import OrderedDict
-from packaging.version import parse, Version
+from typing import Any, Hashable, Optional
+from loguru import logger
 
 T = TypeVar("T")
 
@@ -23,7 +28,7 @@ STR_DTYPE_TO_TORCH_DTYPE = {
     "bfloat16": torch.bfloat16,
     "bfloat16": torch.bfloat16,
     "float": torch.float,
     "float": torch.float,
     "fp8_e5m2": torch.uint8,
     "fp8_e5m2": torch.uint8,
-    "int8": torch.int8,
+    # "int8": torch.int8,
 }
 
 
@@ -113,12 +118,24 @@ def is_hip() -> bool:
     return torch.version.hip is not None
 
 
+def is_neuron() -> bool:
+    try:
+        import transformers_neuronx
+    except ImportError:
+        transformers_neuronx = None
+    return transformers_neuronx is not None
+
+
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
+    # NOTE: This import statement should be executed lazily since
+    # the Neuron-X backend does not have the `cuda_utils` module.
     from aphrodite._C import cuda_utils
-    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
+
     max_shared_mem = (
         cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
+    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
+    # will fail
     assert max_shared_mem > 0, "max_shared_mem can not be zero"
     assert max_shared_mem > 0, "max_shared_mem can not be zero"
     return int(max_shared_mem)
     return int(max_shared_mem)
 
 
@@ -139,6 +156,7 @@ def in_wsl() -> bool:
 
 def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
     """Take a blocking function, and run it on in an executor thread.
+
     This function prevents the blocking function from blocking the
     asyncio event loop.
     The code in this function needs to be thread safe.
@@ -153,15 +171,33 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
 
 
 def get_ip() -> str:
+    # try ipv4
     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
-    return s.getsockname()[0]
+    try:
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except OSError:
+        # try ipv6
+        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+        s.connect(("dns.google", 80))
+        return s.getsockname()[0]
+
+
+def get_distributed_init_method(ip: str, port: int) -> str:
+    return f"tcp://{ip}:{port}"
 
 
 def get_open_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(("", 0))
-        return s.getsockname()[1]
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
 
 
 def set_cuda_visible_devices(device_ids: List[int]) -> None:
@@ -170,18 +206,22 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None:
 
 def get_nvcc_cuda_version() -> Optional[Version]:
     cuda_home = os.environ.get('CUDA_HOME')
-    nvcc_path = os.path.join(cuda_home, 'bin', 'nvcc') if cuda_home else 'nvcc'
-
-    try:
-        nvcc_output = subprocess.check_output([nvcc_path, "-V"],
-                                              universal_newlines=True)
-        output = nvcc_output.split()
-        release_idx = output.index("release") + 1
-        nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-        return nvcc_cuda_version
-    except (FileNotFoundError, subprocess.CalledProcessError):
-        logger.warning("nvcc not found. Skipping CUDA version check!")
-        return None
+    if not cuda_home:
+        cuda_home = '/usr/local/cuda'
+        if os.path.isfile(cuda_home + '/bin/nvcc'):
+            logger.info(
+                f'CUDA_HOME is not found in the environment. Using {cuda_home} '
+                'as CUDA_HOME.')
+        else:
+            logger.warning(
+                f'Not found nvcc in {cuda_home}. Skip cuda version check!')
+            return None
+    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
+                                          universal_newlines=True)
+    output = nvcc_output.split()
+    release_idx = output.index("release") + 1
+    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
+    return nvcc_cuda_version
 
 
 def _generate_random_fp8_e5m2(
@@ -248,8 +288,8 @@ def create_kv_caches_with_random(
                                 device=device)
         if cache_dtype == 'fp8_e5m2':
             _generate_random_fp8_e5m2(key_cache, -scale, scale)
-        elif cache_dtype == 'int8':
-            torch.randint(-128, 127, key_cache.size(), out=key_cache)
+        # elif cache_dtype == 'int8':
+        #     torch.randint(-128, 127, key_cache.size(), out=key_cache)
         elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
             key_cache.uniform_(-scale, scale)
         else:
@@ -265,8 +305,8 @@ def create_kv_caches_with_random(
                                   device=device)
         if cache_dtype == 'fp8_e5m2':
             _generate_random_fp8_e5m2(value_cache, -scale, scale)
-        elif cache_dtype == 'int8':
-            torch.randint(-128, 127, value_cache.size(), out=value_cache)
+        # elif cache_dtype == 'int8':
+        #     torch.randint(-128, 127, value_cache.size(), out=value_cache)
         elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
             value_cache.uniform_(-scale, scale)
         else:
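
The new networking helpers compose into the init-method string that gets handed to the workers; `is_neuron()` is the same probe the config code now uses. A quick sketch:

```python
from aphrodite.common.utils import (get_distributed_init_method, get_ip,
                                    get_open_port, is_neuron)

# IPv4 first, IPv6 fallback, then a free port on this host.
init_method = get_distributed_init_method(get_ip(), get_open_port())
print(init_method)   # e.g. "tcp://192.168.1.23:51515"
print(is_neuron())   # True only when transformers-neuronx is importable
```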

+ 1 - 1
aphrodite/endpoints/llm.py

@@ -78,7 +78,7 @@ class LLM:
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
         swap_space: int = 4,
-        enforce_eager: bool = False,
+        enforce_eager: bool = True,
         max_context_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         **kwargs,
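
Because `enforce_eager` now defaults to `True`, CUDA graph capture is opt-in again. A minimal sketch (the model name is illustrative):

```python
from aphrodite import LLM

llm = LLM(
    model="mistralai/Mistral-7B-v0.1",
    enforce_eager=False,             # opt back in to CUDA graph capture
    max_context_len_to_capture=8192,
)
```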

+ 104 - 95
aphrodite/endpoints/openai/api_server.py

@@ -191,6 +191,7 @@ async def validation_exception_handler(_, exc):
 @app.get("/health")
 @app.get("/health")
 async def health() -> Response:
 async def health() -> Response:
     """Health check."""
     """Health check."""
+    await openai_serving_chat.engine.check_health()
     return Response(status_code=200)
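
`/health` now awaits `engine.check_health()` instead of returning 200 unconditionally, so a broken engine surfaces as a failed probe. A sketch; host and port are whatever the server was started with:

```python
import requests

resp = requests.get("http://localhost:2242/health", timeout=5)
resp.raise_for_status()  # raises once the engine stops reporting healthy
```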
 
 
@@ -526,104 +527,112 @@ async def get_kobold_lite_ui():
 # ============ KoboldAI API ============ #
 
 if __name__ == "__main__":
-    args = parse_args()
-
-    if args.launch_kobold_api:
-        logger.warning("Launching Kobold API server in addition to OpenAI. "
-                       "Keep in mind that the Kobold API routes are NOT "
-                       "protected via the API key.")
-        app.include_router(kai_api, prefix="/api/v1")
-        app.include_router(kai_api,
-                           prefix="/api/latest",
-                           include_in_schema=False)
-        app.include_router(extra_api, prefix="/api/extra")
-
-    app.add_middleware(
-        CORSMiddleware,
-        allow_origins=args.allowed_origins,
-        allow_credentials=args.allow_credentials,
-        allow_methods=args.allowed_methods,
-        allow_headers=args.allowed_headers,
-    )
+    try:
+        args = parse_args()
+
+        if args.launch_kobold_api:
+            logger.warning(
+                "Launching Kobold API server in addition to OpenAI. "
+                "Keep in mind that the Kobold API routes are NOT "
+                "protected via the API key.")
+            app.include_router(kai_api, prefix="/api/v1")
+            app.include_router(kai_api,
+                               prefix="/api/latest",
+                               include_in_schema=False)
+            app.include_router(extra_api, prefix="/api/extra")
+
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=args.allowed_origins,
+            allow_credentials=args.allow_credentials,
+            allow_methods=args.allowed_methods,
+            allow_headers=args.allowed_headers,
+        )
+
+        if token := os.environ.get("APHRODITE_API_KEY") or args.api_keys:
+            admin_key = os.environ.get("APHRODITE_ADMIN_KEY") or args.admin_key
+
+            if admin_key is None:
+                logger.warning("Admin key not provided. Admin operations will "
+                               "be disabled.")
+
+            @app.middleware("http")
+            async def authentication(request: Request, call_next):
+                excluded_paths = ["/api"]
+                if any(
+                        request.url.path.startswith(path)
+                        for path in excluded_paths):
+                    return await call_next(request)
+                if not request.url.path.startswith("/v1"):
+                    return await call_next(request)
 
-    if token := os.environ.get("APHRODITE_API_KEY") or args.api_keys:
-        admin_key = os.environ.get("APHRODITE_ADMIN_KEY") or args.admin_key
+                auth_header = request.headers.get("Authorization")
+                api_key_header = request.headers.get("x-api-key")
 
-        if admin_key is None:
-            logger.warning("Admin key not provided. Admin operations will "
-                           "be disabled.")
+                if request.url.path.startswith("/v1/lora"):
+                    if admin_key is not None and api_key_header == admin_key:
+                        return await call_next(request)
+                    return JSONResponse(content={"error": "Unauthorized"},
+                                        status_code=401)
 
 
-        @app.middleware("http")
-        async def authentication(request: Request, call_next):
-            excluded_paths = ["/api"]
-            if any(
-                    request.url.path.startswith(path)
-                    for path in excluded_paths):
-                return await call_next(request)
-            if not request.url.path.startswith("/v1"):
+                if auth_header != "Bearer " + token and api_key_header != token:
+                    return JSONResponse(content={"error": "Unauthorized"},
+                                        status_code=401)
                 return await call_next(request)
                 return await call_next(request)
 
 
-            auth_header = request.headers.get("Authorization")
-            api_key_header = request.headers.get("x-api-key")
-
-            if request.url.path.startswith("/v1/lora"):
-                if admin_key is not None and api_key_header == admin_key:
-                    return await call_next(request)
-                return JSONResponse(content={"error": "Unauthorized"},
-                                    status_code=401)
-
-            if auth_header != "Bearer " + token and api_key_header != token:
-                return JSONResponse(content={"error": "Unauthorized"},
-                                    status_code=401)
-            return await call_next(request)
-
-    for middleware in args.middleware:
-        module_path, object_name = middleware.rsplit(".", 1)
-        imported = getattr(importlib.import_module(module_path), object_name)
-        if inspect.isclass(imported):
-            app.add_middleware(imported)
-        elif inspect.iscoroutinefunction(imported):
-            app.middleware("http")(imported)
+        for middleware in args.middleware:
+            module_path, object_name = middleware.rsplit(".", 1)
+            imported = getattr(importlib.import_module(module_path),
+                               object_name)
+            if inspect.isclass(imported):
+                app.add_middleware(imported)
+            elif inspect.iscoroutinefunction(imported):
+                app.middleware("http")(imported)
+            else:
+                raise ValueError(f"Invalid middleware {middleware}. Must be a "
+                                 "function or a class.")
+
+        logger.debug(f"args: {args}")
+
+        if args.served_model_name is not None:
+            served_model = args.served_model_name
         else:
         else:
-            raise ValueError(f"Invalid middleware {middleware}. Must be a "
-                             "function or a class.")
-
-    logger.debug(f"args: {args}")
-
-    if args.served_model_name is not None:
-        served_model = args.served_model_name
-    else:
-        served_model = args.model
-
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncAphrodite.from_engine_args(engine_args)
-    tokenizer = get_tokenizer(
-        engine_args.tokenizer,
-        tokenizer_mode=engine_args.tokenizer_mode,
-        trust_remote_code=engine_args.trust_remote_code,
-    )
-
-    chat_template = args.chat_template
-    if chat_template is None and tokenizer.chat_template is not None:
-        chat_template = tokenizer.chat_template
-
-    openai_serving_chat = OpenAIServingChat(engine, served_model,
-                                            args.response_role,
-                                            args.lora_modules,
-                                            args.chat_template)
-    openai_serving_completion = OpenAIServingCompletion(
-        engine, served_model, args.lora_modules)
-    engine_model_config = asyncio.run(engine.get_model_config())
-
-    if args.launch_kobold_api:
-        _set_badwords(tokenizer, engine_model_config.hf_config)
-
-    app.root_path = args.root_path
-    uvicorn.run(app,
-                host=args.host,
-                port=args.port,
-                log_level="info",
-                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
-                ssl_keyfile=args.ssl_keyfile,
-                ssl_certfile=args.ssl_certfile,
-                log_config=UVICORN_LOG_CONFIG)
+            served_model = args.model
+
+        engine_args = AsyncEngineArgs.from_cli_args(args)
+        engine = AsyncAphrodite.from_engine_args(engine_args)
+        tokenizer = get_tokenizer(
+            engine_args.tokenizer,
+            tokenizer_mode=engine_args.tokenizer_mode,
+            trust_remote_code=engine_args.trust_remote_code,
+        )
+
+        chat_template = args.chat_template
+        if chat_template is None and tokenizer.chat_template is not None:
+            chat_template = tokenizer.chat_template
+
+        openai_serving_chat = OpenAIServingChat(engine, served_model,
+                                                args.response_role,
+                                                args.lora_modules,
+                                                args.chat_template)
+        openai_serving_completion = OpenAIServingCompletion(
+            engine, served_model, args.lora_modules)
+        engine_model_config = asyncio.run(engine.get_model_config())
+
+        if args.launch_kobold_api:
+            _set_badwords(tokenizer, engine_model_config.hf_config)
+
+        app.root_path = args.root_path
+        uvicorn.run(app,
+                    host=args.host,
+                    port=args.port,
+                    log_level="info",
+                    timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+                    ssl_keyfile=args.ssl_keyfile,
+                    ssl_certfile=args.ssl_certfile,
+                    log_config=UVICORN_LOG_CONFIG)
+    except KeyboardInterrupt:
+        logger.info("API server stopped by user. Exiting gracefully.")
+    except asyncio.exceptions.CancelledError:
+        logger.info("API server stopped due to a cancelled request. "
+                    "Exiting gracefully.")

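Editor's note: after this restructuring, /v1 routes accept either an Authorization bearer token or an x-api-key header, /api (Kobold) routes bypass the check entirely, and /v1/lora additionally requires the admin key. A minimal client sketch, assuming the server runs on localhost:2242 with --api-keys sk-example (hypothetical key) and exposes the usual /v1/models route:

# Editorial example, not part of the diff. Both header styles below are
# accepted by the authentication middleware for /v1 routes.
import requests

BASE = "http://localhost:2242"   # assumed host/port
KEY = "sk-example"               # hypothetical API key

r1 = requests.get(f"{BASE}/v1/models",
                  headers={"Authorization": f"Bearer {KEY}"})
r2 = requests.get(f"{BASE}/v1/models",
                  headers={"x-api-key": KEY})
print(r1.status_code, r2.status_code)   # both should be 200 with a valid key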
+ 275 - 97
aphrodite/engine/aphrodite_engine.py

@@ -2,29 +2,61 @@ import copy
 from collections import defaultdict
 from collections import defaultdict
 import os
 import os
 import time
 import time
-from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple,
-                    Union)
+import pickle
+import importlib
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 from loguru import logger
 from loguru import logger
 
 
 import aphrodite
 import aphrodite
 from aphrodite.lora.request import LoRARequest
 from aphrodite.lora.request import LoRARequest
-from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
-                                     SchedulerConfig, LoRAConfig, DeviceConfig)
+from aphrodite.common.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    LoRAConfig,
+)
 from aphrodite.processing.scheduler import Scheduler, SchedulerOutputs
 from aphrodite.processing.scheduler import Scheduler, SchedulerOutputs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.engine.args_tools import EngineArgs
 from aphrodite.engine.metrics import StatLogger, Stats
 from aphrodite.engine.metrics import StatLogger, Stats
-from aphrodite.engine.ray_tools import (RayWorkerAphrodite, initialize_cluster,
-                                        ray)
-from aphrodite.common.logger import setup_logger
+from aphrodite.engine.ray_tools import (
+    RayWorkerAphrodite,
+    initialize_cluster,
+    ray,
+)
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import SamplingParams
 from aphrodite.common.sampling_params import SamplingParams
-from aphrodite.common.sequence import (SamplerOutput, Sequence, SequenceGroup,
-                                       SequenceGroupOutput, SequenceOutput,
-                                       SequenceStatus, Logprob)
-from aphrodite.transformers_utils.tokenizer import (detokenize_incrementally,
-                                                    TokenizerGroup)
-from aphrodite.common.utils import (Counter, set_cuda_visible_devices, get_ip,
-                                    get_open_port)
+from aphrodite.common.sequence import (
+    Logprob,
+    SamplerOutput,
+    Sequence,
+    SequenceGroup,
+    SequenceGroupOutput,
+    SequenceOutput,
+    SequenceStatus,
+)
+from aphrodite.transformers_utils.tokenizer import (
+    detokenize_incrementally,
+    TokenizerGroup,
+)
+from aphrodite.common.utils import (
+    Counter,
+    set_cuda_visible_devices,
+    get_ip,
+    get_open_port,
+    get_distributed_init_method,
+)
+from aphrodite.common.logger import setup_logger
 
 
 if ray:
 if ray:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -34,6 +66,17 @@ if TYPE_CHECKING:
 
 
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 _LOCAL_LOGGING_INTERVAL_SEC = 5
 
 
+# A map from the device type (in device config) to its worker module.
+DEVICE_TO_WORKER_MODULE_MAP = {
+    "cuda": "aphrodite.task_handler.worker",
+    "neuron": "aphrodite.task_handler.neuron_worker",
+}
+
+# If the env var is set, it uses Ray's compiled DAG API
+# which optimizes the control plane overhead.
+# Run APHRODITE with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
+USE_RAY_COMPILED_DAG = bool(os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", 0))
+
 
 
 class AphroditeEngine:
 class AphroditeEngine:
     """An LLM engine that receives requests and generates texts.
     """An LLM engine that receives requests and generates texts.
@@ -88,7 +131,7 @@ class AphroditeEngine:
             f"Context Length = {model_config.max_model_len}\n"
             f"Context Length = {model_config.max_model_len}\n"
             f"Enforce Eager Mode = {model_config.enforce_eager}\n"
             f"Enforce Eager Mode = {model_config.enforce_eager}\n"
             f"KV Cache Data Type = {cache_config.cache_dtype}\n"
             f"KV Cache Data Type = {cache_config.cache_dtype}\n"
-            f"KV Cache Params Path = {cache_config.cache_quant_params_path}\n"
+            # f"KV Cache Params Path = {cache_config.cache_quant_params_path}\n"
             f"Device = {device_config.device}")
             f"Device = {device_config.device}")
         # TODO: Print more configs in debug mode.
         # TODO: Print more configs in debug mode.
 
 
@@ -110,7 +153,20 @@ class AphroditeEngine:
             ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
             ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
             if ray_usage != "1":
             if ray_usage != "1":
                 os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
                 os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-            self._init_workers_ray(placement_group)
+            # Pass additional arguments to initialize the worker
+            additional_ray_args = {}
+            if self.parallel_config.ray_workers_use_nsight:
+                logger.info("Configuring Ray workers to use nsight.")
+                additional_ray_args = {
+                    "runtime_env": {
+                        "nsight": {
+                            "t": "cuda,cudnn,cublas",
+                            "o": "'worker_process_%p'",
+                            "cuda-graph-trace": "node",
+                        }
+                    }
+                }
+            self._init_workers_ray(placement_group, **additional_ray_args)
         else:
         else:
             self._init_workers()
             self._init_workers()
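Editor's note: the nsight runtime_env above is only attached when the parallel config asks for it; this patch exposes that switch as the --ray-workers-use-nsight CLI flag further down. A sketch of setting it programmatically, assuming AphroditeEngine offers a from_engine_args classmethod mirroring AsyncAphrodite's and that Nsight Systems (nsys) is available on the Ray worker nodes:

# Editorial sketch, not part of the diff.
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.engine.aphrodite_engine import AphroditeEngine

args = EngineArgs(
    model="EleutherAI/pythia-70m-deduped",
    tensor_parallel_size=2,        # more than one GPU implies Ray workers
    worker_use_ray=True,
    ray_workers_use_nsight=True,   # new field introduced by this patch
)
engine = AphroditeEngine.from_engine_args(args)  # assumed constructor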
 
 
@@ -124,22 +180,40 @@ class AphroditeEngine:
         if self.log_stats:
         if self.log_stats:
             self.stat_logger = StatLogger(
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                labels=dict(model_name=model_config.model))
+                labels=dict(model_name=model_config.model),
+            )
+            self.stat_logger.info("cache_config", self.cache_config)
+
+        self.forward_dag = None
+        if USE_RAY_COMPILED_DAG:
+            self.forward_dag = self._compiled_ray_dag()
+
+    def __reduce__(self):
+        # This is to ensure that the AphroditeEngine is not referenced in
+        # the closure used to initialize Ray worker actors
+        raise RuntimeError("AphroditeEngine should not be pickled!")
 
 
     def get_tokenizer_for_seq(self, sequence: Sequence):
     def get_tokenizer_for_seq(self, sequence: Sequence):
         return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
         return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
 
 
+    def _dispatch_worker(self):
+        worker_module = DEVICE_TO_WORKER_MODULE_MAP[
+            self.device_config.device_type]
+        imported_worker = importlib.import_module(worker_module)
+        Worker = imported_worker.Worker
+        return Worker
+
     def _init_workers(self):
     def _init_workers(self):
         # Lazy import the Worker to avoid importing torch.cuda/xformers
         # Lazy import the Worker to avoid importing torch.cuda/xformers
         # before CUDA_VISIBLE_DEVICES is set in the Worker
         # before CUDA_VISIBLE_DEVICES is set in the Worker
-        # pylint: disable=import-outside-toplevel
-        from aphrodite.task_handler.worker import Worker
+        Worker = self._dispatch_worker()
 
 
-        assert self.parallel_config.world_size == 1, (
-            "Ray is required if parallel_config.world_size > 1.")
+        assert (self.parallel_config.world_size == 1
+                ), "Ray is required if parallel_config.world_size > 1."
 
 
         self.workers: List[Worker] = []
         self.workers: List[Worker] = []
-        distributed_init_method = f"tcp://{get_ip()}:{get_open_port()}"
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
         self.driver_worker = Worker(
         self.driver_worker = Worker(
             self.model_config,
             self.model_config,
             self.parallel_config,
             self.parallel_config,
@@ -150,7 +224,7 @@ class AphroditeEngine:
             distributed_init_method=distributed_init_method,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
             lora_config=self.lora_config,
             kv_cache_dtype=self.cache_config.cache_dtype,
             kv_cache_dtype=self.cache_config.cache_dtype,
-            kv_quant_params_path=(self.cache_config.cache_quant_params_path),
+            # kv_quant_params_path=(self.cache_config.cache_quant_params_path),
             is_driver_worker=True,
             is_driver_worker=True,
         )
         )
         self._run_workers("init_model")
         self._run_workers("init_model")
@@ -163,7 +237,8 @@ class AphroditeEngine:
             max_input_length=None,
             max_input_length=None,
             tokenizer_mode=self.model_config.tokenizer_mode,
             tokenizer_mode=self.model_config.tokenizer_mode,
             trust_remote_code=self.model_config.trust_remote_code,
             trust_remote_code=self.model_config.trust_remote_code,
-            revision=self.model_config.tokenizer_revision)
+            revision=self.model_config.tokenizer_revision,
+        )
         init_kwargs.update(tokenizer_init_kwargs)
         init_kwargs.update(tokenizer_init_kwargs)
         self.tokenizer: TokenizerGroup = TokenizerGroup(
         self.tokenizer: TokenizerGroup = TokenizerGroup(
             self.model_config.tokenizer, **init_kwargs)
             self.model_config.tokenizer, **init_kwargs)
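Editor's note: _dispatch_worker replaces the hard-coded CUDA worker import, so a Neuron deployment never imports torch.cuda/xformers. A standalone sketch of the same lazy dispatch (the module is only resolved when the function is called):

# Editorial sketch, not part of the diff: mirror of the worker dispatch.
import importlib

DEVICE_TO_WORKER_MODULE_MAP = {
    "cuda": "aphrodite.task_handler.worker",
    "neuron": "aphrodite.task_handler.neuron_worker",
}

def resolve_worker_cls(device_type: str):
    # Import the device-specific module lazily and hand back its Worker.
    module = importlib.import_module(DEVICE_TO_WORKER_MODULE_MAP[device_type])
    return module.Worker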
@@ -230,18 +305,21 @@ class AphroditeEngine:
         for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids):
         for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids):
             worker.set_cuda_visible_devices.remote(node_gpus[node_id])
             worker.set_cuda_visible_devices.remote(node_gpus[node_id])
 
 
-        distributed_init_method = f"tcp://{driver_ip}:{get_open_port()}"
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
 
 
         # Lazy import the Worker to avoid importing torch.cuda/xformers
         # Lazy import the Worker to avoid importing torch.cuda/xformers
         # before CUDA_VISIBLE_DEVICES is set in the Worker
         # before CUDA_VISIBLE_DEVICES is set in the Worker
-        # pylint: disable=import-outside-toplevel
-        from aphrodite.task_handler.worker import Worker
+        Worker = self._dispatch_worker()
 
 
         # Initialize torch distributed process group for the workers.
         # Initialize torch distributed process group for the workers.
         model_config = copy.deepcopy(self.model_config)
         model_config = copy.deepcopy(self.model_config)
         parallel_config = copy.deepcopy(self.parallel_config)
         parallel_config = copy.deepcopy(self.parallel_config)
         scheduler_config = copy.deepcopy(self.scheduler_config)
         scheduler_config = copy.deepcopy(self.scheduler_config)
         device_config = copy.deepcopy(self.device_config)
         device_config = copy.deepcopy(self.device_config)
+        lora_config = copy.deepcopy(self.lora_config)
+        kv_cache_dtype = self.cache_config.cache_dtype
+        # kv_quant_params_path = self.cache_config.cache_quant_params_path
 
 
         for rank, (worker, (node_id,
         for rank, (worker, (node_id,
                             _)) in enumerate(zip(self.workers,
                             _)) in enumerate(zip(self.workers,
@@ -257,29 +335,33 @@ class AphroditeEngine:
                     local_rank,
                     local_rank,
                     rank,
                     rank,
                     distributed_init_method,
                     distributed_init_method,
-                    lora_config=self.lora_config,
-                    kv_cache_dtype=self.cache_config.cache_dtype,
-                    kv_quant_params_path=
-                    (self.cache_config.cache_quant_params_path),
+                    lora_config=lora_config,
+                    kv_cache_dtype=kv_cache_dtype,
+                    # kv_quant_params_path=kv_quant_params_path,
                 ))
                 ))
 
 
         driver_rank = 0
         driver_rank = 0
         driver_local_rank = node_workers[driver_node_id].index(driver_rank)
         driver_local_rank = node_workers[driver_node_id].index(driver_rank)
         self.driver_worker = Worker(
         self.driver_worker = Worker(
-            model_config,
-            parallel_config,
-            scheduler_config,
-            device_config,
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
             driver_local_rank,
             driver_local_rank,
             driver_rank,
             driver_rank,
             distributed_init_method,
             distributed_init_method,
             lora_config=self.lora_config,
             lora_config=self.lora_config,
-            kv_cache_dtype=self.cache_config.cache_dtype,
-            kv_quant_params_path=(self.cache_config.cache_quant_params_path),
+            kv_cache_dtype=kv_cache_dtype,
+            # kv_quant_params_path=kv_quant_params_path,
             is_driver_worker=True,
             is_driver_worker=True,
         )
         )
 
 
-        self._run_workers("init_model", cupy_port=get_open_port())
+        # don't use cupy for eager mode
+        self._run_workers(
+            "init_model",
+            cupy_port=get_open_port()
+            if not model_config.enforce_eager else None,
+        )
         self._run_workers(
         self._run_workers(
             "load_model",
             "load_model",
             max_concurrent_workers=self.parallel_config.
             max_concurrent_workers=self.parallel_config.
@@ -302,7 +384,6 @@ class AphroditeEngine:
         Then, it calculates the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
         More details can be found in the
-        # pylint: disable=line-too-long
         :meth:`~aphrodite.task_handler.worker.Worker.profile_num_available_blocks` method
         from class :class:`~aphrodite.task_handler.Worker`.
 
 
@@ -372,9 +453,11 @@ class AphroditeEngine:
         # Initialize the cluster.
         # Initialize the cluster.
         placement_group = initialize_cluster(parallel_config)
         placement_group = initialize_cluster(parallel_config)
         # Create the LLM engine.
         # Create the LLM engine.
-        engine = cls(*engine_configs,
-                     placement_group,
-                     log_stats=not engine_args.disable_log_stats)
+        engine = cls(
+            *engine_configs,
+            placement_group,
+            log_stats=not engine_args.disable_log_stats,
+        )
         return engine
         return engine
 
 
     def encode_request(
     def encode_request(
@@ -449,20 +532,34 @@ class AphroditeEngine:
                     sampling_params.prompt_logprobs
                     sampling_params.prompt_logprobs
                     and sampling_params.prompt_logprobs > max_log_probs):
                     and sampling_params.prompt_logprobs > max_log_probs):
             raise ValueError(f"Cannot request more than "
             raise ValueError(f"Cannot request more than "
-                             f"{max_log_probs} logprobs.")
+                             f"{max_log_probs} logprobs. "
+                             "Please increase the max_log_probs.")
         if arrival_time is None:
         if arrival_time is None:
             arrival_time = time.monotonic()
             arrival_time = time.monotonic()
         prompt_token_ids = self.encode_request(
         prompt_token_ids = self.encode_request(
             request_id=request_id,
             request_id=request_id,
             prompt=prompt,
             prompt=prompt,
             prompt_token_ids=prompt_token_ids,
             prompt_token_ids=prompt_token_ids,
-            lora_request=lora_request)
+            lora_request=lora_request,
+        )
 
 
         # Create the sequences.
         # Create the sequences.
         block_size = self.cache_config.block_size
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
         seq_id = next(self.seq_counter)
-        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
-                       lora_request)
+        eos_token_id = self.tokenizer.get_lora_tokenizer(
+            lora_request).eos_token_id
+        seq = Sequence(
+            seq_id,
+            prompt,
+            prompt_token_ids,
+            block_size,
+            eos_token_id,
+            lora_request,
+        )
+
+        # Defensive copy of SamplingParams, which are used by the sampler;
+        # note that this does not deep-copy LogitsProcessor objects.
+        sampling_params = sampling_params.clone()
 
 
         # Create the sequence group.
         # Create the sequence group.
         seq_group = SequenceGroup(request_id, [seq], sampling_params,
         seq_group = SequenceGroup(request_id, [seq], sampling_params,
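Editor's note: two behavioral changes land in add_request here: the sequence now carries its own eos_token_id (looked up from the per-LoRA tokenizer), and the SamplingParams are cloned, so a caller mutating its object after submission cannot affect a queued request. A sketch of the second point, assuming clone() is the shallow copy the comment describes:

# Editorial sketch, not part of the diff.
from aphrodite.common.sampling_params import SamplingParams

params = SamplingParams(temperature=0.7, max_tokens=128)
queued = params.clone()          # what the engine keeps after add_request
params.max_tokens = 1            # caller-side mutation after submission
assert queued.max_tokens == 128  # the queued copy is unaffected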
@@ -514,15 +611,15 @@ class AphroditeEngine:
         if early_stopping is True:
         if early_stopping is True:
             return True
             return True
 
 
-        current_worst_score = (current_worst_seq.get_beam_search_score(
+        current_worst_score = current_worst_seq.get_beam_search_score(
             length_penalty=length_penalty,
             length_penalty=length_penalty,
-            eos_token_id=self.get_tokenizer_for_seq(
-                current_worst_seq).eos_token_id))
+            eos_token_id=current_worst_seq.eos_token_id,
+        )
         if early_stopping is False:
         if early_stopping is False:
-            highest_attainable_score = (best_running_seq.get_beam_search_score(
+            highest_attainable_score = best_running_seq.get_beam_search_score(
                 length_penalty=length_penalty,
                 length_penalty=length_penalty,
-                eos_token_id=self.get_tokenizer_for_seq(
-                    best_running_seq).eos_token_id))
+                eos_token_id=best_running_seq.eos_token_id,
+            )
         else:
         else:
             assert early_stopping == "never"
             assert early_stopping == "never"
             if length_penalty > 0.0:
             if length_penalty > 0.0:
@@ -532,13 +629,14 @@ class AphroditeEngine:
                 max_possible_length = max(
                 max_possible_length = max(
                     best_running_seq.get_prompt_len() +
                     best_running_seq.get_prompt_len() +
                     sampling_params.max_tokens,
                     sampling_params.max_tokens,
-                    self.scheduler_config.max_model_len)
+                    self.scheduler_config.max_model_len,
+                )
                 highest_attainable_score = (
                 highest_attainable_score = (
                     best_running_seq.get_beam_search_score(
                     best_running_seq.get_beam_search_score(
                         length_penalty=length_penalty,
                         length_penalty=length_penalty,
-                        eos_token_id=self.get_tokenizer_for_seq(
-                            best_running_seq).eos_token_id,
-                        seq_len=max_possible_length))
+                        eos_token_id=best_running_seq.eos_token_id,
+                        seq_len=max_possible_length,
+                    ))
             else:
             else:
                 # Otherwise, beam search will prefer shorter sequences. The
                 # Otherwise, beam search will prefer shorter sequences. The
                 # highest attainable score calculation is based on the current
                 # highest attainable score calculation is based on the current
@@ -546,8 +644,8 @@ class AphroditeEngine:
                 highest_attainable_score = (
                 highest_attainable_score = (
                     best_running_seq.get_beam_search_score(
                     best_running_seq.get_beam_search_score(
                         length_penalty=length_penalty,
                         length_penalty=length_penalty,
-                        eos_token_id=self.get_tokenizer_for_seq(
-                            best_running_seq).eos_token_id))
+                        eos_token_id=best_running_seq.eos_token_id,
+                    ))
         return current_worst_score >= highest_attainable_score
         return current_worst_score >= highest_attainable_score
 
 
     def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
     def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
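Editor's note: the early-stopping check now reads eos_token_id straight off the sequence instead of re-fetching the tokenizer; the comparison itself is unchanged. A sketch of the score being compared, assuming get_beam_search_score uses the conventional length-penalized normalization (cumulative log-probability divided by seq_len ** length_penalty):

# Editorial sketch, not part of the diff; the formula is an assumption.
def beam_search_score(cumulative_logprob: float,
                      seq_len: int,
                      length_penalty: float = 1.0) -> float:
    return cumulative_logprob / (seq_len ** length_penalty)

# With early_stopping=False, the search stops once the worst kept finished
# sequence already scores at least as well as the best running one:
worst_finished = beam_search_score(-8.0, 20)   # -0.40
best_running = beam_search_score(-9.0, 18)     # -0.50
stop_beam_search = worst_finished >= best_running  # True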
@@ -555,6 +653,16 @@ class AphroditeEngine:
         # Process prompt logprobs
         # Process prompt logprobs
         prompt_logprobs = outputs.prompt_logprobs
         prompt_logprobs = outputs.prompt_logprobs
         if prompt_logprobs is not None:
         if prompt_logprobs is not None:
+            # We can pick any sequence for the prompt.
+            seq = next(iter(seq_group.seqs_dict.values()))
+            all_token_ids = seq.get_token_ids()
+            for i, prompt_logprobs_for_token in enumerate(prompt_logprobs):
+                self._decode_logprobs(
+                    seq,
+                    seq_group.sampling_params,
+                    prompt_logprobs_for_token,
+                    all_token_ids[:i],
+                )
             seq_group.prompt_logprobs = prompt_logprobs
             seq_group.prompt_logprobs = prompt_logprobs
 
 
         # Process samples
         # Process samples
@@ -638,10 +746,11 @@ class AphroditeEngine:
                              if seq.is_finished()]
                              if seq.is_finished()]
         all_finished_seqs = existing_finished_seqs + new_finished_seqs
         all_finished_seqs = existing_finished_seqs + new_finished_seqs
         # Sort the finished sequences by their scores.
         # Sort the finished sequences by their scores.
-        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
-            length_penalty=length_penalty,
-            eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
-                               reverse=True)
+        all_finished_seqs.sort(
+            key=lambda x: x[0].get_beam_search_score(
+                length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
+            reverse=True,
+        )
         for seq, parent, is_new in all_finished_seqs[:beam_width]:
         for seq, parent, is_new in all_finished_seqs[:beam_width]:
             if is_new:
             if is_new:
                 # A newly generated child sequence finishes and has a high
                 # A newly generated child sequence finishes and has a high
@@ -666,10 +775,11 @@ class AphroditeEngine:
         running_child_seqs = [(seq, parent) for seq, parent in child_seqs
         running_child_seqs = [(seq, parent) for seq, parent in child_seqs
                               if not seq.is_finished()]
                               if not seq.is_finished()]
         # Sort the running sequences by their scores.
         # Sort the running sequences by their scores.
-        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
-            length_penalty=length_penalty,
-            eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
-                                reverse=True)
+        running_child_seqs.sort(
+            key=lambda x: x[0].get_beam_search_score(
+                length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
+            reverse=True,
+        )
 
 
         # Check if we can stop the beam search.
         # Check if we can stop the beam search.
         if len(running_child_seqs) == 0:
         if len(running_child_seqs) == 0:
@@ -684,7 +794,10 @@ class AphroditeEngine:
             current_worst_seq = all_finished_seqs[beam_width - 1][0]
             current_worst_seq = all_finished_seqs[beam_width - 1][0]
             stop_beam_search = self._check_beam_search_early_stopping(
             stop_beam_search = self._check_beam_search_early_stopping(
                 seq_group.sampling_params.early_stopping,
                 seq_group.sampling_params.early_stopping,
-                seq_group.sampling_params, best_running_seq, current_worst_seq)
+                seq_group.sampling_params,
+                best_running_seq,
+                current_worst_seq,
+            )
 
 
         if stop_beam_search:
         if stop_beam_search:
             # Stop the beam search and remove all the running sequences from
             # Stop the beam search and remove all the running sequences from
@@ -726,13 +839,16 @@ class AphroditeEngine:
     def _process_model_outputs(
     def _process_model_outputs(
             self, output: SamplerOutput,
             self, output: SamplerOutput,
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+        now = time.time()
         # Update the scheduled sequence groups with the model outputs.
         # Update the scheduled sequence groups with the model outputs.
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
+
         # If prefix caching is enabled, mark all blocks in the sequence groups
         # If prefix caching is enabled, mark all blocks in the sequence groups
         # as completed so that future requests don't attempt to recompute them
         # as completed so that future requests don't attempt to recompute them
         if self.cache_config.context_shift:
         if self.cache_config.context_shift:
             for seq_group in scheduled_seq_groups:
             for seq_group in scheduled_seq_groups:
                 self.scheduler.mark_blocks_as_computed(seq_group)
                 self.scheduler.mark_blocks_as_computed(seq_group)
+
         for seq_group, outputs in zip(scheduled_seq_groups, output):
         for seq_group, outputs in zip(scheduled_seq_groups, output):
             self._process_sequence_group_outputs(seq_group, outputs)
             self._process_sequence_group_outputs(seq_group, outputs)
 
 
@@ -742,6 +858,7 @@ class AphroditeEngine:
         # Create the outputs.
         # Create the outputs.
         request_outputs: List[RequestOutput] = []
         request_outputs: List[RequestOutput] = []
         for seq_group in scheduled_seq_groups:
         for seq_group in scheduled_seq_groups:
+            seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
             request_outputs.append(request_output)
         for seq_group in scheduler_outputs.ignored_seq_groups:
         for seq_group in scheduler_outputs.ignored_seq_groups:
@@ -751,6 +868,7 @@ class AphroditeEngine:
         # Log stats.
         # Log stats.
         if self.log_stats:
         if self.log_stats:
             self.stat_logger.log(self._get_stats(scheduler_outputs))
             self.stat_logger.log(self._get_stats(scheduler_outputs))
+
         return request_outputs
         return request_outputs
 
 
     def step(self) -> List[RequestOutput]:
     def step(self) -> List[RequestOutput]:
@@ -815,7 +933,9 @@ class AphroditeEngine:
                     "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in,
                     "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in,
                     "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out,
                     "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out,
                     "blocks_to_copy": scheduler_outputs.blocks_to_copy,
                     "blocks_to_copy": scheduler_outputs.blocks_to_copy,
-                })
+                },
+                use_ray_compiled_dag=USE_RAY_COMPILED_DAG,
+            )
 
 
             # Only the driver worker returns the sampling results.
             # Only the driver worker returns the sampling results.
             output = all_outputs[0]
             output = all_outputs[0]
@@ -840,10 +960,10 @@ class AphroditeEngine:
         gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
         gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
 
 
         num_total_cpu = self.cache_config.num_cpu_blocks
         num_total_cpu = self.cache_config.num_cpu_blocks
-        cpu_cache_usage = 0.
+        cpu_cache_usage = 0.0
         if num_total_cpu > 0:
         if num_total_cpu > 0:
-            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
-            )
+            num_free_cpu = (
+                self.scheduler.block_manager.get_num_free_cpu_blocks())
             cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
             cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
 
 
         # Scheduler State
         # Scheduler State
@@ -898,16 +1018,24 @@ class AphroditeEngine:
             time_e2e_requests=time_e2e_requests,
             time_e2e_requests=time_e2e_requests,
         )
         )
 
 
-    def _decode_logprobs(self, seq: Sequence, prms: SamplingParams,
-                         logprobs: Dict[int, Logprob],
-                         all_input_ids: List[int]) -> None:
+    def _decode_logprobs(
+        self,
+        seq: Sequence,
+        prms: SamplingParams,
+        logprobs: Dict[int, Logprob],
+        all_input_ids: List[int],
+    ) -> None:
         if not logprobs:
         if not logprobs:
             return
             return
         for token_id, sample_logprob in logprobs.items():
         for token_id, sample_logprob in logprobs.items():
-            if (sample_logprob.decoded_token is None and token_id != -1):
+            if sample_logprob.decoded_token is None and token_id != -1:
                 all_input_ids_with_logprob = all_input_ids[:-1] + [token_id]
                 all_input_ids_with_logprob = all_input_ids[:-1] + [token_id]
-                # pylint: disable=unused-variable
-                _, new_text, prefix_offset, read_offset = detokenize_incrementally(
+                (
+                    _,
+                    new_text,
+                    prefix_offset,
+                    read_offset,
+                ) = detokenize_incrementally(
                     self.get_tokenizer_for_seq(seq),
                     self.get_tokenizer_for_seq(seq),
                     all_input_ids=all_input_ids_with_logprob,
                     all_input_ids=all_input_ids_with_logprob,
                     prev_tokens=seq.tokens,
                     prev_tokens=seq.tokens,
@@ -924,16 +1052,21 @@ class AphroditeEngine:
         all_input_ids = seq.get_token_ids()
         all_input_ids = seq.get_token_ids()
         self._decode_logprobs(seq, prms, seq.output_logprobs[-1],
         self._decode_logprobs(seq, prms, seq.output_logprobs[-1],
                               all_input_ids)
                               all_input_ids)
-        (new_tokens, new_output_text, prefix_offset,
-         read_offset) = detokenize_incrementally(
-             self.get_tokenizer_for_seq(seq),
-             all_input_ids=all_input_ids,
-             prev_tokens=seq.tokens,
-             prefix_offset=seq.prefix_offset,
-             read_offset=seq.read_offset,
-             skip_special_tokens=prms.skip_special_tokens,
-             spaces_between_special_tokens=prms.spaces_between_special_tokens,
-         )
+
+        (
+            new_tokens,
+            new_output_text,
+            prefix_offset,
+            read_offset,
+        ) = detokenize_incrementally(
+            self.get_tokenizer_for_seq(seq),
+            all_input_ids=all_input_ids,
+            prev_tokens=seq.tokens,
+            prefix_offset=seq.prefix_offset,
+            read_offset=seq.read_offset,
+            skip_special_tokens=prms.skip_special_tokens,
+            spaces_between_special_tokens=prms.spaces_between_special_tokens,
+        )
         if seq.tokens is None:
         if seq.tokens is None:
             seq.tokens = new_tokens
             seq.tokens = new_tokens
         else:
         else:
@@ -968,15 +1101,18 @@ class AphroditeEngine:
             return
             return
 
 
         # Check if the sequence has generated the EOS token.
         # Check if the sequence has generated the EOS token.
-        if ((not sampling_params.ignore_eos) and seq.get_last_token_id()
-                == self.get_tokenizer_for_seq(seq).eos_token_id):
+        if (not sampling_params.ignore_eos
+            ) and seq.get_last_token_id() == seq.eos_token_id:
             seq.status = SequenceStatus.FINISHED_STOPPED
             seq.status = SequenceStatus.FINISHED_STOPPED
             return
             return
 
 
     def _finalize_sequence(self, seq: Sequence,
     def _finalize_sequence(self, seq: Sequence,
                            sampling_params: SamplingParams,
                            sampling_params: SamplingParams,
                            stop_string: str) -> None:
                            stop_string: str) -> None:
-        if not sampling_params.include_stop_str_in_output and stop_string:
+        if sampling_params.include_stop_str_in_output:
+            return
+
+        if stop_string and seq.output_text.endswith(stop_string):
             # Truncate the output text so that the stop string is
             # Truncate the output text so that the stop string is
             # not included in the output.
             # not included in the output.
             seq.output_text = seq.output_text[:-len(stop_string)]
             seq.output_text = seq.output_text[:-len(stop_string)]
@@ -1005,6 +1141,7 @@ class AphroditeEngine:
         driver_args: Optional[List[Any]] = None,
         driver_args: Optional[List[Any]] = None,
         driver_kwargs: Optional[Dict[str, Any]] = None,
         driver_kwargs: Optional[Dict[str, Any]] = None,
         max_concurrent_workers: Optional[int] = None,
         max_concurrent_workers: Optional[int] = None,
+        use_ray_compiled_dag: bool = False,
         **kwargs,
         **kwargs,
     ) -> Any:
     ) -> Any:
         """Runs the given method on all workers."""
         """Runs the given method on all workers."""
@@ -1013,11 +1150,17 @@ class AphroditeEngine:
             raise NotImplementedError(
             raise NotImplementedError(
                 "max_concurrent_workers is not supported yet.")
                 "max_concurrent_workers is not supported yet.")
 
 
-        # Start the ray workers first.
-        ray_worker_outputs = [
-            worker.execute_method.remote(method, *args, **kwargs)
-            for worker in self.workers
-        ]
+        if use_ray_compiled_dag:
+            # Right now, compiled DAG can only accept a single
+            # input.
+            # TODO: Fix it.
+            output_channels = self.forward_dag.execute(1)
+        else:
+            # Start the ray workers first.
+            ray_worker_outputs = [
+                worker.execute_method.remote(method, *args, **kwargs)
+                for worker in self.workers
+            ]
 
 
         if driver_args is None:
         if driver_args is None:
             driver_args = args
             driver_args = args
@@ -1030,10 +1173,45 @@ class AphroditeEngine:
 
 
         # Get the results of the ray workers.
         # Get the results of the ray workers.
         if self.workers:
         if self.workers:
-            ray_worker_outputs = ray.get(ray_worker_outputs)
+            if use_ray_compiled_dag:
+                try:
+                    ray_worker_outputs = [
+                        pickle.loads(chan.begin_read())
+                        for chan in output_channels
+                    ]
+                finally:
+                    # Has to call end_read in order to reuse the DAG.
+                    for chan in output_channels:
+                        chan.end_read()
+            else:
+                ray_worker_outputs = ray.get(ray_worker_outputs)
 
 
         return [driver_worker_output] + ray_worker_outputs
         return [driver_worker_output] + ray_worker_outputs
 
 
+    def _compiled_ray_dag(self):
+        from packaging import version
+        import pkg_resources
+
+        required_version = "2.9"
+        current_version = pkg_resources.get_distribution("ray").version
+
+        if version.parse(current_version) < version.parse(required_version):
+            raise ValueError(f"Ray version {required_version} or greater is "
+                             f"required, but found {current_version}")
+
+        from ray.dag import MultiOutputNode, InputNode
+
+        assert self.parallel_config.worker_use_ray
+
+        # Right now, compiled DAG requires at least 1 arg. We send
+        # a dummy value for now. It will be fixed soon.
+        with InputNode() as input_data:
+            forward_dag = MultiOutputNode([
+                worker.execute_model_compiled_dag_remote.bind(input_data)
+                for worker in self.workers
+            ])
+        return forward_dag.experimental_compile()
+
     def check_health(self) -> None:
     def check_health(self) -> None:
         """Raises an error if engine is unhealthy."""
         """Raises an error if engine is unhealthy."""
         self._check_if_any_actor_is_dead()
         self._check_if_any_actor_is_dead()
@@ -1052,7 +1230,7 @@ class AphroditeEngine:
                 dead_actors.append(actor)
                 dead_actors.append(actor)
         if dead_actors:
         if dead_actors:
             raise RuntimeError("At least one Worker is dead. "
             raise RuntimeError("At least one Worker is dead. "
-                               f"Dead workers: {dead_actors}")
+                               f"Dead Workers: {dead_actors}. ")
 
 
 
 
 setup_logger()
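Editor's note: check_health is what the new /health route reaches through the async engine; on the synchronous engine it raises RuntimeError when any Ray worker actor has died. A small wrapper sketch:

# Editorial sketch, not part of the diff.
def is_healthy(engine) -> bool:
    # check_health() returns None when healthy and raises RuntimeError
    # (e.g. when at least one Worker is dead) otherwise.
    try:
        engine.check_health()
        return True
    except RuntimeError:
        return False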

+ 373 - 240
aphrodite/engine/args_tools.py

@@ -3,22 +3,29 @@ import dataclasses
 from dataclasses import dataclass
 from dataclasses import dataclass
 from typing import Optional, Tuple
 from typing import Optional, Tuple
 
 
-from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
-                                     SchedulerConfig, LoRAConfig, DeviceConfig)
+from aphrodite.common.config import (
+    CacheConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    LoRAConfig,
+    DeviceConfig,
+)
 
 
 
 
 @dataclass
 @dataclass
 class EngineArgs:
 class EngineArgs:
-    """Arguments for the Aphrodite engine."""
+    """Arguments for Aphrodite engine."""
+
     model: str
     model: str
     tokenizer: Optional[str] = None
     tokenizer: Optional[str] = None
-    tokenizer_mode: str = 'auto'
+    tokenizer_mode: str = "auto"
     trust_remote_code: bool = False
     trust_remote_code: bool = False
     download_dir: Optional[str] = None
     download_dir: Optional[str] = None
-    load_format: str = 'auto'
-    dtype: str = 'auto'
-    kv_cache_dtype: str = 'auto'
-    kv_quant_params_path: str = None
+    load_format: str = "auto"
+    dtype: str = "auto"
+    kv_cache_dtype: str = "auto"
+    # kv_quant_params_path: str = None
     seed: int = 0
     seed: int = 0
     max_model_len: Optional[int] = None
     max_model_len: Optional[int] = None
     worker_use_ray: bool = False
     worker_use_ray: bool = False
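Editor's note: the most consequential default changes here are enforce_eager flipping to True, which makes CUDA graph capture opt-in, and device moving from "cuda" to "auto". A quick sketch of the new defaults:

# Editorial sketch, not part of the diff.
from aphrodite.engine.args_tools import EngineArgs

eager = EngineArgs(model="EleutherAI/pythia-70m-deduped")
assert eager.enforce_eager is True        # new default after this patch
assert eager.device == "auto"             # was "cuda" before

graphs = EngineArgs(model="EleutherAI/pythia-70m-deduped",
                    enforce_eager=False)  # explicitly re-enable CUDA graphs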
@@ -32,24 +39,26 @@ class EngineArgs:
     max_num_batched_tokens: Optional[int] = None
     max_num_batched_tokens: Optional[int] = None
     max_num_seqs: int = 256
     max_num_seqs: int = 256
     max_paddings: int = 256
     max_paddings: int = 256
-    max_log_probs: int = 10
+    max_log_probs: int = 10  # OpenAI default is 5, setting to 10 because ST
     disable_log_stats: bool = False
     disable_log_stats: bool = False
     revision: Optional[str] = None
     revision: Optional[str] = None
+    code_revision: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
     quantization: Optional[str] = None
     load_in_4bit: bool = False
     load_in_4bit: bool = False
     load_in_8bit: bool = False
     load_in_8bit: bool = False
     load_in_smooth: bool = False
     load_in_smooth: bool = False
-    enforce_eager: bool = False
+    enforce_eager: bool = True
     max_context_len_to_capture: int = 8192
     max_context_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     disable_custom_all_reduce: bool = False
     enable_lora: bool = False
     enable_lora: bool = False
     max_loras: int = 1
     max_loras: int = 1
     max_lora_rank: int = 16
     max_lora_rank: int = 16
     lora_extra_vocab_size: int = 256
     lora_extra_vocab_size: int = 256
-    lora_dtype = 'auto'
+    lora_dtype = "auto"
     max_cpu_loras: Optional[int] = None
     max_cpu_loras: Optional[int] = None
-    device: str = 'cuda'
+    device: str = "auto"
+    ray_workers_use_nsight: bool = False
 
 
     def __post_init__(self):
     def __post_init__(self):
         if self.tokenizer is None:
         if self.tokenizer is None:
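Editor's note: the remaining hunks mostly reflow the argparse definitions to double quotes and multi-line calls, while adding --code-revision, --ray-workers-use-nsight, and a 128 block-size choice, and dropping the int8 KV-cache option for now. A launch sketch using a few of these flags, with the module path of the OpenAI-compatible entry point assumed:

# Editorial example, not part of the diff.
import subprocess

subprocess.run([
    "python", "-m", "aphrodite.endpoints.openai.api_server",
    "--model", "EleutherAI/pythia-70m-deduped",
    "--kv-cache-dtype", "fp8_e5m2",   # the int8 choice is disabled by this patch
    "-tp", "2",                       # implies Ray workers
    "--ray-workers-use-nsight",       # new profiling flag
])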
@@ -65,245 +74,333 @@ class EngineArgs:
 
 
         # Model arguments
         # Model arguments
         parser.add_argument(
         parser.add_argument(
-            '--model',
+            "--model",
             type=str,
             type=str,
-            default='EleutherAI/pythia-70m-deduped',
-            help='name or path of the huggingface model to use')
+            default="EleutherAI/pythia-70m-deduped",
+            help="name or path of the huggingface model to use",
+        )
         parser.add_argument(
         parser.add_argument(
-            '--tokenizer',
+            "--tokenizer",
             type=str,
             type=str,
             default=EngineArgs.tokenizer,
             default=EngineArgs.tokenizer,
-            help='name or path of the huggingface tokenizer to use')
+            help="name or path of the huggingface tokenizer to use",
+        )
         parser.add_argument(
         parser.add_argument(
-            '--revision',
+            "--revision",
             type=str,
             type=str,
             default=None,
             default=None,
-            help='the specific model version to use. It can be a branch '
-            'name, a tag name, or a commit id. If unspecified, will use '
-            'the default version.')
+            help="the specific model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
         parser.add_argument(
-            '--tokenizer-revision',
+            "--code-revision",
             type=str,
             type=str,
             default=None,
             default=None,
-            help='the specific tokenizer version to use. It can be a branch '
-            'name, a tag name, or a commit id. If unspecified, will use '
-            'the default version.')
-        parser.add_argument('--tokenizer-mode',
-                            type=str,
-                            default=EngineArgs.tokenizer_mode,
-                            choices=['auto', 'slow'],
-                            help='tokenizer mode. "auto" will use the fast '
-                            'tokenizer if available, and "slow" will '
-                            'always use the slow tokenizer.')
-        parser.add_argument('--trust-remote-code',
-                            action='store_true',
-                            help='trust remote code from huggingface')
-        parser.add_argument('--download-dir',
-                            type=str,
-                            default=EngineArgs.download_dir,
-                            help='directory to download and load the weights, '
-                            'default to the default cache dir of '
-                            'huggingface')
-        parser.add_argument(
-            '--load-format',
+            help="the specific revision to use for the model code on "
+            "Hugging Face Hub. It can be a branch name, a tag name, or a "
+            "commit id. If unspecified, will use the default version.",
+        )
+        parser.add_argument(
+            "--tokenizer-revision",
+            type=str,
+            default=None,
+            help="the specific tokenizer version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=EngineArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help='tokenizer mode. "auto" will use the fast '
+            'tokenizer if available, and "slow" will '
+            "always use the slow tokenizer.",
+        )
+        parser.add_argument(
+            "--trust-remote-code",
+            action="store_true",
+            help="trust remote code from huggingface",
+        )
+        parser.add_argument(
+            "--download-dir",
+            type=str,
+            default=EngineArgs.download_dir,
+            help="directory to download and load the weights, "
+            "default to the default cache dir of "
+            "huggingface",
+        )
+        parser.add_argument(
+            "--load-format",
             type=str,
             type=str,
             default=EngineArgs.load_format,
             default=EngineArgs.load_format,
-            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
-            help='The format of the model weights to load. '
+            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
+            help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             '"auto" will try to load the weights in the safetensors format '
-            'and fall back to the pytorch bin format if safetensors format '
-            'is not available. '
+            "and fall back to the pytorch bin format if safetensors format "
+            "is not available. "
             '"pt" will load the weights in the pytorch bin format. '
             '"pt" will load the weights in the pytorch bin format. '
             '"safetensors" will load the weights in the safetensors format. '
             '"safetensors" will load the weights in the safetensors format. '
             '"npcache" will load the weights in pytorch format and store '
             '"npcache" will load the weights in pytorch format and store '
-            'a numpy cache to speed up the loading. '
+            "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
             '"dummy" will initialize the weights with random values, '
-            'which is mainly for profiling.')
+            "which is mainly for profiling.",
+        )
         parser.add_argument(
         parser.add_argument(
-            '--dtype',
+            "--dtype",
             type=str,
             type=str,
             default=EngineArgs.dtype,
             default=EngineArgs.dtype,
             choices=[
             choices=[
-                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
+                "auto", "half", "float16", "bfloat16", "float", "float32"
             ],
             ],
-            help='data type for model weights and activations. '
+            help="data type for model weights and activations. "
             'The "auto" option will use FP16 precision '
             'The "auto" option will use FP16 precision '
-            'for FP32 and FP16 models, and BF16 precision '
-            'for BF16 models.')
+            "for FP32 and FP16 models, and BF16 precision "
+            "for BF16 models.",
+        )
         parser.add_argument(
         parser.add_argument(
-            '--kv-cache-dtype',
+            "--kv-cache-dtype",
             type=str,
             type=str,
-            choices=['auto', 'fp8_e5m2', 'int8'],
+            # choices=["auto", "fp8_e5m2", "int8"],
+            choices=['auto', 'fp8_e5m2'],
             default=EngineArgs.kv_cache_dtype,
             default=EngineArgs.kv_cache_dtype,
             help='Data type for kv cache storage. If "auto", will use model '
             help='Data type for kv cache storage. If "auto", will use model '
-            'data type. Note FP8 is not supported when cuda version is '
-            'lower than 11.8.')
+            "data type. Note FP8 is not supported when cuda version is "
+            "lower than 11.8.",
+        )
+        # parser.add_argument(
+        #     "--kv-quant-params-path",
+        #     type=str,
+        #     default=EngineArgs.kv_quant_params_path,
+        #     help="Path to scales and zero points of KV cache "
+        #     "quantization. Only applicable when kv-cache-dtype "
+        #     "is int8.",
+        # )
         parser.add_argument(
         parser.add_argument(
-            '--kv-quant-params-path',
-            type=str,
-            default=EngineArgs.kv_quant_params_path,
-            help='Path to scales and zero points of KV cache '
-            'quantization. Only applicable when kv-cache-dtype '
-            'is int8.')
-        parser.add_argument('--max-model-len',
-                            type=int,
-                            default=EngineArgs.max_model_len,
-                            help='model context length. If unspecified, '
-                            'will be automatically derived from the model.')
+            "--max-model-len",
+            type=int,
+            default=EngineArgs.max_model_len,
+            help="model context length. If unspecified, "
+            "will be automatically derived from the model.",
+        )
         # Parallel arguments
-        parser.add_argument('--worker-use-ray',
-                            action='store_true',
-                            help='use Ray for distributed serving, will be '
-                            'automatically set when using more than 1 GPU')
-        parser.add_argument('--pipeline-parallel-size',
-                            '-pp',
-                            type=int,
-                            default=EngineArgs.pipeline_parallel_size,
-                            help='number of pipeline stages')
-        parser.add_argument('--tensor-parallel-size',
-                            '-tp',
-                            type=int,
-                            default=EngineArgs.tensor_parallel_size,
-                            help='number of tensor parallel replicas')
         parser.add_argument(
-            '--max-parallel-loading-workers',
+            "--worker-use-ray",
+            action="store_true",
+            help="use Ray for distributed serving, will be "
+            "automatically set when using more than 1 GPU",
+        )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "-pp",
+            type=int,
+            default=EngineArgs.pipeline_parallel_size,
+            help="number of pipeline stages",
+        )
+        parser.add_argument(
+            "--tensor-parallel-size",
+            "-tp",
+            type=int,
+            default=EngineArgs.tensor_parallel_size,
+            help="number of tensor parallel replicas",
+        )
+        parser.add_argument(
+            "--max-parallel-loading-workers",
             type=int,
             default=EngineArgs.max_parallel_loading_workers,
-            help='load model sequentially in multiple batches, '
-            'to avoid RAM OOM when using tensor '
-            'parallel and large models')
+            help="load model sequentially in multiple batches, "
+            "to avoid RAM OOM when using tensor "
+            "parallel and large models",
+        )
+        parser.add_argument(
+            "--ray-workers-use-nsight",
+            action="store_true",
+            help="If specified, use nsight to profile ray workers",
+        )
         # KV cache arguments
-        parser.add_argument('--block-size',
-                            type=int,
-                            default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
-                            help='token block size')
-        parser.add_argument('--context-shift',
-                            action='store_true',
-                            help='Enable context shifting.')
-        parser.add_argument('--seed',
+        parser.add_argument(
+            "--block-size",
+            type=int,
+            default=EngineArgs.block_size,
+            choices=[8, 16, 32, 128],
+            help="token block size",
+        )
+        parser.add_argument(
+            "--context-shift",
+            action="store_true",
+            help="Enable context shifting.",
+        )
+        parser.add_argument("--seed",
                             type=int,
                             default=EngineArgs.seed,
-                            help='random seed')
-        parser.add_argument('--swap-space',
-                            type=int,
-                            default=EngineArgs.swap_space,
-                            help='CPU swap space size (GiB) per GPU')
+                            help="random seed")
         parser.add_argument(
-            '--gpu-memory-utilization',
-            '-gmu',
+            "--swap-space",
+            type=int,
+            default=EngineArgs.swap_space,
+            help="CPU swap space size (GiB) per GPU",
+        )
+        parser.add_argument(
+            "--gpu-memory-utilization",
+            "-gmu",
             type=float,
             default=EngineArgs.gpu_memory_utilization,
-            help='the fraction of GPU memory to be used for '
-            'the model executor, which can range from 0 to 1.'
-            'If unspecified, will use the default value of 0.9.')
-        parser.add_argument('--max-num-batched-tokens',
-                            type=int,
-                            default=EngineArgs.max_num_batched_tokens,
-                            help='maximum number of batched tokens per '
-                            'iteration')
-        parser.add_argument('--max-num-seqs',
-                            type=int,
-                            default=EngineArgs.max_num_seqs,
-                            help='maximum number of sequences per iteration')
-        parser.add_argument('--max-paddings',
-                            type=int,
-                            default=EngineArgs.max_paddings,
-                            help='maximum number of paddings in a batch')
-        parser.add_argument('--max-log-probs',
-                            type=int,
-                            default=EngineArgs.max_log_probs,
-                            help='maximum number of log probabilities to '
-                            'return.')
-        parser.add_argument('--disable-log-stats',
-                            action='store_true',
-                            help='disable logging statistics')
+            help="the fraction of GPU memory to be used for "
+            "the model executor, which can range from 0 to 1."
+            "If unspecified, will use the default value of 0.9.",
+        )
+        parser.add_argument(
+            "--max-num-batched-tokens",
+            type=int,
+            default=EngineArgs.max_num_batched_tokens,
+            help="maximum number of batched tokens per "
+            "iteration",
+        )
+        parser.add_argument(
+            "--max-num-seqs",
+            type=int,
+            default=EngineArgs.max_num_seqs,
+            help="maximum number of sequences per iteration",
+        )
+        parser.add_argument(
+            "--max-paddings",
+            type=int,
+            default=EngineArgs.max_paddings,
+            help="maximum number of paddings in a batch",
+        )
+        parser.add_argument(
+            "--max-log-probs",
+            type=int,
+            default=EngineArgs.max_log_probs,
+            help="maximum number of log probabilities to "
+            "return.",
+        )
+        parser.add_argument(
+            "--disable-log-stats",
+            action="store_true",
+            help="disable logging statistics",
+        )
         # Quantization settings.
-        parser.add_argument('--quantization',
-                            '-q',
-                            type=str,
-                            choices=[
-                                'aqlm', 'awq', 'bnb', 'exl2', 'gguf', 'gptq',
-                                'quip', 'squeezellm', 'marlin', None
-                            ],
-                            default=EngineArgs.quantization,
-                            help='Method used to quantize the weights. If '
-                            'None, we first check the `quantization_config` '
-                            'attribute in the model config file. If that is '
-                            'None, we assume the model weights are not '
-                            'quantized and use `dtype` to determine the data '
-                            'type of the weights.')
-        parser.add_argument('--load-in-4bit',
-                            action='store_true',
-                            help='Load the FP16 model in 4-bit format. Also '
-                            'works with AWQ models. Throughput at 2.5x of '
-                            'FP16.')
-        parser.add_argument('--load-in-8bit',
-                            action='store_true',
-                            help='Load the FP16 model in 8-bit format. '
-                            'Throughput at 0.3x of FP16.')
-        parser.add_argument('--load-in-smooth',
-                            action='store_true',
-                            help='Load the FP16 model in smoothquant '
-                            '8bit format. Throughput at 0.7x of FP16. ')
-        parser.add_argument('--enforce-eager',
-                            action='store_true',
-                            help='Always use eager-mode PyTorch. If False, '
-                            'will use eager mode and CUDA graph in hybrid '
-                            'for maximal performance and flexibility.')
-        parser.add_argument('--max-context-len-to-capture',
-                            type=int,
-                            default=EngineArgs.max_context_len_to_capture,
-                            help='maximum context length covered by CUDA '
-                            'graphs. When a sequence has context length '
-                            'larger than this, we fall back to eager mode.')
-        parser.add_argument('--disable-custom-all-reduce',
-                            action='store_true',
-                            default=EngineArgs.disable_custom_all_reduce,
-                            help='See ParallelConfig')
+        parser.add_argument(
+            "--quantization",
+            "-q",
+            type=str,
+            choices=[
+                "aqlm",
+                "awq",
+                "bnb",
+                "exl2",
+                "gguf",
+                "gptq",
+                "quip",
+                "squeezellm",
+                "marlin",
+                None,
+            ],
+            default=EngineArgs.quantization,
+            help="Method used to quantize the weights. If "
+            "None, we first check the `quantization_config` "
+            "attribute in the model config file. If that is "
+            "None, we assume the model weights are not "
+            "quantized and use `dtype` to determine the data "
+            "type of the weights.",
+        )
+        parser.add_argument(
+            "--load-in-4bit",
+            action="store_true",
+            help="Load the FP16 model in 4-bit format. Also "
+            "works with AWQ models. Throughput at 2.5x of "
+            "FP16.",
+        )
+        parser.add_argument(
+            "--load-in-8bit",
+            action="store_true",
+            help="Load the FP16 model in 8-bit format. "
+            "Throughput at 0.3x of FP16.",
+        )
+        parser.add_argument(
+            "--load-in-smooth",
+            action="store_true",
+            help="Load the FP16 model in smoothquant "
+            "8bit format. Throughput at 0.7x of FP16. ",
+        )
+        parser.add_argument(
+            "--enforce-eager",
+            type=lambda x: (str(x).lower() == 'true'),
+            default=EngineArgs.enforce_eager,
+            help="Always use eager-mode PyTorch. If False, "
+            "will use eager mode and CUDA graph in hybrid "
+            "for maximal performance and flexibility.",
+        )
+        parser.add_argument(
+            "--max-context-len-to-capture",
+            type=int,
+            default=EngineArgs.max_context_len_to_capture,
+            help="maximum context length covered by CUDA "
+            "graphs. When a sequence has context length "
+            "larger than this, we fall back to eager mode.",
+        )
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            default=EngineArgs.disable_custom_all_reduce,
+            help="See ParallelConfig",
+        )
         # LoRA related configs
-        parser.add_argument('--enable-lora',
-                            action='store_true',
-                            help='If True, enable handling of LoRA adapters.')
-        parser.add_argument('--max-loras',
-                            type=int,
-                            default=EngineArgs.max_loras,
-                            help='Max number of LoRAs in a single batch.')
-        parser.add_argument('--max-lora-rank',
-                            type=int,
-                            default=EngineArgs.max_lora_rank,
-                            help='Max LoRA rank.')
         parser.add_argument(
-            '--lora-extra-vocab-size',
+            "--enable-lora",
+            action="store_true",
+            help="If True, enable handling of LoRA adapters.",
+        )
+        parser.add_argument(
+            "--max-loras",
+            type=int,
+            default=EngineArgs.max_loras,
+            help="Max number of LoRAs in a single batch.",
+        )
+        parser.add_argument(
+            "--max-lora-rank",
+            type=int,
+            default=EngineArgs.max_lora_rank,
+            help="Max LoRA rank.",
+        )
+        parser.add_argument(
+            "--lora-extra-vocab-size",
             type=int,
             default=EngineArgs.lora_extra_vocab_size,
-            help=('Maximum size of extra vocabulary that can be '
-                  'present in a LoRA adapter (added to the base '
-                  'model vocabulary).'))
+            help=("Maximum size of extra vocabulary that can be "
+                  "present in a LoRA adapter (added to the base "
+                  "model vocabulary)."),
+        )
         parser.add_argument(
-            '--lora-dtype',
+            "--lora-dtype",
             type=str,
             default=EngineArgs.lora_dtype,
-            choices=['auto', 'float16', 'bfloat16', 'float32'],
-            help=('Data type for LoRA. If auto, will default to '
-                  'base model dtype.'))
+            choices=["auto", "float16", "bfloat16", "float32"],
+            help=("Data type for LoRA. If auto, will default to "
+                  "base model dtype."),
+        )
         parser.add_argument(
-            '--max-cpu-loras',
+            "--max-cpu-loras",
             type=int,
             default=EngineArgs.max_cpu_loras,
-            help=('Maximum number of LoRAs to store in CPU memory. '
-                  'Must be >= than max_num_seqs. '
-                  'Defaults to max_num_seqs.'))
-        parser.add_argument('--device',
-                            type=str,
-                            default=EngineArgs.device,
-                            choices=['cuda'],
-                            help=('Device to use for model execution. '
-                                  'Currently, only "cuda" is supported.'))
+            help=("Maximum number of LoRAs to store in CPU memory. "
+                  "Must be >= than max_num_seqs. "
+                  "Defaults to max_num_seqs."),
+        )
+        parser.add_argument(
+            "--device",
+            type=str,
+            default=EngineArgs.device,
+            choices=["cuda"],
+            help=("Device to use for model execution. "
+                  'Currently, only "cuda" is supported.'),
+        )
         return parser

     @classmethod
-    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
+    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
         # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         # Set the attributes from the parsed arguments.
@@ -313,63 +410,99 @@ class EngineArgs:
     def create_engine_configs(
         self,
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
-               DeviceConfig, Optional[LoRAConfig]]:
+               DeviceConfig, Optional[LoRAConfig], ]:
         device_config = DeviceConfig(self.device)
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.tokenizer_mode,
-            self.trust_remote_code, self.download_dir, self.load_format,
-            self.dtype, self.seed, self.revision, self.tokenizer_revision,
-            self.max_model_len, self.quantization, self.load_in_4bit,
-            self.load_in_8bit, self.load_in_smooth, self.enforce_eager,
-            self.max_context_len_to_capture, self.max_log_probs)
-        cache_config = CacheConfig(self.block_size,
-                                   self.gpu_memory_utilization,
-                                   self.swap_space, self.kv_cache_dtype,
-                                   self.kv_quant_params_path,
-                                   model_config.get_sliding_window(),
-                                   self.context_shift)
-        parallel_config = ParallelConfig(self.pipeline_parallel_size,
-                                         self.tensor_parallel_size,
-                                         self.worker_use_ray,
-                                         self.max_parallel_loading_workers,
-                                         self.disable_custom_all_reduce)
-        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
-                                           self.max_num_seqs,
-                                           model_config.max_model_len,
-                                           self.max_paddings)
-        lora_config = LoRAConfig(
+            self.model,
+            self.tokenizer,
+            self.tokenizer_mode,
+            self.trust_remote_code,
+            self.download_dir,
+            self.load_format,
+            self.dtype,
+            self.seed,
+            self.revision,
+            self.code_revision,
+            self.tokenizer_revision,
+            self.max_model_len,
+            self.quantization,
+            self.load_in_4bit,
+            self.load_in_8bit,
+            self.load_in_smooth,
+            self.enforce_eager,
+            self.max_context_len_to_capture,
+            self.max_log_probs,
+        )
+        cache_config = CacheConfig(
+            self.block_size,
+            self.gpu_memory_utilization,
+            self.swap_space,
+            self.kv_cache_dtype,
+            # self.kv_quant_params_path,
+            model_config.get_sliding_window(),
+            self.context_shift,
+        )
+        parallel_config = ParallelConfig(
+            self.pipeline_parallel_size,
+            self.tensor_parallel_size,
+            self.worker_use_ray,
+            self.max_parallel_loading_workers,
+            self.disable_custom_all_reduce,
+            self.ray_workers_use_nsight,
+        )
+        scheduler_config = SchedulerConfig(
+            self.max_num_batched_tokens,
+            self.max_num_seqs,
+            model_config.max_model_len,
+            self.max_paddings,
+        )
+        lora_config = (LoRAConfig(
             max_lora_rank=self.max_lora_rank,
             max_loras=self.max_loras,
             lora_extra_vocab_size=self.lora_extra_vocab_size,
             lora_dtype=self.lora_dtype,
-            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
-            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
-        return (model_config, cache_config, parallel_config, scheduler_config,
-                device_config, lora_config)
+            max_cpu_loras=self.max_cpu_loras
+            if self.max_cpu_loras and self.max_cpu_loras > 0 else None,
+        ) if self.enable_lora else None)
+        return (
+            model_config,
+            cache_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            lora_config,
+        )


 @dataclass
 class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous Aphrodite engine."""
+
     engine_use_ray: bool = False
     disable_log_requests: bool = False
-    max_log_len: Optional[int] = None
+    max_log_len: int = 0

     @staticmethod
     def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser = EngineArgs.add_cli_args(parser)
-        parser.add_argument('--engine-use-ray',
-                            action='store_true',
-                            help='use Ray to start the LLM engine in a '
-                            'separate process as the server process.')
-        parser.add_argument('--disable-log-requests',
-                            action='store_true',
-                            help='disable logging requests')
-        parser.add_argument('--max-log-len',
-                            type=int,
-                            default=None,
-                            help='max number of prompt characters or prompt '
-                            'ID numbers being printed in log. '
-                            'Default: unlimited.')
+        parser.add_argument(
+            "--engine-use-ray",
+            action="store_true",
+            help="use Ray to start the LLM engine in a "
+            "separate process as the server process.",
+        )
+        parser.add_argument(
+            "--disable-log-requests",
+            action="store_true",
+            help="disable logging requests",
+        )
+        parser.add_argument(
+            "--max-log-len",
+            type=int,
+            default=0,
+            help="max number of prompt characters or prompt "
+            "ID numbers being printed in log. "
+            "Default: unlimited.",
+        )
         return parser
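
A minimal usage sketch (not part of this diff) of how the CLI helpers above fit together: add_cli_args registers every flag shown in this file, from_cli_args copies the parsed namespace back into the dataclass, and create_engine_configs expands it into the config tuple. The entry-point function below is an assumption for illustration only.

import argparse

from aphrodite.engine.args_tools import AsyncEngineArgs


def parse_engine_args() -> AsyncEngineArgs:
    parser = argparse.ArgumentParser(description="Aphrodite engine arguments")
    # Registers --dtype, --kv-cache-dtype, --max-model-len, etc. (see above).
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    return AsyncEngineArgs.from_cli_args(args)


engine_args = parse_engine_args()
(model_config, cache_config, parallel_config, scheduler_config,
 device_config, lora_config) = engine_args.create_engine_configs()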

+ 45 - 28
aphrodite/engine/async_aphrodite.py

@@ -15,7 +15,7 @@ from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sampling_params import SamplingParams

 ENGINE_ITERATION_TIMEOUT_S = int(
-    os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", 60))
+    os.environ.get("APHRODITE_ENGINE_ITERATION_TIMEOUT_S", "120"))
 
 
 
 
 class AsyncEngineDeadError(RuntimeError):
 class AsyncEngineDeadError(RuntimeError):
@@ -26,13 +26,18 @@ def _raise_exception_on_finish(
         task: asyncio.Task, error_callback: Callable[[Exception],
                                                      None]) -> None:
     msg = ("Task finished unexpectedly. This should never happen! "
-           "Please open an issue on Github.")
+           "Please open an issue on Github. Include your full error "
+           "log after killing the process with Ctrl+C.")
 
 
     exception = None
     exception = None
     try:
     try:
         task.result()
         task.result()
         # NOTE: This will be thrown if task exits normally (which it should not)
         # NOTE: This will be thrown if task exits normally (which it should not)
         raise AsyncEngineDeadError(msg)
         raise AsyncEngineDeadError(msg)
+    except asyncio.exceptions.CancelledError:
+        pass
+    except KeyboardInterrupt:
+        raise
     except Exception as e:
         exception = e
         logger.error("Engine background task failed", exc_info=e)
@@ -318,6 +323,8 @@ class AsyncAphrodite:
             async frontend will be executed in a separate process as the
             model workers.
         log_requests: Whether to log the requests.
+        max_log_len: Maximum number of prompt characters or prompt ID numbers
+            being printed in log.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
         *args: Arguments for AphroditeEngine.
@@ -331,7 +338,7 @@ class AsyncAphrodite:
                  engine_use_ray: bool,
                  *args,
                  log_requests: bool = True,
-                 max_log_len: Optional[int] = None,
+                 max_log_len: int = 0,
                  start_engine_loop: bool = True,
                  **kwargs) -> None:
         self.worker_use_ray = worker_use_ray
@@ -456,23 +463,27 @@ class AsyncAphrodite:

     async def run_engine_loop(self):
         has_requests_in_progress = False
-        while True:
-            if not has_requests_in_progress:
-                logger.debug("Waiting for new requests...")
-                await self._request_tracker.wait_for_new_requests()
-                logger.debug("Got new requests!")
-
-            # Abort if iteration takes too long due to unrecoverable errors
-            # (eg. NCCL timeouts).
-            try:
-                has_requests_in_progress = await asyncio.wait_for(
-                    self.engine_step(), ENGINE_ITERATION_TIMEOUT_S)
-            except asyncio.TimeoutError as exc:
-                logger.error(
-                    "Engine iteration timed out. This should never happen!")
-                self.set_errored(exc)
-                raise
-            await asyncio.sleep(0)
+        try:
+            while True:
+                if not has_requests_in_progress:
+                    logger.debug("Waiting for new requests...")
+                    await self._request_tracker.wait_for_new_requests()
+                    logger.debug("Got new requests!")
+
+                # Abort if iteration takes too long due to unrecoverable errors
+                # (eg. NCCL timeouts).
+                try:
+                    has_requests_in_progress = await asyncio.wait_for(
+                        self.engine_step(), ENGINE_ITERATION_TIMEOUT_S)
+                except asyncio.TimeoutError as exc:
+                    logger.error(
+                        "Engine iteration timed out. This should never happen!"
+                    )
+                    self.set_errored(exc)
+                    raise
+                await asyncio.sleep(0)
+        except KeyboardInterrupt:
+            logger.info("Engine loop interrupted. Exiting gracefully.")
 
 
     async def add_request(
     async def add_request(
         self,
         self,
@@ -494,8 +505,7 @@ class AsyncAphrodite:
                                                               max_log_len]
             logger.info(f"Received request {request_id}: "
                         f"prompt: {shortened_prompt!r}, "
-                        f"sampling params: {sampling_params}, "
-                        f"prompt token ids: {shortened_token_ids}, "
+                        f"sampling_params: {sampling_params}, "
                         f"lora_request: {lora_request}.")
                         f"lora_request: {lora_request}.")
 
 
         if not self.is_running:
         if not self.is_running:
@@ -510,6 +520,7 @@ class AsyncAphrodite:

         if arrival_time is None:
             arrival_time = time.time()
+
         if self.engine_use_ray:
             prompt_token_ids = await self.engine.encode_request_async.remote(
                 request_id=request_id,
@@ -609,15 +620,21 @@ class AsyncAphrodite:
         arrival_time = time.monotonic()

         try:
-            stream = await self.add_request(request_id,
-                                            prompt,
-                                            sampling_params,
-                                            prompt_token_ids=prompt_token_ids,
-                                            arrival_time=arrival_time,
-                                            lora_request=lora_request)
+            stream = await self.add_request(
+                request_id,
+                prompt,
+                sampling_params,
+                prompt_token_ids=prompt_token_ids,
+                arrival_time=arrival_time,
+                lora_request=lora_request,
+            )

             async for request_output in stream:
                 yield request_output
+        except asyncio.exceptions.CancelledError:
+            logger.info(f"Request {request_id} cancelled.")
+            self._abort(request_id)
+            raise
         except (Exception, asyncio.CancelledError) as e:
             # If there is an exception or coroutine is cancelled, abort the
             # request.
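
A condensed sketch of the watchdog pattern introduced above: each engine step is bounded by asyncio.wait_for, cancellation aborts the in-flight work, and KeyboardInterrupt exits the loop gracefully. The step() and abort() callables here are placeholders, not Aphrodite APIs.

import asyncio

ITERATION_TIMEOUT_S = 120  # mirrors APHRODITE_ENGINE_ITERATION_TIMEOUT_S


async def watchdog_loop(step, abort):
    try:
        while True:
            try:
                # Bound each iteration so unrecoverable hangs surface as errors.
                await asyncio.wait_for(step(), ITERATION_TIMEOUT_S)
            except asyncio.TimeoutError:
                abort()
                raise
            await asyncio.sleep(0)  # yield to other tasks between iterations
    except asyncio.CancelledError:
        abort()  # explicitly abort when the caller cancels us
        raise
    except KeyboardInterrupt:
        pass  # exit gracefully, as run_engine_loop does above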

+ 79 - 21
aphrodite/engine/metrics.py

@@ -1,11 +1,18 @@
+from loguru import logger
+from prometheus_client import (
+    Counter,
+    Gauge,
+    Histogram,
+    Info,
+    REGISTRY,
+    disable_created_metrics,
+)
+
 import time
 import numpy as np
 from typing import Dict, List
 from dataclasses import dataclass

-from prometheus_client import Counter, Gauge, Histogram, disable_created_metrics
-from loguru import logger
-
 disable_created_metrics()

 # The begin-* and end* here are used by the documentation generator
@@ -16,58 +23,104 @@ disable_created_metrics()
 class Metrics:

     def __init__(self, labelnames: List[str]):
+        # Unregister any existing Aphrodite collectors
+        for collector in list(REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "aphrodite" in collector._name:
+                REGISTRY.unregister(collector)
+
+        # Config Information
+        self.info_cache_config = Info(
+            name="aphrodite:cache_config",
+            documentation="information of cache_config",
+        )
+
         # System stats
         self.gauge_scheduler_running = Gauge(
             name="aphrodite:num_requests_running",
             documentation="Number of requests currently running on GPU.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.gauge_scheduler_swapped = Gauge(
             name="aphrodite:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.gauge_scheduler_waiting = Gauge(
             name="aphrodite:num_requests_waiting",
             documentation="Number of requests waiting to be processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.gauge_gpu_cache_usage = Gauge(
             name="aphrodite:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.gauge_cpu_cache_usage = Gauge(
             name="aphrodite:cpu_cache_usage_perc",
             documentation="CPU KV-cache usage. 1 means 100 percent usage.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )

         # Raw stats from last model iteration
         self.counter_prompt_tokens = Counter(
             name="aphrodite:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_generation_tokens = Counter(
             name="aphrodite:generation_tokens_total",
             documentation="Number of generation tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_time_to_first_token = Histogram(
             name="aphrodite:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
-            ])
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+            ],
+        )
         self.histogram_time_per_output_token = Histogram(
             name="aphrodite:time_per_output_token_seconds",
             documentation="Histogram of time per output token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5
-            ])
+                0.01,
+                0.025,
+                0.05,
+                0.075,
+                0.1,
+                0.15,
+                0.2,
+                0.3,
+                0.4,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+            ],
+        )
         self.histogram_e2e_request_latency = Histogram(
             name="aphrodite:e2e_request_latency_seconds",
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
-            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+        )

         # Legacy metrics
         self.gauge_avg_prompt_throughput = Gauge(
@@ -88,6 +141,7 @@ class Metrics:
 @dataclass
 class Stats:
     """Created by AphroditeEngine for use by StatLogger."""
+
     now: float

     # System stats.
@@ -121,6 +175,10 @@ class StatLogger:
         self.labels = labels
         self.metrics = Metrics(labelnames=list(labels.keys()))

+    def info(self, type: str, obj: object) -> None:
+        if type == "cache_config":
+            self.metrics.info_cache_config.info(obj.metrics_info())
+
     def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
         return float(np.sum(tracked_stats) / (now - self.last_local_log))

@@ -174,8 +232,8 @@

     def log(self, stats: Stats) -> None:
         """Called by AphroditeEngine.
-           Logs to prometheus and tracked stats every iteration.
-           Logs to Stdout every self.local_interval seconds."""
+        Logs to prometheus and tracked stats every iteration.
+        Logs to Stdout every self.local_interval seconds."""

         # Log to prometheus.
         self._log_prometheus(stats)
@@ -186,7 +244,6 @@

         # Log locally every local_interval seconds.
         if self._local_interval_elapsed(stats.now):
-
             # Compute summary metrics for tracked stats (and log them to
             # prometheus if applicable).
             prompt_throughput = self._get_throughput(self.num_prompt_tokens,
@@ -195,7 +252,8 @@ class StatLogger:
                 self.num_generation_tokens, now=stats.now)
             self._log_prometheus_interval(
                 prompt_throughput=prompt_throughput,
-                generation_throughput=generation_throughput)
+                generation_throughput=generation_throughput,
+            )

             # Log to stdout.
             logger.info(
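
A small standalone sketch of the prometheus_client pattern used by the Metrics class above: labelled Gauge/Counter series plus an Info metric for static configuration. The metric names and label values below are examples, not the series registered by Aphrodite.

from prometheus_client import Counter, Gauge, Info, disable_created_metrics

disable_created_metrics()  # drop the *_created series, as done above

labels = {"model_name": "example-model"}
info_cache = Info("example:cache_config", "information of cache_config")
gauge_running = Gauge("example:num_requests_running",
                      "Number of requests currently running.",
                      labelnames=list(labels.keys()))
counter_tokens = Counter("example:generation_tokens_total",
                         "Number of generation tokens processed.",
                         labelnames=list(labels.keys()))

info_cache.info({"block_size": "16", "cache_dtype": "auto"})
gauge_running.labels(**labels).set(3)
counter_tokens.labels(**labels).inc(128)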

+ 21 - 5
aphrodite/engine/ray_tools.py

@@ -1,3 +1,5 @@
+import pickle
+
 from typing import Optional, List, Tuple, TYPE_CHECKING
 from loguru import logger

@@ -13,10 +15,14 @@ try:

         def __init__(self, init_cached_hf_modules=False) -> None:
             if init_cached_hf_modules:
-                # pylint: disable=import-outside-toplevel
                 from transformers.dynamic_module_utils import init_hf_modules
                 init_hf_modules()
             self.worker = None
+            # The compiled DAG runs the main execution on a separate
+            # thread, which calls cuda.set_device. This flag records
+            # whether set_device has already been called on that
+            # thread.
+            self.compiled_dag_cuda_device_set = False

         def init_worker(self, worker_init_fn):
             self.worker = worker_init_fn()
@@ -39,6 +45,17 @@ try:
         def set_cuda_visible_devices(self, device_ids) -> None:
             set_cuda_visible_devices(device_ids)

+        def execute_model_compiled_dag_remote(self, ignored):
+            """Used only when compiled DAG is enabled."""
+            import torch
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+
+            output = self.worker.execute_model()
+            output = pickle.dumps(output)
+            return output
+
 except ImportError as e:
     logger.warning(f"Failed to import Ray with {e!r}. "
                    "For distributed inference, please install Ray with "
@@ -64,10 +81,9 @@ def initialize_cluster(
             the default Ray cluster address.

     Returns:
-        A tuple of (`distributed_init_method`, `placement_group`). The
-        `distributed_init_method` is the address for initializing the
-        distributed backend. `placement_group` includes the specification
-        of the resources for each distributed worker.
+        An optional `PlacementGroup`. It includes the specification
+        of the resources for each distributed worker. None if Ray is
+        not used.
     """
     """
     if parallel_config.worker_use_ray or engine_use_ray:
     if parallel_config.worker_use_ray or engine_use_ray:
         if ray is None:
         if ray is None:

+ 0 - 0
aphrodite/modeling/layers/triton_kernel/__init__.py → aphrodite/executor/__init__.py


+ 76 - 0
aphrodite/executor/executor_base.py

@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+
+from aphrodite.common.config import (CacheConfig, DeviceConfig, ModelConfig,
+                                     ParallelConfig, SchedulerConfig,
+                                     LoRAConfig)
+from aphrodite.lora.request import LoRARequest
+from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+
+
+class ExecutorBase(ABC):
+    """Base class for all executors.
+
+    An executor is responsible for executing the model on a specific device
+    type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
+    that can execute the model on multiple devices.
+    """
+
+    @abstractmethod
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+    ) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def execute_model(self,
+                      seq_group_metadata_list: List[SequenceGroupMetadata],
+                      blocks_to_swap_in: Dict[int, int],
+                      blocks_to_swap_out: Dict[int, int],
+                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+        """Executes one model step on the given sequences."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def list_loras(self) -> List[int]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def check_health(self) -> None:
+        """Checks if the executor is healthy. If not, it should raise an
+        exception."""
+        raise NotImplementedError
+
+
+class ExecutorAsyncBase(ExecutorBase):
+
+    @abstractmethod
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        """Executes one model step on the given sequences."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def check_health_async(self) -> None:
+        """Checks if the executor is healthy. If not, it should raise an
+        exception."""
+        raise NotImplementedError

+ 153 - 0
aphrodite/executor/gpu_executor.py

@@ -0,0 +1,153 @@
+from typing import Dict, List, Optional
+
+from loguru import logger
+
+from aphrodite.lora.request import LoRARequest
+from aphrodite.common.config import (CacheConfig, DeviceConfig, ModelConfig,
+                                     ParallelConfig, SchedulerConfig,
+                                     LoRAConfig)
+from aphrodite.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from aphrodite.executor.utils import check_block_size_valid
+from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+from aphrodite.common.utils import (get_ip, get_open_port,
+                                    get_distributed_init_method, make_async)
+
+
+class GPUExecutor(ExecutorBase):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+    ) -> None:
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+
+        # Instantiate the worker and load the model to GPU.
+        self._init_worker()
+
+        # Profile the memory usage and initialize the cache.
+        self._init_cache()
+
+    def _init_worker(self):
+        # Lazy import the Worker to avoid importing torch.cuda/xformers
+        # before CUDA_VISIBLE_DEVICES is set in the Worker
+        from aphrodite.task_handler.worker import Worker
+
+        assert self.parallel_config.world_size == 1, (
+            "GPUExecutor only supports single GPU.")
+
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        self.driver_worker = Worker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+            lora_config=self.lora_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
+            is_driver_worker=True,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def _init_cache(self) -> None:
+        """Profiles the memory usage and initializes the KV cache.
+
+        The engine first profiles the existing memory usage.
+        Then, it allocates the remaining memory for KV blocks.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Get the maximum number of blocks that can be allocated on GPU and CPU.
+        num_gpu_blocks, num_cpu_blocks = (
+            self.driver_worker.profile_num_available_blocks(
+                block_size=self.cache_config.block_size,
+                gpu_memory_utilization=self.cache_config.
+                gpu_memory_utilization,
+                cpu_swap_space=self.cache_config.swap_space_bytes,
+                cache_dtype=self.cache_config.cache_dtype,
+            ))
+
+        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
+                    f"# CPU blocks: {num_cpu_blocks}")
+
+        logger.info(
+            f"Minimum concurrency: {num_gpu_blocks * self.cache_config.block_size / self.scheduler_config.max_model_len:.2f}x"  # noqa: E501
+        )
+
+        check_block_size_valid(num_gpu_blocks, self.cache_config.block_size,
+                               self.model_config.max_model_len)
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        # Initialize the cache.
+        self.driver_worker.init_cache_engine(cache_config=self.cache_config)
+        # Warm up the model. This includes capturing the model into CUDA graph
+        # if enforce_eager is False.
+        self.driver_worker.warm_up_model()
+
+    def execute_model(self,
+                      seq_group_metadata_list: List[SequenceGroupMetadata],
+                      blocks_to_swap_in: Dict[int, int],
+                      blocks_to_swap_out: Dict[int, int],
+                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+        output = self.driver_worker.execute_model(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+        )
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.remove_lora(lora_id)
+
+    def list_loras(self) -> List[int]:
+        return self.driver_worker.list_loras()
+
+    def check_health(self) -> None:
+        # GPUExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
+class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy)
+        return output
+
+    async def check_health_async(self) -> None:
+        # GPUExecutor will always be healthy as long as
+        # it's running.
+        return

+ 78 - 0
aphrodite/executor/neuron_executor.py

@@ -0,0 +1,78 @@
+from typing import Dict, List, Optional
+
+from aphrodite.lora.request import LoRARequest
+from aphrodite.common.config import (CacheConfig, DeviceConfig, ModelConfig,
+                                     ParallelConfig, SchedulerConfig,
+                                     LoRAConfig)
+from aphrodite.executor.executor_base import ExecutorBase
+from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+
+
+class NeuronExecutor(ExecutorBase):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+    ) -> None:
+        self.model_config = model_config
+        self.cache_config = cache_config
+        assert lora_config is None, "LoRA is not supported for Neuron backend."
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+
+        # Set the number of GPU blocks to be the same as the maximum number of
+        # sequences that can be processed in a single batch. This is equivalent
+        # to schedule without PagedAttention.
+        self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs
+        self.cache_config.num_cpu_blocks = 0
+
+        # Instantiate the worker and load the model to the device.
+        self._init_worker()
+
+    def _init_worker(self):
+        from aphrodite.task_handler.neuron_worker import NeuronWorker
+
+        self.driver_worker = NeuronWorker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def execute_model(self,
+                      seq_group_metadata_list: List[SequenceGroupMetadata],
+                      blocks_to_swap_in: Dict[int, int],
+                      blocks_to_swap_out: Dict[int, int],
+                      blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+        assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
+                and blocks_to_copy == {}), (
+                    "Cache operations are not supported for Neuron backend.")
+
+        output = self.driver_worker.execute_model(
+            seq_group_metadata_list=seq_group_metadata_list)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError(
+            "LoRA is not implemented for neuron backend.")
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRA is not implemented for neuron backend.")
+
+    def list_loras(self) -> List[int]:
+        raise NotImplementedError(
+            "LoRA is not implemented for neuron backend.")
+
+    def check_health(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return
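
How the engine picks between these executors is decided in aphrodite/engine/aphrodite_engine.py, which is not shown in this section; a simplified illustration of that dispatch (an assumption, not the exact upstream logic) is:

def select_executor_class(device: str, worker_use_ray: bool):
    # Neuron devices get the cache-less executor; Ray handles multi-GPU.
    if device == "neuron":
        from aphrodite.executor.neuron_executor import NeuronExecutor
        return NeuronExecutor
    if worker_use_ray:
        from aphrodite.executor.ray_gpu_executor import RayGPUExecutor
        return RayGPUExecutor
    from aphrodite.executor.gpu_executor import GPUExecutor
    return GPUExecutor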

+ 452 - 0
aphrodite/executor/ray_gpu_executor.py

@@ -0,0 +1,452 @@
+import asyncio
+import copy
+from collections import defaultdict
+import os
+import pickle
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from loguru import logger
+
+from aphrodite.common.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    LoRAConfig,
+)
+from aphrodite.engine.ray_tools import RayWorkerAphrodite, ray
+from aphrodite.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from aphrodite.executor.utils import check_block_size_valid
+from aphrodite.lora.request import LoRARequest
+from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+from aphrodite.common.utils import (
+    set_cuda_visible_devices,
+    get_ip,
+    get_open_port,
+    get_distributed_init_method,
+    make_async,
+)
+
+if ray is not None:
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+# If the env var is set to 1, Aphrodite uses Ray's compiled DAG API,
+# which reduces the control-plane overhead.
+# Run Aphrodite with APHRODITE_USE_RAY_COMPILED_DAG=1 to enable it.
+USE_RAY_COMPILED_DAG = os.getenv("APHRODITE_USE_RAY_COMPILED_DAG", "0") == "1"
+
+
+class RayGPUExecutor(ExecutorBase):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+    ) -> None:
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+
+        assert self.parallel_config.worker_use_ray
+        placement_group = self.parallel_config.placement_group
+
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+
+        # Create the parallel GPU workers.
+        self._init_workers_ray(placement_group)
+
+        # Profile the memory usage and initialize the cache.
+        self._init_cache()
+
+        self.forward_dag = None
+        if USE_RAY_COMPILED_DAG:
+            self.forward_dag = self._compiled_ray_dag()
+
+    def _init_workers_ray(self, placement_group: "PlacementGroup",
+                          **ray_remote_kwargs):
+        if self.parallel_config.tensor_parallel_size == 1:
+            # For single GPU case, we use a ray worker with constrained memory.
+            num_gpus = self.cache_config.gpu_memory_utilization
+        else:
+            # Otherwise, the ray workers are allocated with a full GPU.
+            num_gpus = 1
+
+        # The driver dummy worker does not actually use any resources.
+        # It holds the resource for the driver worker.
+        self.driver_dummy_worker: Optional[RayWorkerAphrodite] = None
+        # The remaining workers are the actual ray actors.
+        self.workers: List[RayWorkerAphrodite] = []
+
+        # Create the workers.
+        driver_ip = get_ip()
+        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+            if not bundle.get("GPU", 0):
+                continue
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=num_gpus,
+                scheduling_strategy=scheduling_strategy,
+                **ray_remote_kwargs,
+            )(RayWorkerAphrodite).remote(self.model_config.trust_remote_code)
+
+            worker_ip = ray.get(worker.get_node_ip.remote())
+            if worker_ip == driver_ip and self.driver_dummy_worker is None:
+                # If the worker is on the same node as the driver, we use it
+                # as the resource holder for the driver process.
+                self.driver_dummy_worker = worker
+            else:
+                # Otherwise, add it to the list of workers.
+                self.workers.append(worker)
+
+        if self.driver_dummy_worker is None:
+            raise ValueError(
+                "Ray does not allocate any GPUs on the driver node. Consider "
+                "adjusting the Ray placement group or running the driver on a "
+                "GPU node.")
+
+        # Get the set of GPU IDs used on each node.
+        driver_node_id, driver_gpu_ids = ray.get(
+            self.driver_dummy_worker.get_node_and_gpu_ids.remote())
+        worker_node_and_gpu_ids = ray.get(
+            [worker.get_node_and_gpu_ids.remote() for worker in self.workers])
+
+        node_workers = defaultdict(list)
+        node_gpus = defaultdict(list)
+
+        node_workers[driver_node_id].append(0)
+        node_gpus[driver_node_id].extend(driver_gpu_ids)
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids,
+                                               start=1):
+            node_workers[node_id].append(i)
+            node_gpus[node_id].extend(gpu_ids)
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+
+        # Set CUDA_VISIBLE_DEVICES for the driver and workers.
+        set_cuda_visible_devices(node_gpus[driver_node_id])
+        for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids):
+            worker.set_cuda_visible_devices.remote(node_gpus[node_id])
+
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
+
+        # Lazy import the Worker to avoid importing torch.cuda/xformers
+        # before CUDA_VISIBLE_DEVICES is set in the Worker
+        from aphrodite.task_handler.worker import Worker
+
+        model_config = copy.deepcopy(self.model_config)
+        parallel_config = copy.deepcopy(self.parallel_config)
+        scheduler_config = copy.deepcopy(self.scheduler_config)
+        device_config = copy.deepcopy(self.device_config)
+        lora_config = copy.deepcopy(self.lora_config)
+        kv_cache_dtype = self.cache_config.cache_dtype
+
+        # Initialize the actual workers with the Worker class.
+        for rank, (worker, (node_id, _)) in enumerate(
+                zip(self.workers, worker_node_and_gpu_ids),
+                start=1,
+        ):
+            local_rank = node_workers[node_id].index(rank)
+            worker.init_worker.remote(
+                lambda rank=rank, local_rank=local_rank: Worker(
+                    model_config,
+                    parallel_config,
+                    scheduler_config,
+                    device_config,
+                    local_rank,
+                    rank,
+                    distributed_init_method,
+                    lora_config=lora_config,
+                    kv_cache_dtype=kv_cache_dtype,
+                ))
+
+        # Initialize the driver worker with the Worker class.
+        driver_rank = 0
+        driver_local_rank = node_workers[driver_node_id].index(driver_rank)
+        self.driver_worker = Worker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
+            driver_local_rank,
+            driver_rank,
+            distributed_init_method,
+            lora_config=self.lora_config,
+            kv_cache_dtype=kv_cache_dtype,
+            is_driver_worker=True,
+        )
+
+        # FIXME(woosuk): We are not properly initializing cupy NCCL when
+        # we have multiple nodes.
+        self._run_workers(
+            "init_device",
+            cupy_port=get_open_port()
+            if not model_config.enforce_eager else None,
+        )
+        self._run_workers(
+            "load_model",
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )
+
+    def _init_cache(self) -> None:
+        """Profiles the memory usage and initializes the KV cache.
+
+        The engine will first profile the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        More details can be found in the
+        :meth:`~aphrodite.task_handler.worker.Worker.profile_num_available_blocks` method
+        from class :class:`~aphrodite.task_handler.Worker`.
+
+        Afterwards, as there may be multiple workers,
+        we take the minimum number of blocks across all workers
+        to ensure this can be applied to all of them.
+
+        Finally, the engine will initialize the KV cache
+        with the calculated number of blocks.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """  # noqa: E501
+        # Get the maximum number of blocks that can be allocated on GPU and CPU.
+        num_blocks = self._run_workers(
+            "profile_num_available_blocks",
+            block_size=self.cache_config.block_size,
+            gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
+            cpu_swap_space=self.cache_config.swap_space_bytes,
+            cache_dtype=self.cache_config.cache_dtype,
+        )
+
+        # Since we use a shared centralized controller, we take the minimum
+        # number of blocks across all workers to make sure all the memory
+        # operators can be applied to all workers.
+        num_gpu_blocks = min(b[0] for b in num_blocks)
+        num_cpu_blocks = min(b[1] for b in num_blocks)
+        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
+                    f"# CPU blocks: {num_cpu_blocks}")
+
+        logger.info(
+            f"Minimum concurrency: {num_gpu_blocks * self.cache_config.block_size / self.scheduler_config.max_model_len:.2f}x"  # noqa: E501
+        )
+
+        check_block_size_valid(
+            num_gpu_blocks,
+            self.cache_config.block_size,
+            self.model_config.max_model_len,
+        )
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        # Initialize the cache.
+        self._run_workers("init_cache_engine", cache_config=self.cache_config)
+        # Warm up the model. This includes capturing the model into CUDA graph
+        # if enforce_eager is False.
+        self._run_workers("warm_up_model")
+
+    def execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        all_outputs = self._run_workers(
+            "execute_model",
+            driver_kwargs={
+                "seq_group_metadata_list": seq_group_metadata_list,
+                "blocks_to_swap_in": blocks_to_swap_in,
+                "blocks_to_swap_out": blocks_to_swap_out,
+                "blocks_to_copy": blocks_to_copy,
+            },
+            use_ray_compiled_dag=USE_RAY_COMPILED_DAG,
+        )
+
+        # Only the driver worker returns the sampling results.
+        output = all_outputs[0]
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "add_lora",
+            lora_request=lora_request,
+        )
+
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "remove_lora",
+            lora_id=lora_id,
+        )
+
+    def list_loras(self) -> List[int]:
+        return self._run_workers("list_loras")
+
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        driver_args: Optional[List[Any]] = None,
+        driver_kwargs: Optional[Dict[str, Any]] = None,
+        max_concurrent_workers: Optional[int] = None,
+        use_ray_compiled_dag: bool = False,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers."""
+
+        if max_concurrent_workers:
+            raise NotImplementedError(
+                "max_concurrent_workers is not supported yet.")
+
+        if use_ray_compiled_dag:
+            # Right now, compiled DAG can only accept a single
+            # input. TODO(sang): Fix it.
+            output_channels = self.forward_dag.execute(1)
+        else:
+            # Start the ray workers first.
+            ray_worker_outputs = [
+                worker.execute_method.remote(method, *args, **kwargs)
+                for worker in self.workers
+            ]
+
+        if driver_args is None:
+            driver_args = args
+        if driver_kwargs is None:
+            driver_kwargs = kwargs
+
+        # Start the driver worker after all the ray workers.
+        driver_worker_output = getattr(self.driver_worker,
+                                       method)(*driver_args, **driver_kwargs)
+
+        # Get the results of the ray workers.
+        if self.workers:
+            if use_ray_compiled_dag:
+                try:
+                    ray_worker_outputs = [
+                        pickle.loads(chan.begin_read())
+                        for chan in output_channels
+                    ]
+                finally:
+                    # Has to call end_read in order to reuse the DAG.
+                    for chan in output_channels:
+                        chan.end_read()
+            else:
+                ray_worker_outputs = ray.get(ray_worker_outputs)
+
+        return [driver_worker_output] + ray_worker_outputs
+
+    def _compiled_ray_dag(self):
+        import pkg_resources
+
+        required_version = "2.9"
+        current_version = pkg_resources.get_distribution("ray").version
+        # Compare parsed versions rather than raw strings so that, e.g.,
+        # "2.10" is not treated as older than "2.9".
+        if pkg_resources.parse_version(
+                current_version) < pkg_resources.parse_version(
+                    required_version):
+            raise ValueError(f"Ray version {required_version} or greater is "
+                             f"required, but found {current_version}")
+
+        from ray.dag import MultiOutputNode, InputNode
+
+        assert self.parallel_config.worker_use_ray
+
+        # Right now, compiled DAG requires at least 1 arg. We send
+        # a dummy value for now. It will be fixed soon.
+        with InputNode() as input_data:
+            forward_dag = MultiOutputNode([
+                worker.execute_model_compiled_dag_remote.bind(input_data)
+                for worker in self.workers
+            ])
+        return forward_dag.experimental_compile()
+
+    def check_health(self) -> None:
+        """Raises an error if engine is unhealthy."""
+        self._check_if_any_actor_is_dead()
+
+    def _check_if_any_actor_is_dead(self):
+        if not self.workers:
+            return
+
+        dead_actors = []
+        for actor in self.workers:
+            actor_state = ray.state.actors(actor._ray_actor_id.hex())  # pylint: disable=protected-access
+            if actor_state["State"] == "DEAD":
+                dead_actors.append(actor)
+        if dead_actors:
+            raise RuntimeError("At least one Worker is dead. "
+                               f"Dead Workers: {dead_actors}. ")
+
+
+class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase):
+
+    async def _run_workers_async(
+        self,
+        method: str,
+        *args,
+        driver_args: Optional[List[Any]] = None,
+        driver_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers."""
+        coros = []
+
+        if driver_args is None:
+            driver_args = args
+        if driver_kwargs is None:
+            driver_kwargs = kwargs
+
+        # Run the driver worker asynchronously.
+        driver_executor = make_async(getattr(self.driver_worker, method))
+        coros.append(driver_executor(*driver_args, **driver_kwargs))
+
+        # Run the ray workers asynchronously.
+        for worker in self.workers:
+            coros.append(worker.execute_method.remote(method, *args, **kwargs))
+
+        all_outputs = await asyncio.gather(*coros)
+        return all_outputs
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        all_outputs = await self._run_workers_async(
+            "execute_model",
+            driver_kwargs={
+                "seq_group_metadata_list": seq_group_metadata_list,
+                "blocks_to_swap_in": blocks_to_swap_in,
+                "blocks_to_swap_out": blocks_to_swap_out,
+                "blocks_to_copy": blocks_to_copy,
+            },
+        )
+
+        # Only the driver worker returns the sampling results.
+        output = all_outputs[0]
+        return output
+
+    async def check_health_async(self) -> None:
+        """Raises an error if engine is unhealthy."""
+        self._check_if_any_actor_is_dead()

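`_run_workers` fans a method out to the Ray actors, runs the same method on the in-process driver worker, and returns the driver's output first, which is why `execute_model` reads `all_outputs[0]`. A Ray-free sketch of that fan-out, with plain objects standing in for the remote actors (all names here are illustrative):

from typing import Any, Dict, List


def run_workers(driver: Any, workers: List[Any], method: str,
                *args: Any, **kwargs: Any) -> List[Any]:
    # In the real executor these are worker.execute_method.remote(...) futures
    # that run concurrently; here they are plain synchronous calls.
    worker_results = [getattr(w, method)(*args, **kwargs) for w in workers]
    # The driver worker runs the method in-process.
    driver_result = getattr(driver, method)(*args, **kwargs)
    # Driver output always comes first.
    return [driver_result] + worker_results


class Echo:
    def __init__(self, name: str) -> None:
        self.name = name

    def execute_model(self, batch: Dict[str, int]) -> str:
        return f"{self.name}:{batch['size']}"


outs = run_workers(Echo("driver"), [Echo("w1"), Echo("w2")],
                   "execute_model", {"size": 8})
print(outs[0])  # 'driver:8' -- only the driver's result is consumed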
+ 13 - 0
aphrodite/executor/utils.py

@@ -0,0 +1,13 @@
+def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None:
+    if num_gpu_blocks <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+    max_seq_len = block_size * num_gpu_blocks
+    if max_model_len > max_seq_len:
+        raise ValueError(
+            f"The model's max seq len ({max_model_len}) "
+            "is larger than the maximum number of tokens that can be "
+            f"stored in KV cache ({max_seq_len}). Try increasing "
+            "`gpu_memory_utilization` or decreasing `max_model_len` when "
+            "initializing the engine.")

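The check above compares the model's maximum sequence length with the total token capacity of the KV cache, which is simply `block_size * num_gpu_blocks`. A worked example with illustrative numbers:

# With 16-token blocks and 1,024 free GPU blocks the cache holds 16,384
# tokens, so a max_model_len of 32,768 would trigger the ValueError above.
def kv_cache_capacity(num_gpu_blocks: int, block_size: int) -> int:
    return num_gpu_blocks * block_size


print(kv_cache_capacity(1024, 16))            # 16384
print(kv_cache_capacity(1024, 16) >= 32768)   # False -> rejected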
+ 4 - 0
aphrodite/lora/layers.py

@@ -824,6 +824,10 @@ class SamplerWithLoRA(BaseLayerWithLoRA):
         self.dtype = dtype
         self.device = device

+    @property
+    def logits_as_hidden_states(self):
+        return self.base_layer.logits_as_hidden_states
+
     @property
     def vocab_size(self):
         return self.base_layer.vocab_size

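The added property mirrors the existing `vocab_size` pass-through: the LoRA wrapper exposes read-only attributes of the wrapped sampler so callers never need to know which object they hold. A minimal sketch of the pattern with made-up class names:

class BaseSampler:
    logits_as_hidden_states = True
    vocab_size = 32000


class SamplerWrapper:
    def __init__(self, base_layer: BaseSampler) -> None:
        self.base_layer = base_layer

    @property
    def logits_as_hidden_states(self) -> bool:
        return self.base_layer.logits_as_hidden_states

    @property
    def vocab_size(self) -> int:
        return self.base_layer.vocab_size


print(SamplerWrapper(BaseSampler()).logits_as_hidden_states)  # True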
+ 8 - 3
aphrodite/modeling/hf_downloader.py

@@ -21,15 +21,19 @@ from aphrodite.common.gguf import GGUFReader
 from aphrodite.modeling.layers.quantization import (get_quantization_config,
                                                     QuantizationConfig)

+_xdg_cache_home = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache'))
+_aphrodite_filelocks_path = os.path.join(_xdg_cache_home, 'aphrodite/locks/')
-class Disabledtqdm(tqdm):  # pylint: disable=inconsistent-mro
+
+class Disabledtqdm(tqdm):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs, disable=True)


 def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
-    lock_dir = cache_dir if cache_dir is not None else "/tmp"
+    lock_dir = cache_dir if cache_dir is not None else _aphrodite_filelocks_path
+    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
     lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
     return lock
@@ -164,7 +168,7 @@ def prepare_hf_model_weights(
                 allow_patterns = [pattern]
                 break

-        logger.info(f"Downloading model weights {allow_patterns}")
+        logger.info(f"Using model weights format {allow_patterns}")
         # Use file lock to prevent multiple processes from
         # downloading the same model weights at the same time.
         with get_lock(model_name_or_path, cache_dir):
@@ -192,6 +196,7 @@ def prepare_hf_model_weights(
             "scheduler.pt",
             "scaler.pt",
             "trainer_state.json",
+            "hidden_states.safetensors",  # exllamav2
         ]
         hf_weights_files = [
             f for f in hf_weights_files

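The change above moves the weight-download locks from `/tmp` into a per-user XDG cache directory so that concurrent processes on the same host serialize their downloads without colliding with other users. A stand-alone sketch of the same idea, assuming only the `filelock` package (the model name and exact paths are illustrative):

import os

import filelock

cache_home = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
lock_dir = os.path.join(cache_home, "aphrodite", "locks")
os.makedirs(lock_dir, exist_ok=True)


def model_lock(model_name_or_path: str) -> filelock.FileLock:
    # One lock file per model, slashes flattened as in the diff above.
    lock_file = model_name_or_path.replace("/", "-") + ".lock"
    return filelock.FileLock(os.path.join(lock_dir, lock_file))


with model_lock("meta-llama/Llama-2-7b-hf"):
    pass  # download or convert the weights exactly once per host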
+ 0 - 354
aphrodite/modeling/layers/attention.py

@@ -1,354 +0,0 @@
-"""Multi-head attention."""
-from typing import List, Optional
-
-import importlib
-import torch
-import torch.nn as nn
-from xformers import ops as xops
-from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
-                                         LowerTriangularMaskWithTensorBias)
-
-from aphrodite._C import ops
-from aphrodite._C import cache_ops
-from aphrodite.modeling.metadata import InputMetadata
-from aphrodite.modeling.layers.triton_kernel.prefix_prefill import (
-    context_attention_fwd)
-from aphrodite.common.utils import is_hip
-
-_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
-# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
-_PARTITION_SIZE = 512
-
-
-class PagedAttention(nn.Module):
-    """MHA/MQA/GQA layer with PagedAttention.
-
-    This class takes query, key, and value tensors as input. The input tensors
-    can either contain prompt tokens or generation tokens.
-    The class does the following:
-
-    1. Reshape and store the input key and value tensors in the KV cache.
-    2. Perform (multi-head/multi-query/grouped-query) attention using either
-        xformers or the PagedAttention custom op.
-    3. Return the output tensor.
-    """
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[List[float]] = None,
-        sliding_window: Optional[int] = None,
-    ) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
-        self.sliding_window = sliding_window
-        if alibi_slopes is not None:
-            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
-        self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
-
-        assert self.num_heads % self.num_kv_heads == 0
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-
-        if self.head_size not in _SUPPORTED_HEAD_SIZES:
-            raise ValueError(f"head_size ({self.head_size}) is not supported. "
-                             f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")
-
-        self.use_ref_attention = self.check_use_ref_attention()
-
-    def check_use_ref_attention(self) -> bool:
-        if not is_hip():
-            return False
-        # For ROCm, check whether flash attention is installed or not.
-        # if not, use_ref_attention needs to be True
-        return importlib.util.find_spec("flash_attn") is None
-
-    def ref_masked_attention(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-    ) -> torch.Tensor:
-        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-
-        seq_len, _, _ = query.shape
-        attn_mask = torch.triu(torch.ones(seq_len,
-                                          seq_len,
-                                          dtype=query.dtype,
-                                          device=query.device),
-                               diagonal=1)
-        attn_mask = attn_mask * torch.finfo(query.dtype).min
-
-        attn_weights = self.scale * torch.einsum("qhd,khd->hqk", query,
-                                                 key).float()
-        attn_weights = attn_weights + attn_mask.float()
-        attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
-        out = torch.einsum("hqk,khd->qhd", attn_weights, value)
-        return out
-
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: Optional[torch.Tensor],
-        value_cache: Optional[torch.Tensor],
-        input_metadata: InputMetadata,
-        kv_quant_param: List[float] = None,
-    ) -> torch.Tensor:
-        """PagedAttention forward pass.
-
-        Args:
-            query: shape = [batch_size, seq_len, num_heads * head_size]
-            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
-                block_size, x]
-            value_cache: shape = [num_blocks, num_kv_heads, head_size,
-                block_size]
-            input_metadata: metadata for the inputs.
-        Returns:
-            shape = [batch_size, seq_len, num_heads * head_size]
-        """
-        batch_size, seq_len, hidden_size = query.shape
-        # Reshape the query, key, and value tensors.
-        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-        # FIXME: Remove this when all models support int8 kv cache
-        kv_quant_param = [1.0, 0.0, 1.0, 0.0
-                          ] if kv_quant_param is None else kv_quant_param
-
-        # Reshape the keys and values and store them in the cache.
-        # If key_cache and value_cache are not provided, the new key and value
-        # vectors will not be cached. This happens during the initial memory
-        # profiling run.
-        if key_cache is not None and value_cache is not None:
-            cache_ops.reshape_and_cache(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                input_metadata.slot_mapping.flatten(),
-                input_metadata.kv_cache_dtype,
-                *kv_quant_param,
-            )
-
-        if input_metadata.is_prompt:
-            # Prompt run.
-            if self.num_kv_heads != self.num_heads:
-                # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
-                # project the key and value tensors to the desired number of
-                # heads.
-                # TODO: Use MQA/GQA kernels for higher performance.
-                query = query.view(query.shape[0], self.num_kv_heads,
-                                   self.num_queries_per_kv, query.shape[-1])
-                key = key[:, :,
-                          None, :].expand(key.shape[0], self.num_kv_heads,
-                                          self.num_queries_per_kv,
-                                          key.shape[-1])
-                value = value[:, :, None, :].expand(value.shape[0],
-                                                    self.num_kv_heads,
-                                                    self.num_queries_per_kv,
-                                                    value.shape[-1])
-            # normal attention
-            if (key_cache is None or value_cache is None
-                    or input_metadata.block_tables.numel() == 0):
-                # Set attention bias if not provided. This typically happens at
-                # the very attention layer of every iteration.
-                # FIXME: This is a hack.
-                if input_metadata.attn_bias is None:
-                    if self.alibi_slopes is None:
-                        attn_bias = BlockDiagonalCausalMask.from_seqlens(
-                            [seq_len] * batch_size)
-                        if self.sliding_window is not None:
-                            attn_bias = attn_bias.make_local_attention(
-                                self.sliding_window)
-                        input_metadata.attn_bias = attn_bias
-                    else:
-                        input_metadata.attn_bias = _make_alibi_bias(
-                            self.alibi_slopes, self.num_kv_heads, batch_size,
-                            seq_len, query.dtype)
-
-                if self.use_ref_attention:
-                    output = self.ref_masked_attention(
-                        query,
-                        key,
-                        value,
-                    )
-                    return output.reshape(batch_size, seq_len, hidden_size)
-
-                # TODO: Too many view operations. Let's try to reduce
-                # them in the future for code readability.
-                if self.alibi_slopes is None:
-                    query = query.unsqueeze(0)
-                    key = key.unsqueeze(0)
-                    value = value.unsqueeze(0)
-                else:
-                    query = query.unflatten(0, (batch_size, seq_len))
-                    key = key.unflatten(0, (batch_size, seq_len))
-                    value = value.unflatten(0, (batch_size, seq_len))
-
-                out = xops.memory_efficient_attention_forward(
-                    query,
-                    key,
-                    value,
-                    attn_bias=input_metadata.attn_bias,
-                    p=0.0,
-                    scale=self.scale,
-                    op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if
-                    (is_hip()) else None,
-                )
-                output = out.view_as(query)
-            else:
-                # prefix-enabled attention
-                output = torch.empty_like(query)
-                context_attention_fwd(
-                    query,
-                    key,
-                    value,
-                    output,
-                    key_cache,
-                    value_cache,
-                    input_metadata.block_tables,  # [BS, max_block_per_request]
-                    input_metadata.start_loc,
-                    input_metadata.prompt_lens,
-                    input_metadata.context_lens,
-                    input_metadata.max_seq_len,
-                    getattr(self, "alibi_slopes", None),
-                )
-
-        else:
-            # Decoding run.
-            output = _paged_attention(
-                query,
-                key_cache,
-                value_cache,
-                input_metadata,
-                self.num_kv_heads,
-                self.scale,
-                self.alibi_slopes,
-                kv_quant_param,
-            )
-
-        # Reshape the output tensor.
-        return output.view(batch_size, seq_len, hidden_size)
-
-
-def _make_alibi_bias(
-    alibi_slopes: torch.Tensor,
-    num_kv_heads: int,
-    batch_size: int,
-    seq_len: int,
-    dtype: torch.dtype,
-) -> LowerTriangularMaskWithTensorBias:
-    bias = torch.arange(seq_len, dtype=dtype)
-    # NOTE: HF uses
-    #     `bias = bias[None, :].repeat(prompt_len, 1)`
-    # here. We find that both biases give the same results, but
-    # the bias below more accurately follows the original ALiBi
-    # paper.
-    bias = bias[None, :] - bias[:, None]
-
-    # When using custom attention bias, xformers requires the bias to
-    # be sliced from a tensor whose length is a multiple of 8.
-    padded_len = (seq_len + 7) // 8 * 8
-    num_heads = alibi_slopes.shape[0]
-    bias = torch.empty(
-        batch_size,
-        num_heads,
-        seq_len,
-        padded_len,
-        device=alibi_slopes.device,
-        dtype=dtype,
-    )[:, :, :, :seq_len].copy_(bias)
-    bias.mul_(alibi_slopes[:, None, None])
-    if num_heads != num_kv_heads:
-        bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
-    attn_bias = LowerTriangularMaskWithTensorBias(bias)
-    return attn_bias
-
-
-def _paged_attention(
-    query: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    input_metadata: InputMetadata,
-    num_kv_heads: int,
-    scale: float,
-    alibi_slopes: Optional[torch.Tensor],
-    kv_quant_param: List[float],
-) -> torch.Tensor:
-    output = torch.empty_like(query)
-
-    block_size = value_cache.shape[3]
-    num_seqs, num_heads, head_size = query.shape
-    max_num_partitions = (
-        (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
-        _PARTITION_SIZE)
-    # NOTE: We use a simple heuristic to decide whether to use
-    # PagedAttention V1 or V2. If the number of partitions is 1, we use
-    # V1 to avoid the overhead of reduction. Also, if the number of
-    # sequences or heads is large, we use V1 since there is enough work
-    # to parallelize.
-    # TODO: Tune this heuristic.
-    # For context len > 8192, use V2 kernel to avoid shared memory shortage.
-    use_v1 = input_metadata.max_context_len <= 8192 and (
-        max_num_partitions == 1 or num_seqs * num_heads > 512)
-    if use_v1:
-        # Run PagedAttention V1.
-        ops.paged_attention_v1(
-            output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            input_metadata.block_tables,
-            input_metadata.context_lens,
-            block_size,
-            input_metadata.max_context_len,
-            alibi_slopes,
-            input_metadata.kv_cache_dtype,
-            *kv_quant_param,
-        )
-    else:
-        # Run PagedAttention V2.
-        assert _PARTITION_SIZE % block_size == 0
-        tmp_output = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions, head_size),
-            dtype=output.dtype,
-            device=output.device,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions),
-            dtype=torch.float32,
-            device=output.device,
-        )
-        max_logits = torch.empty_like(exp_sums)
-        ops.paged_attention_v2(
-            output,
-            exp_sums,
-            max_logits,
-            tmp_output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            input_metadata.block_tables,
-            input_metadata.context_lens,
-            block_size,
-            input_metadata.max_context_len,
-            alibi_slopes,
-            input_metadata.kv_cache_dtype,
-            *kv_quant_param,
-        )
-    return output

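The reshaping conventions of the removed layer carry over unchanged to the new backends: activations arrive as `[batch_size, seq_len, hidden]`, are flattened to `[tokens, heads, head_size]` for the kernels, and are reshaped back at the end. A small torch illustration with arbitrary shapes:

import torch

batch_size, seq_len, num_heads, head_size = 2, 4, 8, 64
hidden = num_heads * head_size
query = torch.randn(batch_size, seq_len, hidden)

q = query.view(-1, num_heads, head_size)         # [8, 8, 64]
restored = q.view(batch_size, seq_len, hidden)   # back to [2, 4, 512]
print(q.shape, restored.shape)
print(torch.equal(query, restored))  # True -- views share storage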
+ 93 - 0
aphrodite/modeling/layers/attention/__init__.py

@@ -0,0 +1,93 @@
+"""Attention layer."""
+from functools import lru_cache
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+from loguru import logger
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.common.utils import is_hip
+
+
+class Attention(nn.Module):
+    """Attention layer.
+
+    This class takes query, key, and value tensors as input. The input tensors
+    can either contain prompt tokens or generation tokens.
+    The class does the following:
+
+    1. Store the input key and value tensors in the KV cache.
+    2. Perform (multi-head/multi-query/grouped-query) attention.
+    3. Return the output tensor.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        sliding_window: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        if _use_flash_attn():
+            from aphrodite.modeling.layers.attention.backends.flash_attn import FlashAttentionBackend  # noqa: E501
+
+            self.backend = FlashAttentionBackend(
+                num_heads,
+                head_size,
+                scale,
+                num_kv_heads,
+                alibi_slopes,
+                sliding_window,
+            )
+        else:
+            from aphrodite.modeling.layers.attention.backends.xformers import XFormersBackend  # noqa: E501
+
+            self.backend = XFormersBackend(
+                num_heads,
+                head_size,
+                scale,
+                num_kv_heads,
+                alibi_slopes,
+                sliding_window,
+            )
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: Optional[torch.Tensor],
+        value_cache: Optional[torch.Tensor],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        return self.backend.forward(query, key, value, key_cache, value_cache,
+                                    input_metadata)
+
+
+@lru_cache(maxsize=1)
+def _use_flash_attn() -> bool:
+    try:
+        import flash_attn  # noqa: F401
+    except ImportError:
+        logger.info("flash_attn is not found. Using xformers backend.")
+        return False
+
+    if is_hip():
+        # AMD GPUs.
+        return False
+    if torch.cuda.get_device_capability()[0] < 8:
+        logger.info("flash_attn is not supported on Turing or older GPUs. "
+                    "Using xformers backend.")
+        return False
+    if torch.get_default_dtype() not in (torch.float16, torch.bfloat16):
+        logger.info(
+            "flash_attn only supports torch.float16 or torch.bfloat16. "
+            "Using xformers backend.")
+        return False
+
+    logger.info("Using Flash Attention backend.")
+    return True

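`_use_flash_attn()` probes the environment once (the `lru_cache` memoizes the answer) and falls back to xformers whenever flash-attn cannot be used. The same decision tree distilled into a pure function for illustration; the inputs are assumptions passed in rather than live probes:

def pick_backend(has_flash_attn: bool, is_hip: bool,
                 compute_capability_major: int, dtype: str) -> str:
    if not has_flash_attn:
        return "xformers"        # flash_attn not installed
    if is_hip:
        return "xformers"        # AMD GPUs
    if compute_capability_major < 8:
        return "xformers"        # Turing (sm_75) and older
    if dtype not in ("float16", "bfloat16"):
        return "xformers"        # unsupported default dtype
    return "flash_attn"


print(pick_backend(True, False, 9, "bfloat16"))  # flash_attn (e.g. H100)
print(pick_backend(True, False, 7, "float16"))   # xformers   (e.g. T4)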
+ 0 - 0
aphrodite/modeling/layers/attention/backends/__init__.py


+ 121 - 0
aphrodite/modeling/layers/attention/backends/flash_attn.py

@@ -0,0 +1,121 @@
+"""Attention layer with Flash and PagedAttention."""
+from typing import List, Optional
+
+from flash_attn import flash_attn_func
+import torch
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.attention.ops.paged_attn import (
+    PagedAttentionImpl)
+
+
+class FlashAttentionBackend:
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        sliding_window: Optional[int] = None,
+    ) -> None:
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+        self.sliding_window = sliding_window
+        if alibi_slopes is not None:
+            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
+        self.alibi_slopes = alibi_slopes
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        supported_head_sizes = PagedAttentionImpl.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by PagedAttention. "
+                f"Supported head sizes are: {supported_head_sizes}.")
+
+        self.sliding_window = ((self.sliding_window, self.sliding_window) if
+                               self.sliding_window is not None else (-1, -1))
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: Optional[torch.Tensor],
+        value_cache: Optional[torch.Tensor],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        """Forward pass with FlashAttention and PagedAttention.
+
+        Args:
+            query: shape = [batch_size, seq_len, num_heads * head_size]
+            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
+            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
+                block_size, x]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
+            input_metadata: metadata for the inputs.
+        Returns:
+            shape = [batch_size, seq_len, num_heads * head_size]
+        """
+        batch_size, seq_len, hidden_size = query.shape
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+
+        # Reshape the keys and values and store them in the cache.
+        # If key_cache and value_cache are not provided, the new key and value
+        # vectors will not be cached. This happens during the initial memory
+        # profiling run.
+        if key_cache is not None and value_cache is not None:
+            PagedAttentionImpl.reshape_and_cache(key, value, key_cache,
+                                                 value_cache, input_metadata)
+
+        if input_metadata.is_prompt:
+            # Prompt run.
+            if (key_cache is None or value_cache is None
+                    or input_metadata.block_tables.numel() == 0):
+                # normal attention
+                query = query.unflatten(0, (batch_size, seq_len))
+                key = key.unflatten(0, (batch_size, seq_len))
+                value = value.unflatten(0, (batch_size, seq_len))
+                output = flash_attn_func(
+                    query,
+                    key,
+                    value,
+                    softmax_scale=self.scale,
+                    causal=True,
+                    window_size=self.sliding_window,
+                    alibi_slopes=self.alibi_slopes,
+                )
+            else:
+                # prefix-enabled attention
+                output = PagedAttentionImpl.forward_prefix(
+                    query,
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    input_metadata,
+                    self.alibi_slopes,
+                )
+        else:
+            # Decoding run.
+            output = PagedAttentionImpl.forward_decode(
+                query,
+                key_cache,
+                value_cache,
+                input_metadata,
+                self.num_kv_heads,
+                self.scale,
+                self.alibi_slopes,
+            )
+
+        # Reshape the output tensor.
+        return output.view(batch_size, seq_len, hidden_size)

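The constructor above folds the optional `sliding_window` into the two-element tuple later passed as `window_size` to `flash_attn_func`: a symmetric window when set, `(-1, -1)` (unbounded) otherwise. A tiny sketch of that normalization:

from typing import Optional, Tuple


def normalize_window(sliding_window: Optional[int]) -> Tuple[int, int]:
    return ((sliding_window, sliding_window)
            if sliding_window is not None else (-1, -1))


print(normalize_window(4096))  # (4096, 4096)
print(normalize_window(None))  # (-1, -1)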
+ 255 - 0
aphrodite/modeling/layers/attention/backends/xformers.py

@@ -0,0 +1,255 @@
+"""Attention layer with xFormers and PagedAttention."""
+import importlib
+from typing import List, Optional
+
+import torch
+from xformers import ops as xops
+from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
+                                         LowerTriangularMaskWithTensorBias)
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.attention.ops.paged_attn import (
+    PagedAttentionImpl)
+from aphrodite.common.utils import is_hip
+
+
+class XFormersBackend:
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        sliding_window: Optional[int] = None,
+    ) -> None:
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+        self.sliding_window = sliding_window
+        if alibi_slopes is not None:
+            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
+        self.alibi_slopes = alibi_slopes
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        supported_head_sizes = PagedAttentionImpl.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by PagedAttention. "
+                f"Supported head sizes are: {supported_head_sizes}.")
+
+        self.use_ref_attention = _check_use_ref_attention()
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: Optional[torch.Tensor],
+        value_cache: Optional[torch.Tensor],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        """Forward pass with xFormers and PagedAttention.
+
+        Args:
+            query: shape = [batch_size, seq_len, num_heads * head_size]
+            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
+            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
+                block_size, x]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
+            input_metadata: metadata for the inputs.
+        Returns:
+            shape = [batch_size, seq_len, num_heads * head_size]
+        """
+        batch_size, seq_len, hidden_size = query.shape
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+
+        # Reshape the keys and values and store them in the cache.
+        # If key_cache and value_cache are not provided, the new key and value
+        # vectors will not be cached. This happens during the initial memory
+        # profiling run.
+        if key_cache is not None and value_cache is not None:
+            PagedAttentionImpl.reshape_and_cache(key, value, key_cache,
+                                                 value_cache, input_metadata)
+
+        if input_metadata.is_prompt:
+            # Prompt run.
+            if (key_cache is None or value_cache is None
+                    or input_metadata.block_tables.numel() == 0):
+                # normal attention
+                if self.num_kv_heads != self.num_heads:
+                    # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
+                    # project the key and value tensors to the desired number of
+                    # heads.
+                    # TODO: Use MQA/GQA kernels for higher performance.
+                    query = query.view(query.shape[0], self.num_kv_heads,
+                                       self.num_queries_per_kv,
+                                       query.shape[-1])
+                    key = key[:, :,
+                              None, :].expand(key.shape[0], self.num_kv_heads,
+                                              self.num_queries_per_kv,
+                                              key.shape[-1])
+                    value = value[:, :,
+                                  None, :].expand(value.shape[0],
+                                                  self.num_kv_heads,
+                                                  self.num_queries_per_kv,
+                                                  value.shape[-1])
+
+                # Set attention bias if not provided. This typically happens
+                # at the very first attention layer of every iteration.
+                # FIXME: This is a hack.
+                if input_metadata.attn_bias is None:
+                    if self.alibi_slopes is None:
+                        attn_bias = BlockDiagonalCausalMask.from_seqlens(
+                            [seq_len] * batch_size)
+                        if self.sliding_window is not None:
+                            attn_bias = attn_bias.make_local_attention(
+                                self.sliding_window)
+                        input_metadata.attn_bias = attn_bias
+                    else:
+                        input_metadata.attn_bias = _make_alibi_bias(
+                            self.alibi_slopes, self.num_kv_heads, batch_size,
+                            seq_len, query.dtype)
+
+                if self.use_ref_attention:
+                    output = _ref_masked_attention(
+                        query,
+                        key,
+                        value,
+                        self.num_heads,
+                        self.num_kv_heads,
+                        self.head_size,
+                        self.scale,
+                    )
+                    # Using view got RuntimeError: view size is not compatible
+                    # with input tensor's size and stride (at least one
+                    # dimension spans across two contiguous subspaces).
+                    # Use reshape instead.
+                    return output.reshape(batch_size, seq_len, hidden_size)
+
+                # TODO: Too many view operations. Let's try to reduce
+                # them in the future for code readability.
+                if self.alibi_slopes is None:
+                    query = query.unsqueeze(0)
+                    key = key.unsqueeze(0)
+                    value = value.unsqueeze(0)
+                else:
+                    query = query.unflatten(0, (batch_size, seq_len))
+                    key = key.unflatten(0, (batch_size, seq_len))
+                    value = value.unflatten(0, (batch_size, seq_len))
+
+                out = xops.memory_efficient_attention_forward(
+                    query,
+                    key,
+                    value,
+                    attn_bias=input_metadata.attn_bias,
+                    p=0.0,
+                    scale=self.scale,
+                    op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if
+                    (is_hip()) else None,
+                )
+                output = out.view_as(query)
+
+            else:
+                # prefix-enabled attention
+                output = PagedAttentionImpl.forward_prefix(
+                    query,
+                    key,
+                    value,
+                    key_cache,
+                    value_cache,
+                    input_metadata,
+                    self.alibi_slopes,
+                )
+        else:
+            # Decoding run.
+            output = PagedAttentionImpl.forward_decode(
+                query,
+                key_cache,
+                value_cache,
+                input_metadata,
+                self.num_kv_heads,
+                self.scale,
+                self.alibi_slopes,
+            )
+
+        # Reshape the output tensor.
+        return output.view(batch_size, seq_len, hidden_size)
+
+
+def _make_alibi_bias(
+    alibi_slopes: torch.Tensor,
+    num_kv_heads: int,
+    batch_size: int,
+    seq_len: int,
+    dtype: torch.dtype,
+) -> LowerTriangularMaskWithTensorBias:
+    bias = torch.arange(seq_len, dtype=dtype)
+    # NOTE: HF uses
+    #     `bias = bias[None, :].repeat(prompt_len, 1)`
+    # here. We find that both biases give the same results, but
+    # the bias below more accurately follows the original ALiBi
+    # paper.
+    bias = bias[None, :] - bias[:, None]
+
+    # When using custom attention bias, xformers requires the bias to
+    # be sliced from a tensor whose length is a multiple of 8.
+    padded_len = (seq_len + 7) // 8 * 8
+    num_heads = alibi_slopes.shape[0]
+    bias = torch.empty(
+        batch_size,
+        num_heads,
+        seq_len,
+        padded_len,
+        device=alibi_slopes.device,
+        dtype=dtype,
+    )[:, :, :, :seq_len].copy_(bias)
+    bias.mul_(alibi_slopes[:, None, None])
+    if num_heads != num_kv_heads:
+        bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
+    attn_bias = LowerTriangularMaskWithTensorBias(bias)
+    return attn_bias
+
+
+def _check_use_ref_attention() -> bool:
+    if not is_hip():
+        return False
+    # For ROCm, check whether flash attention is installed or not.
+    # if not, use_ref_attention needs to be True
+    return importlib.util.find_spec("flash_attn") is None
+
+
+def _ref_masked_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    num_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    scale: float,
+) -> torch.Tensor:
+    query = query.view(-1, num_heads, head_size)
+    key = key.view(-1, num_kv_heads, head_size)
+    value = value.view(-1, num_kv_heads, head_size)
+
+    seq_len, _, _ = query.shape
+    attn_mask = torch.triu(torch.ones(seq_len,
+                                      seq_len,
+                                      dtype=query.dtype,
+                                      device=query.device),
+                           diagonal=1)
+    attn_mask = attn_mask * torch.finfo(query.dtype).min
+
+    attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
+    attn_weights = attn_weights + attn_mask.float()
+    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
+    out = torch.einsum("hqk,khd->qhd", attn_weights, value)
+    return out

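Because the xformers kernel used here only understands plain MHA, the backend reshapes queries into `[tokens, num_kv_heads, num_queries_per_kv, head_size]` and broadcasts keys and values to match. A small torch sketch of just that reshaping (shapes are arbitrary; `expand` adds no copy):

import torch

tokens, num_heads, num_kv_heads, head_size = 6, 8, 2, 16
num_queries_per_kv = num_heads // num_kv_heads  # 4

query = torch.randn(tokens, num_heads, head_size)
key = torch.randn(tokens, num_kv_heads, head_size)

grouped_q = query.view(tokens, num_kv_heads, num_queries_per_kv, head_size)
expanded_k = key[:, :, None, :].expand(tokens, num_kv_heads,
                                       num_queries_per_kv, head_size)
print(grouped_q.shape, expanded_k.shape)  # both torch.Size([6, 2, 4, 16])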
+ 0 - 0
aphrodite/modeling/layers/attention/ops/__init__.py


+ 138 - 0
aphrodite/modeling/layers/attention/ops/paged_attn.py

@@ -0,0 +1,138 @@
+from typing import List, Optional
+
+import torch
+
+from aphrodite._C import cache_ops
+from aphrodite._C import ops
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.attention.ops.prefix_prefill import (
+    context_attention_fwd)
+
+# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
+_PARTITION_SIZE = 512
+
+
+class PagedAttentionImpl:
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [64, 80, 96, 112, 128, 256]
+
+    @staticmethod
+    def reshape_and_cache(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> None:
+        cache_ops.reshape_and_cache(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            input_metadata.slot_mapping.flatten(),
+            input_metadata.kv_cache_dtype,
+        )
+
+    @staticmethod
+    def forward_decode(
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        input_metadata: InputMetadata,
+        num_kv_heads: int,
+        scale: float,
+        alibi_slopes: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        output = torch.empty_like(query)
+
+        block_size = value_cache.shape[3]
+        num_seqs, num_heads, head_size = query.shape
+        max_num_partitions = (
+            (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
+            _PARTITION_SIZE)
+        # NOTE: We use a simple heuristic to decide whether to use
+        # PagedAttention V1 or V2. If the number of partitions is 1, we use
+        # V1 to avoid the overhead of reduction. Also, if the number of
+        # sequences or heads is large, we use V1 since there is enough work
+        # to parallelize.
+        # TODO: Tune this heuristic.
+        # For context len > 8192, use V2 kernel to avoid shared memory shortage.
+        use_v1 = input_metadata.max_context_len <= 8192 and (
+            max_num_partitions == 1 or num_seqs * num_heads > 512)
+        if use_v1:
+            # Run PagedAttention V1.
+            ops.paged_attention_v1(
+                output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                input_metadata.block_tables,
+                input_metadata.context_lens,
+                block_size,
+                input_metadata.max_context_len,
+                alibi_slopes,
+                input_metadata.kv_cache_dtype,
+            )
+        else:
+            # Run PagedAttention V2.
+            assert _PARTITION_SIZE % block_size == 0
+            tmp_output = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions, head_size),
+                dtype=output.dtype,
+                device=output.device,
+            )
+            exp_sums = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions),
+                dtype=torch.float32,
+                device=output.device,
+            )
+            max_logits = torch.empty_like(exp_sums)
+            ops.paged_attention_v2(
+                output,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                input_metadata.block_tables,
+                input_metadata.context_lens,
+                block_size,
+                input_metadata.max_context_len,
+                alibi_slopes,
+                input_metadata.kv_cache_dtype,
+            )
+        return output
+
+    @staticmethod
+    def forward_prefix(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        input_metadata: InputMetadata,
+        alibi_slopes: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        output = torch.empty_like(query)
+        context_attention_fwd(
+            query,
+            key,
+            value,
+            output,
+            key_cache,
+            value_cache,
+            input_metadata.block_tables,  # [BS, max_block_per_request]
+            input_metadata.start_loc,
+            input_metadata.prompt_lens,
+            input_metadata.context_lens,
+            input_metadata.max_seq_len,
+            alibi_slopes,
+        )
+        return output
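The V1/V2 selection in forward_decode above is driven purely by max_context_len, num_seqs, and num_heads. A small sketch that restates the heuristic with made-up numbers (illustration only, not part of the commit):

_PARTITION_SIZE = 512  # matches the constant at the top of this file

def uses_v1(max_context_len: int, num_seqs: int, num_heads: int) -> bool:
    # Same expression as the use_v1 check in forward_decode.
    max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1) //
                          _PARTITION_SIZE)
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

print(uses_v1(max_context_len=400, num_seqs=1, num_heads=32))     # True: one partition
print(uses_v1(max_context_len=4096, num_seqs=64, num_heads=32))   # True: enough parallel work
print(uses_v1(max_context_len=4096, num_seqs=2, num_heads=32))    # False: V2 reduction pays off
print(uses_v1(max_context_len=16384, num_seqs=64, num_heads=32))  # False: >8192 always uses V2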

+ 28 - 12
aphrodite/modeling/layers/triton_kernel/prefix_prefill.py → aphrodite/modeling/layers/attention/ops/prefix_prefill.py

@@ -45,6 +45,7 @@ if triton.__version__ >= "2.1.0":
        stride_v_cache_h,
        stride_v_cache_d,
        stride_v_cache_bl,
+        num_queries_per_kv: int,
        BLOCK_M: tl.constexpr,
        BLOCK_DMODEL: tl.constexpr,
        BLOCK_N: tl.constexpr,
@@ -53,6 +54,8 @@ if triton.__version__ >= "2.1.0":
        cur_head = tl.program_id(1)
        start_m = tl.program_id(2)

+        cur_kv_head = cur_head // num_queries_per_kv
+
        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
@@ -85,13 +88,14 @@ if triton.__version__ >= "2.1.0":
                         mask=(start_n + offs_n) < cur_batch_ctx_len,
                         other=0)
            off_k = (bn[None, :] * stride_k_cache_bs +
-                     cur_head * stride_k_cache_h +
+                     cur_kv_head * stride_k_cache_h +
                     (offs_d[:, None] // x) * stride_k_cache_d +
                     ((start_n + offs_n[None, :]) % block_size) *
                     stride_k_cache_bl +
                     (offs_d[:, None] % x) * stride_k_cache_x)
            off_v = (
-                bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h +
+                bn[:, None] * stride_v_cache_bs +
+                cur_kv_head * stride_v_cache_h +
                offs_d[None, :] * stride_v_cache_d +
                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
            k = tl.load(K_cache + off_k,
@@ -131,9 +135,9 @@ if triton.__version__ >= "2.1.0":
            l_i = l_i_new
            m_i = m_i_new

-        off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh +
+        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
                 offs_d[:, None] * stride_kd)
-        off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh +
+        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
                 offs_d[None, :] * stride_vd)
        k_ptrs = K + off_k
        v_ptrs = V + off_v
@@ -232,6 +236,7 @@ if triton.__version__ >= "2.1.0":
        stride_v_cache_h,
        stride_v_cache_d,
        stride_v_cache_bl,
+        num_queries_per_kv: int,
        BLOCK_M: tl.constexpr,
        BLOCK_DMODEL: tl.constexpr,
        BLOCK_N: tl.constexpr,
@@ -240,6 +245,8 @@ if triton.__version__ >= "2.1.0":
        cur_head = tl.program_id(1)
        start_m = tl.program_id(2)

+        cur_kv_head = cur_head // num_queries_per_kv
+
        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
@@ -272,13 +279,14 @@ if triton.__version__ >= "2.1.0":
                         mask=(start_n + offs_n) < cur_batch_ctx_len,
                         other=0)
            off_k = (bn[None, :] * stride_k_cache_bs +
-                     cur_head * stride_k_cache_h +
+                     cur_kv_head * stride_k_cache_h +
                     (offs_d[:, None] // x) * stride_k_cache_d +
                     ((start_n + offs_n[None, :]) % block_size) *
                     stride_k_cache_bl +
                     (offs_d[:, None] % x) * stride_k_cache_x)
            off_v = (
-                bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h +
+                bn[:, None] * stride_v_cache_bs +
+                cur_kv_head * stride_v_cache_h +
                offs_d[None, :] * stride_v_cache_d +
                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
            k = tl.load(K_cache + off_k,
@@ -317,9 +325,9 @@ if triton.__version__ >= "2.1.0":
            l_i = l_i_new
            m_i = m_i_new

-        off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh +
+        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
                 offs_d[:, None] * stride_kd)
-        off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh +
+        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
                 offs_d[None, :] * stride_vd)
        k_ptrs = K + off_k
        v_ptrs = V + off_v
@@ -420,6 +428,7 @@ if triton.__version__ >= "2.1.0":
        stride_v_cache_h,
        stride_v_cache_d,
        stride_v_cache_bl,
+        num_queries_per_kv: int,
        BLOCK_M: tl.constexpr,
        BLOCK_DMODEL: tl.constexpr,
        BLOCK_N: tl.constexpr,
@@ -429,6 +438,8 @@ if triton.__version__ >= "2.1.0":
        cur_head = tl.program_id(1)
        start_m = tl.program_id(2)

+        cur_kv_head = cur_head // num_queries_per_kv
+
        # cur_batch_seq_len: the length of prompts
        # cur_batch_ctx_len: the length of prefix
        # cur_batch_in_all_start_index: the start id of the dim=0
@@ -468,13 +479,14 @@ if triton.__version__ >= "2.1.0":
                         mask=(start_n + offs_n) < cur_batch_ctx_len,
                         other=0)
            off_k = (bn[None, :] * stride_k_cache_bs +
-                     cur_head * stride_k_cache_h +
+                     cur_kv_head * stride_k_cache_h +
                     (offs_d[:, None] // x) * stride_k_cache_d +
                     ((start_n + offs_n[None, :]) % block_size) *
                     stride_k_cache_bl +
                     (offs_d[:, None] % x) * stride_k_cache_x)
            off_v = (
-                bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h +
+                bn[:, None] * stride_v_cache_bs +
+                cur_kv_head * stride_v_cache_h +
                offs_d[None, :] * stride_v_cache_d +
                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
            k = tl.load(K_cache + off_k,
@@ -522,9 +534,9 @@ if triton.__version__ >= "2.1.0":
            l_i = l_i_new
            m_i = m_i_new

-        off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh +
+        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
                 offs_d[:, None] * stride_kd)
-        off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh +
+        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
                 offs_d[None, :] * stride_vd)
        k_ptrs = K + off_k
        v_ptrs = V + off_v
@@ -618,6 +630,7 @@ if triton.__version__ >= "2.1.0":
                              b_ctx_len,
                              max_input_len,
                              alibi_slopes=None):
+
        cap = torch.cuda.get_device_capability()
        BLOCK = 128 if cap[0] >= 8 else 64
        # shape constraints
@@ -627,6 +640,7 @@ if triton.__version__ >= "2.1.0":

        sm_scale = 1.0 / (Lq**0.5)
        batch, head = b_seq_len.shape[0], q.shape[1]
+        num_queries_per_kv = q.shape[1] // k.shape[1]

        grid = (batch, head, triton.cdiv(max_input_len, BLOCK))  # batch, head,

@@ -673,6 +687,7 @@ if triton.__version__ >= "2.1.0":
                v_cache.stride(2),
                v_cache.stride(
                    3),  #[num_blocks, num_kv_heads, head_size, block_size]
+                num_queries_per_kv=num_queries_per_kv,
                BLOCK_M=BLOCK,
                BLOCK_DMODEL=Lk,
                BLOCK_N=BLOCK,
@@ -720,6 +735,7 @@ if triton.__version__ >= "2.1.0":
            v_cache.stride(2),
            v_cache.stride(
                3),  #[num_blocks, num_kv_heads, head_size, block_size]
+            num_queries_per_kv=num_queries_per_kv,
            BLOCK_M=BLOCK,
            BLOCK_DMODEL=Lk,
            BLOCK_N=BLOCK,
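The num_queries_per_kv argument threaded through the kernels above adds grouped-query attention support: each KV-cache head is shared by a group of query heads, so cache offsets are computed from cur_kv_head = cur_head // num_queries_per_kv rather than cur_head. A tiny illustrative sketch of that mapping (values are hypothetical):

num_heads, num_kv_heads = 32, 8          # e.g. a GQA model with 4:1 grouping
num_queries_per_kv = num_heads // num_kv_heads
# Query heads 0-3 read KV head 0, heads 4-7 read KV head 1, and so on.
assert [h // num_queries_per_kv for h in range(8)] == [0, 0, 0, 0, 1, 1, 1, 1]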

+ 8 - 0
aphrodite/modeling/layers/fused_moe/__init__.py

@@ -0,0 +1,8 @@
+from aphrodite.modeling.layers.fused_moe.fused_moe import (fused_moe,
+                                                           get_config_file_name
+                                                           )
+
+__all__ = [
+    "fused_moe",
+    "get_config_file_name",
+]
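The JSON files added below are tuned Triton launch parameters, keyed by device name and MoE shape in the filename (E = expert count, N = intermediate size per shard) and by benchmarked token count inside each file. A hypothetical sketch of how such a file could be located and queried by that naming scheme; the real lookup goes through get_config_file_name and fused_moe, whose exact signatures are not shown in this diff:

import json
import os
import torch

def load_moe_config(E: int, N: int, config_dir: str) -> dict:
    # Filename pattern mirrors the config files added in this commit.
    device_name = torch.cuda.get_device_name().replace(" ", "_")
    path = os.path.join(config_dir, f"E={E},N={N},device_name={device_name}.json")
    if not os.path.exists(path):
        return {}  # fall back to kernel defaults when no tuned config exists
    with open(path) as f:
        # Keys are token counts, values are Triton tile parameters.
        return {int(num_tokens): cfg for num_tokens, cfg in json.load(f).items()}

configs = load_moe_config(
    E=8, N=7168, config_dir="aphrodite/modeling/layers/fused_moe/configs")
if configs:
    # Use the config benchmarked at the token count closest to the actual batch.
    num_tokens = 300
    best = configs[min(configs, key=lambda k: abs(k - num_tokens))]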

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 146 - 0
aphrodite/modeling/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}

+ 9 - 0
aphrodite/modeling/layers/fused_moe/configs/README

@@ -0,0 +1,9 @@
+This directory contains tuned configurations for different settings of the fused_moe kernel.
+For different settings of
+- E (number of experts)
+- N (intermediate size)
+- device_name (torch.cuda.get_device_name())
+the JSON file contains a mapping from M (batch size) to the chosen configuration.
+
+Mixtral has intermediate size N = 14336, i.e. for TP2 we have
+N = 7168 and for TP4 we have N = 3584.
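
The file names follow the convention mirrored by get_config_file_name() further down in this diff: E, N, and the CUDA device name with spaces replaced by underscores. A minimal sketch, assuming Mixtral's E = 8 experts under TP2 (so N = 7168) on an A100-80GB; the sizes are illustrative:

import torch

E = 8            # number of experts (Mixtral)
N = 14336 // 2   # intermediate size under TP2, per the note above

# Spaces in the device name become underscores, so on an A100-80GB this
# resolves to "E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json" -- one of
# the tuned files added in this commit.
device_name = torch.cuda.get_device_name().replace(" ", "_")
json_file_name = f"E={E},N={N},device_name={device_name}.json"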

+ 120 - 58
aphrodite/modeling/layers/triton_kernel/fused_moe.py → aphrodite/modeling/layers/fused_moe/fused_moe.py

@@ -1,7 +1,13 @@
 """Fused MoE kernel."""
 """Fused MoE kernel."""
+import functools
+import json
+import os
+from typing import Any, Dict, Optional, Tuple
+
 import torch
 import torch
 import triton
 import triton
 import triton.language as tl
 import triton.language as tl
+from loguru import logger
 
 
 from aphrodite._C import ops
 from aphrodite._C import ops
 from aphrodite.common.utils import is_hip
 from aphrodite.common.utils import is_hip
@@ -22,9 +28,10 @@ def fused_moe_kernel(
     K,
     K,
     EM,
     EM,
     num_valid_tokens,
     num_valid_tokens,
-    # The stride variables represent how much to increase the ptr by when moving
-    # by 1 element in a particular dimension. E.g. `stride_am` is how much to
-    # increase `a_ptr` by to get the element one row down (A has M rows).
+    # The stride variables represent how much to increase the ptr by when
+    # moving by 1 element in a particular dimension. E.g. `stride_am` is
+    # how much to increase `a_ptr` by to get the element one row down
+    # (A has M rows).
     stride_am,
     stride_am,
     stride_ak,
     stride_ak,
     stride_be,
     stride_be,
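
The strides passed here are plain element strides of the underlying tensors; a quick sketch of what that means for a contiguous activation matrix (sizes are illustrative):

import torch

M, K = 8, 4096
A = torch.empty(M, K)              # contiguous, row-major
stride_am, stride_ak = A.stride()  # (K, 1): K elements to move one row down,
                                   # 1 element to move one column right
assert (stride_am, stride_ak) == (K, 1)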
@@ -44,22 +51,23 @@ def fused_moe_kernel(
     """
     """
     Implements the fused computation for a Mixture of Experts (MOE) using
     Implements the fused computation for a Mixture of Experts (MOE) using
     token and expert matrices.
     token and expert matrices.
+
     Key Parameters:
     Key Parameters:
-    - A: The input tensor representing tokens with shape (*, K), where '*'
-        can be any shape representing batches and K is the feature dimension
-        of each token.
-    - B: The stacked MOE weight tensor with shape (E, N, K), where E is the
-        number of experts, K is the input feature dimension, and N is the
-        output feature dimension.
+    - A: The input tensor representing tokens with shape (*, K), where '*' can
+        be any shape representing batches and K is the feature dimension of
+        each token.
+    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
+        the number of experts, K is the input feature dimension, and N is
+        the output feature dimension.
     - C: The output cache tensor with shape (M, topk, N), where M is the
     - C: The output cache tensor with shape (M, topk, N), where M is the
         total number of tokens post padding, topk is the number of times
         total number of tokens post padding, topk is the number of times
         each token is repeated, and N is the output feature dimension.
         each token is repeated, and N is the output feature dimension.
     - sorted_token_ids: A tensor containing the sorted indices of tokens,
     - sorted_token_ids: A tensor containing the sorted indices of tokens,
-        repeated topk times and arranged by the expert index they are assigned
-        to.
-    - expert_ids: A tensor containing the indices of the expert for each block.
-        It determines which expert matrix from B should be used for each block
-        in A.
+        repeated topk times and arranged by the expert index they are
+        assigned to.
+    - expert_ids: A tensor containing the indices of the expert for each
+        block. It determines which expert matrix from B should be used for
+        each block in A.
     This kernel performs the multiplication of a token by its corresponding
     This kernel performs the multiplication of a token by its corresponding
     expert matrix as determined by `expert_ids`. The sorting of
     expert matrix as determined by `expert_ids`. The sorting of
     `sorted_token_ids` by expert index and padding ensures divisibility by
     `sorted_token_ids` by expert index and padding ensures divisibility by
@@ -142,39 +150,43 @@ def fused_moe_kernel(
 
 
 def moe_align_block_size(
 def moe_align_block_size(
         topk_ids: torch.Tensor, block_size: int,
         topk_ids: torch.Tensor, block_size: int,
-        num_experts: int) -> (torch.Tensor, torch.Tensor, torch.Tensor):
+        num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     """
     Aligns the token distribution across experts to be compatible with block
     Aligns the token distribution across experts to be compatible with block
     size for matrix multiplication.
     size for matrix multiplication.
+
     Parameters:
     Parameters:
-    - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k
-        expert indices for each token.
+    - topk_ids: A tensor of shape [total_tokens, top_k] representing the
+        top-k expert indices for each token.
     - block_size: The block size used in block matrix multiplication.
     - block_size: The block size used in block matrix multiplication.
     - num_experts: The total number of experts.
     - num_experts: The total number of experts.
+
     Returns:
     Returns:
     - sorted_token_ids: A tensor containing the sorted token indices according
     - sorted_token_ids: A tensor containing the sorted token indices according
         to their allocated expert.
         to their allocated expert.
     - expert_ids: A tensor indicating the assigned expert index for each block.
     - expert_ids: A tensor indicating the assigned expert index for each block.
     - num_tokens_post_padded: The total number of tokens after padding,
     - num_tokens_post_padded: The total number of tokens after padding,
         ensuring divisibility by block_size.
         ensuring divisibility by block_size.
+
     This function pads the number of tokens that each expert needs to process
     This function pads the number of tokens that each expert needs to process
-    so that it is divisible by block_size. Padding ensures that during block
-    matrix multiplication, the dimensions align correctly.
+    so that it is divisible by block_size.
+    Padding ensures that during block matrix multiplication, the dimensions
+    align correctly.
+
     Example:
     Example:
     Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
     Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
     block_size = 4, and num_experts = 4:
     block_size = 4, and num_experts = 4:
-    - We initially have 12 tokens (after repeating 'top_k' times) and 4
-        experts, with each expert needing to process 3 tokens.
+    - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts,
+        with each expert needing to process 3 tokens.
     - As block_size is 4, we pad 1 token for each expert.
     - As block_size is 4, we pad 1 token for each expert.
     - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
     - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
     - Then append padding tokens [12, 12, 12, 12] for each block.
     - Then append padding tokens [12, 12, 12, 12] for each block.
-    - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4,
-                                                          10, 12, 1, 7, 11,
-                                                          12, 2, 5, 8, 12].
-        Tokens 12 are non-existent (padding) and are ignored in the subsequent
-        matrix multiplication.
-    - The padding ensures that the total number of tokens is now divisible by
-        block_size for proper block matrix operations.
+    - After sorting by expert index, we obtain token_ids
+        [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
+        Tokens 12 are non-existent (padding) and are ignored in
+        the subsequent matrix multiplication.
+    - The padding ensures that the total number of tokens is now divisible
+        by block_size for proper block matrix operations.
     """
     """
     sorted_ids = torch.empty(
     sorted_ids = torch.empty(
         (topk_ids.numel() + num_experts * (block_size - 1), ),
         (topk_ids.numel() + num_experts * (block_size - 1), ),
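
The worked example in the docstring can be reproduced with a small standalone sketch (pure PyTorch, no Triton, and handling only experts that actually appear in topk_ids); the helper name is illustrative and is not the real alignment kernel:

import torch

def align_example(topk_ids: torch.Tensor, block_size: int):
    flat = topk_ids.flatten()
    pad_id = flat.numel()              # non-existent token id used for padding
    sorted_ids, expert_ids = [], []
    for expert in torch.unique(flat).tolist():
        idx = torch.nonzero(flat == expert, as_tuple=True)[0].tolist()
        idx += [pad_id] * ((-len(idx)) % block_size)   # pad to a block multiple
        sorted_ids.extend(idx)
        expert_ids.extend([expert] * (len(idx) // block_size))
    return sorted_ids, expert_ids, len(sorted_ids)

topk_ids = torch.tensor([[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]])
sorted_token_ids, expert_ids, num_tokens_post_padded = align_example(topk_ids, 4)
# sorted_token_ids -> [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]
# expert_ids       -> [1, 2, 3, 4]   (one expert per block of four)
# num_tokens_post_padded -> 16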
@@ -197,12 +209,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
                             sorted_token_ids: torch.Tensor,
                             sorted_token_ids: torch.Tensor,
                             expert_ids: torch.Tensor,
                             expert_ids: torch.Tensor,
                             num_tokens_post_padded: torch.Tensor,
                             num_tokens_post_padded: torch.Tensor,
-                            mul_routed_weight: bool, top_k: int, config: dict):
-
+                            mul_routed_weight: bool, top_k: int,
+                            config: Dict[str, Any]) -> None:
     assert topk_weights.stride(1) == 1
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
     assert sorted_token_ids.stride(0) == 1
 
 
-    # ruff: noqa: E731
     grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
     grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
         'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), )
         'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), )
 
 
@@ -232,6 +243,40 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
     )
     )
 
 
 
 
+def get_config_file_name(E: int, N: int) -> str:
+    device_name = torch.cuda.get_device_name().replace(" ", "_")
+    return f"E={E},N={N},device_name={device_name}.json"
+
+
+@functools.lru_cache
+def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the fused MoE kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the fused_moe kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    json_file_name = get_config_file_name(E, N)
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info(
+                f"Using configuration from {config_file_path} for MoE layer.")
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    return None
+
+
 def fused_moe(
 def fused_moe(
     hidden_states: torch.Tensor,
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w1: torch.Tensor,
@@ -240,6 +285,7 @@ def fused_moe(
     topk: int,
     topk: int,
     renormalize: bool,
     renormalize: bool,
     inplace: bool = False,
     inplace: bool = False,
+    override_config: Optional[Dict[str, Any]] = None,
 ) -> torch.Tensor:
 ) -> torch.Tensor:
     """
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -249,25 +295,29 @@ def fused_moe(
     - hidden_states (torch.Tensor): The input tensor to the MoE layer.
     - hidden_states (torch.Tensor): The input tensor to the MoE layer.
     - w1 (torch.Tensor): The first set of expert weights.
     - w1 (torch.Tensor): The first set of expert weights.
     - w2 (torch.Tensor): The second set of expert weights.
     - w2 (torch.Tensor): The second set of expert weights.
-    - gating_output (torch.Tensor): The output of the gating operation (before
-        softmax).
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
     - topk (int): The number of top-k experts to select.
     - topk (int): The number of top-k experts to select.
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place. Defaults to
-        False.
+    - inplace (bool): If True, perform the operation in-place.
+        Defaults to False.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
 
 
     Returns:
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
     - torch.Tensor: The output tensor after applying the MoE layer.
     """
     """
     # Check constraints.
     # Check constraints.
     assert hidden_states.shape[0] == gating_output.shape[0], (
     assert hidden_states.shape[0] == gating_output.shape[0], (
-        'Number of tokens mismatch')
-    assert hidden_states.shape[1] == w1.shape[2], 'Hidden size mismatch'
-    assert gating_output.shape[1] == w1.shape[0], 'Number of experts mismatch'
-    assert hidden_states.is_contiguous(), 'Hidden_states must be contiguous'
-    assert w1.is_contiguous(), 'Expert weights1 must be contiguous'
-    assert w2.is_contiguous(), 'Expert weights2 must be contiguous'
-    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
+        "Number of tokens mismatch")
+    assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
+    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [
+        torch.float32, torch.float16, torch.bfloat16
+    ]
     M, _ = hidden_states.shape
     M, _ = hidden_states.shape
     E, N, _ = w1.shape
     E, N, _ = w1.shape
 
 
@@ -302,20 +352,32 @@ def fused_moe(
     if renormalize:
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
 
 
-    config = {
-        'BLOCK_SIZE_M': 64,
-        'BLOCK_SIZE_N': 64,
-        'BLOCK_SIZE_K': 32,
-        'GROUP_SIZE_M': 8
-    }
-
-    if topk_ids.numel() <= w1.shape[0]:
-        config = {
-            'BLOCK_SIZE_M': 16,
-            'BLOCK_SIZE_N': 32,
-            'BLOCK_SIZE_K': 64,
-            'GROUP_SIZE_M': 1
-        }
+    if override_config:
+        config = override_config
+    else:
+        # First try to load optimal config from the file
+        configs = get_moe_configs(E, w2.shape[2])
+
+        if configs:
+            # If an optimal configuration map has been found, look up the
+            # optimal config
+            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+        else:
+            # Else use the default config
+            config = {
+                'BLOCK_SIZE_M': 64,
+                'BLOCK_SIZE_N': 64,
+                'BLOCK_SIZE_K': 32,
+                'GROUP_SIZE_M': 8
+            }
+
+            if M <= E:
+                config = {
+                    'BLOCK_SIZE_M': 16,
+                    'BLOCK_SIZE_N': 32,
+                    'BLOCK_SIZE_K': 64,
+                    'GROUP_SIZE_M': 1
+                }
 
 
     intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N),
     intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N),
                                       device=hidden_states.device,
                                       device=hidden_states.device,
@@ -327,8 +389,8 @@ def fused_moe(
                                       device=hidden_states.device,
                                       device=hidden_states.device,
                                       dtype=hidden_states.dtype)
                                       dtype=hidden_states.dtype)
 
 
-    sorted_token_ids, expert_ids, num_tokens_post_padded = (
-        moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'], E))
+    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+        topk_ids, config['BLOCK_SIZE_M'], E)
 
 
     invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1,
     invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1,
                             topk_weights, topk_ids, sorted_token_ids,
                             topk_weights, topk_ids, sorted_token_ids,
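
For context, a minimal GPU-only sketch of calling the relocated entry point end to end (it needs the compiled aphrodite kernels). The shapes assume the usual Mixtral-style layout where w1 packs the gate and up projections, i.e. w1: (E, 2*I, K) and w2: (E, K, I); that layout is an assumption about the callers, not something this file enforces beyond the asserts above:

import torch

from aphrodite.modeling.layers.fused_moe.fused_moe import fused_moe

M, K, I, E, topk = 32, 4096, 7168, 8, 2   # illustrative Mixtral-TP2 sizes
device, dtype = "cuda", torch.float16

hidden_states = torch.randn(M, K, device=device, dtype=dtype)
w1 = torch.randn(E, 2 * I, K, device=device, dtype=dtype)   # gate + up, per expert
w2 = torch.randn(E, K, I, device=device, dtype=dtype)       # down projection
gating_output = torch.randn(M, E, device=device, dtype=dtype)

out = fused_moe(hidden_states, w1, w2, gating_output, topk, renormalize=True)
# out has the same shape as hidden_states: (M, K)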

+ 57 - 18
aphrodite/modeling/layers/rejection.py

@@ -8,23 +8,26 @@ import torch.jit
 
 
 class RejectionSampler(nn.Module):
 class RejectionSampler(nn.Module):
     """Apply modified rejection sampling as described in "Accelerating Large
     """Apply modified rejection sampling as described in "Accelerating Large
-        Language Model Decoding with Speculative Sampling"
-        https://arxiv.org/pdf/2302.01318.pdf.
+    Language Model Decoding with Speculative Sampling"
+    https://arxiv.org/pdf/2302.01318.pdf.
     """
     """
 
 
     def __init__(self, strict_mode: bool = False):
     def __init__(self, strict_mode: bool = False):
         """Create a rejection sampler.
         """Create a rejection sampler.
+
         Args:
         Args:
             strict_mode: Whether or not to perform shape/device/dtype checks
             strict_mode: Whether or not to perform shape/device/dtype checks
                 during sampling. This catches correctness issues but adds
                 during sampling. This catches correctness issues but adds
                 nontrivial latency.
                 nontrivial latency.
         """
         """
         super().__init__()
         super().__init__()
-        self.probs_dtype = torch.float32
-        self.token_id_dtype = torch.int64
-        self._num_bonus_tokens = 1
         self._strict_mode = strict_mode
         self._strict_mode = strict_mode
 
 
+        # NOTE: A "bonus token" is accepted iff all proposal tokens are
+        # accepted. There is always only one possible bonus token. We store this
+        # value in a variable for readability.
+        self._num_bonus_tokens = 1
+
         self.num_accepted_tokens: Optional[torch.Tensor] = None
         self.num_accepted_tokens: Optional[torch.Tensor] = None
         self.num_emitted_tokens: Optional[torch.Tensor] = None
         self.num_emitted_tokens: Optional[torch.Tensor] = None
         self.num_draft_tokens: int = 0
         self.num_draft_tokens: int = 0
@@ -39,6 +42,14 @@ class RejectionSampler(nn.Module):
                                                dtype=torch.long,
                                                dtype=torch.long,
                                                device=device)
                                                device=device)
 
 
+    @property
+    def probs_dtype(self):
+        return torch.float32
+
+    @property
+    def token_id_dtype(self):
+        return torch.int64
+
     def forward(
     def forward(
         self,
         self,
         target_probs: torch.Tensor,
         target_probs: torch.Tensor,
@@ -49,24 +60,31 @@ class RejectionSampler(nn.Module):
         """Sample token ids using rejection sampling. This accepts or rejects
         """Sample token ids using rejection sampling. This accepts or rejects
         tokens proposed by the draft model using the probability of each token
         tokens proposed by the draft model using the probability of each token
         according to the draft and target models.
         according to the draft and target models.
+
         In the worst case where all draft tokens are rejected, it is guaranteed
         In the worst case where all draft tokens are rejected, it is guaranteed
         one correct token will be emitted.
         one correct token will be emitted.
+
         In the case where all draft tokens are accepted, a bonus token will be
         In the case where all draft tokens are accepted, a bonus token will be
         accepted as its cheap to have the target model score this speculative
         accepted as its cheap to have the target model score this speculative
         sequence.
         sequence.
+
         Args:
         Args:
             target_probs: The probability distribution over token ids given
             target_probs: The probability distribution over token ids given
                 context according to the target model.
                 context according to the target model.
             shape = [batch_size, num_speculative_tokens, vocab_size]
             shape = [batch_size, num_speculative_tokens, vocab_size]
+
             bonus_token_ids: The "bonus" token ids that are accepted iff all
             bonus_token_ids: The "bonus" token ids that are accepted iff all
                 speculative tokens in a sequence are accepted.
                 speculative tokens in a sequence are accepted.
             shape = [batch_size, num_bonus_tokens]
             shape = [batch_size, num_bonus_tokens]
+
             draft_probs: The probability distribution over token ids given
             draft_probs: The probability distribution over token ids given
                 context according to the draft model.
                 context according to the draft model.
             shape = [batch_size, num_speculative_tokens, vocab_size]
             shape = [batch_size, num_speculative_tokens, vocab_size]
+
             draft_token_ids: The token ids that were sampled from the draft
             draft_token_ids: The token ids that were sampled from the draft
                 probabilities.
                 probabilities.
             shape = [batch_size, num_speculative_tokens]
             shape = [batch_size, num_speculative_tokens]
+
         Returns:
         Returns:
             output_token_ids: The token ids sampled via rejection sampling,
             output_token_ids: The token ids sampled via rejection sampling,
                 or -1 if unable to sample a token because the previous token
                 or -1 if unable to sample a token because the previous token
@@ -107,6 +125,7 @@ class RejectionSampler(nn.Module):
             draft_token_ids: torch.Tensor,  # [batch_size, k]
             draft_token_ids: torch.Tensor,  # [batch_size, k]
     ) -> Tuple[torch.Tensor, torch.Tensor]:
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Perform modified rejection sampling on each sequence.
         """Perform modified rejection sampling on each sequence.
+
         Returns:
         Returns:
             A tuple of two tensors:
             A tuple of two tensors:
             0: A bool tensor of which tokens in each sequence is accepted.
             0: A bool tensor of which tokens in each sequence is accepted.
@@ -139,16 +158,20 @@ class RejectionSampler(nn.Module):
         r"""Create bool matrix over the proposed draft tokens. If
         r"""Create bool matrix over the proposed draft tokens. If
         True, then a token can be accepted, else it should be
         True, then a token can be accepted, else it should be
         rejected.
         rejected.
+
         Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
         Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
         :math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according
         :math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according
         to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
         to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
         same conditional probability according to the draft model, the token
         same conditional probability according to the draft model, the token
         is accepted with probability:
         is accepted with probability:
+
         .. math::
         .. math::
             \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
             \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
                            {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
                            {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
+
         This implementation does not apply causality. When using the output,
         This implementation does not apply causality. When using the output,
         if a token is rejected, subsequent tokens should not be used.
         if a token is rejected, subsequent tokens should not be used.
+
         Returns a bool tensor of shape [batch_size, k] specifying which tokens
         Returns a bool tensor of shape [batch_size, k] specifying which tokens
         are accepted.
         are accepted.
         """
         """
@@ -171,7 +194,8 @@ class RejectionSampler(nn.Module):
                                   device=target_probs.device)
                                   device=target_probs.device)
         capped_ratio = torch.minimum(
         capped_ratio = torch.minimum(
             selected_target_probs / selected_draft_probs,
             selected_target_probs / selected_draft_probs,
-            torch.full((1, ), 1, device=target_probs.device))
+            torch.full((1, ), 1, device=target_probs.device),
+        )
         accepted = uniform_rand < capped_ratio
         accepted = uniform_rand < capped_ratio
 
 
         return accepted
         return accepted
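
The acceptance test above boils down to a couple of tensor ops; a standalone sketch for a single proposed token, using the docstring's notation with q for the target probability and p for the draft probability (the numbers are illustrative):

import torch

q = torch.tensor(0.30)   # target-model probability of the proposed token
p = torch.tensor(0.50)   # draft-model probability of the same token

capped_ratio = torch.minimum(q / p, torch.tensor(1.0))  # min(1, q/p) = 0.6
accepted = torch.rand(()) < capped_ratio                # accept with probability 0.6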
@@ -183,21 +207,26 @@ class RejectionSampler(nn.Module):
     ) -> torch.Tensor:
     ) -> torch.Tensor:
         r"""Create a probability distribution for each proposed token which can
         r"""Create a probability distribution for each proposed token which can
         be sampled if the proposed token is rejected.
         be sampled if the proposed token is rejected.
+
         When this routine is applied sequentially, the true distribution of the
         When this routine is applied sequentially, the true distribution of the
         target model is recovered (within hardware numerics).
         target model is recovered (within hardware numerics).
+
         The probability distribution used in this rejection case is constructed
         The probability distribution used in this rejection case is constructed
         as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of
         as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of
         :math:`x` given context :math:`x_1, \dots, x_n` according to the target
         :math:`x` given context :math:`x_1, \dots, x_n` according to the target
         model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability
         model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability
         according to the draft model:
         according to the draft model:
+
         .. math::
         .. math::
             x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
             x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
+
         where :math:`(f(x))_+` is defined as:
         where :math:`(f(x))_+` is defined as:
+
         .. math::
         .. math::
             (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
             (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
-        See https://github.com/vllm-project/vllm/pull/2336 for a visualization
-        of the draft, target, and recovered probability distributions.
+
         Returns a tensor of shape [batch_size, k, vocab_size].
         Returns a tensor of shape [batch_size, k, vocab_size].
+
         Note: This batches operations on GPU and thus constructs the recovered
         Note: This batches operations on GPU and thus constructs the recovered
         distribution for all tokens, even if they are accepted. This causes
         distribution for all tokens, even if they are accepted. This causes
         division-by-zero errors, so we use self._smallest_positive_value to
         division-by-zero errors, so we use self._smallest_positive_value to
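
A toy illustration of the recovered distribution described above, clamping to the smallest positive normal float32 (the value _smallest_positive_value refers to) so the normalization never divides by zero; the three-token vocabulary is illustrative:

import torch

target = torch.tensor([0.5, 0.3, 0.2])   # q(x | context)
draft = torch.tensor([0.7, 0.2, 0.1])    # p(x | context)

eps = torch.finfo(torch.float32).tiny    # smallest positive normal float32
diff = torch.clamp(target - draft, min=eps)
recovered = diff / diff.sum()            # (q - p)_+ renormalized
# recovered ~ [0.0, 0.5, 0.5]: mass only where the target exceeds the draft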
@@ -208,7 +237,7 @@ class RejectionSampler(nn.Module):
         # shape [batch_size, k, vocab_size]
         # shape [batch_size, k, vocab_size]
         difference = target_probs - draft_probs
         difference = target_probs - draft_probs
 
 
-        # TODO(cade): Can we use logprobs instead of probs, and avoid the
+        # TODO: Can we use logprobs instead of probs, and avoid the
         # division-by-zero errors without introducing distribution drift?
         # division-by-zero errors without introducing distribution drift?
 
 
         # shape [batch_size, k, vocab_size]
         # shape [batch_size, k, vocab_size]
@@ -224,7 +253,9 @@ class RejectionSampler(nn.Module):
         """Return the smallest positive value representable by the probs dtype.
         """Return the smallest positive value representable by the probs dtype.
         This value is used when constructing a distribution from which to sample
         This value is used when constructing a distribution from which to sample
         recovered tokens in the first rejection case.
         recovered tokens in the first rejection case.
+
         See _get_recovered_probs for more details
         See _get_recovered_probs for more details
+
         Note that this isn't actually the smallest positive value representable
         Note that this isn't actually the smallest positive value representable
         by float32, but the smallest positive normal value.
         by float32, but the smallest positive normal value.
         See https://en.wikipedia.org/wiki/Subnormal_number for more information.
         See https://en.wikipedia.org/wiki/Subnormal_number for more information.
@@ -241,6 +272,7 @@ class RejectionSampler(nn.Module):
         """Format output. Returns a matrix of token ids. When
         """Format output. Returns a matrix of token ids. When
         a token is rejected via rejection sampling, all subsequent
         a token is rejected via rejection sampling, all subsequent
         token ids are set to -1 for the sequence.
         token ids are set to -1 for the sequence.
+
         shape = [batch_size, k + num_bonus_tokens]
         shape = [batch_size, k + num_bonus_tokens]
         """
         """
         bonus_token_ids = bonus_token_ids.squeeze()
         bonus_token_ids = bonus_token_ids.squeeze()
@@ -259,7 +291,8 @@ class RejectionSampler(nn.Module):
         output_with_bonus_tokens = -torch.ones(
         output_with_bonus_tokens = -torch.ones(
             (batch_size, k + self._num_bonus_tokens),
             (batch_size, k + self._num_bonus_tokens),
             dtype=self.token_id_dtype,
             dtype=self.token_id_dtype,
-            device=accepted.device)
+            device=accepted.device,
+        )
         output = output_with_bonus_tokens[:, :k]
         output = output_with_bonus_tokens[:, :k]
 
 
         # Fill in the first k columns of the output tensor using masks and data
         # Fill in the first k columns of the output tensor using masks and data
@@ -290,8 +323,11 @@ class RejectionSampler(nn.Module):
         draft_probs: torch.Tensor,
         draft_probs: torch.Tensor,
         draft_token_ids: torch.Tensor,
         draft_token_ids: torch.Tensor,
     ) -> None:
     ) -> None:
-        (target_batch_size, num_target_probs,
-         target_vocab_size) = target_probs.shape
+        (
+            target_batch_size,
+            num_target_probs,
+            target_vocab_size,
+        ) = target_probs.shape
         bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape
         bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape
         draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape
         draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape
         draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape
         draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape
@@ -327,10 +363,13 @@ class RejectionSampler(nn.Module):
         draft_token_ids: torch.Tensor,
         draft_token_ids: torch.Tensor,
     ) -> None:
     ) -> None:
         devices = [
         devices = [
-            t.device for t in
-            [target_probs, bonus_token_ids, draft_probs, draft_token_ids]
+            t.device for t in [
+                target_probs,
+                bonus_token_ids,
+                draft_probs,
+                draft_token_ids,
+            ]
         ]
         ]
-        # pylint: disable=use-a-generator
         assert all([devices[0] == device for device in devices])
         assert all([devices[0] == device for device in devices])
 
 
     def _raise_if_out_of_bounds_vocab(
     def _raise_if_out_of_bounds_vocab(
@@ -358,8 +397,8 @@ def _multinomial(
     if num_samples > 1:
     if num_samples > 1:
         # This is equivalent to torch.repeat_interleaved (which also
         # This is equivalent to torch.repeat_interleaved (which also
         # forces a GPU<->CPU sync).
         # forces a GPU<->CPU sync).
-        probs = probs[:, None, :].expand(probs.shape[0], num_samples,
-                                         probs.shape[1]).contiguous().view(
-                                             -1, probs.shape[1])
+        probs = (probs[:, None, :].expand(probs.shape[0], num_samples,
+                                          probs.shape[1]).contiguous().view(
+                                              -1, probs.shape[1]))
     q = torch.empty_like(probs).exponential_(1.0)
     q = torch.empty_like(probs).exponential_(1.0)
     return probs.div_(q).argmax(dim=1).view(-1, num_samples)
     return probs.div_(q).argmax(dim=1).view(-1, num_samples)
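
The division-by-an-Exponential(1)-draw trick at the end is the exponential-race (Gumbel-max style) form of categorical sampling: taking the argmax of probs / q selects index i with probability probs[i]. A quick sanity check of that claim:

import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.2, 0.7]).expand(100_000, 3).contiguous()

q = torch.empty_like(probs).exponential_(1.0)
samples = probs.div(q).argmax(dim=1)     # same trick as _multinomial above

freqs = torch.bincount(samples, minlength=3).float() / samples.shape[0]
# freqs ~ tensor([0.10, 0.20, 0.70]) up to Monte Carlo noise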

+ 1 - 2
aphrodite/modeling/layers/sampler.py

@@ -802,7 +802,6 @@ def _get_logprobs(
         if (i < sampling_metadata.num_prompts
         if (i < sampling_metadata.num_prompts
                 and sampling_params.prompt_logprobs is not None):
                 and sampling_params.prompt_logprobs is not None):
             num_logprobs = sampling_params.prompt_logprobs
             num_logprobs = sampling_params.prompt_logprobs
-            prompt_len = sampling_metadata.prompt_lens[i]
             prompt_tokens = sampling_metadata.seq_data[
             prompt_tokens = sampling_metadata.seq_data[
                 seq_ids[0]].prompt_token_ids
                 seq_ids[0]].prompt_token_ids
             group_prompt_logprobs: PromptLogprobs = [None]
             group_prompt_logprobs: PromptLogprobs = [None]
@@ -876,7 +875,7 @@ def _build_sampler_output(
                                output_metadata.get(seq_ids[parent_id])))
                                output_metadata.get(seq_ids[parent_id])))
         sampler_output.append(
         sampler_output.append(
             SequenceGroupOutput(seq_outputs, group_prompt_logprobs))
             SequenceGroupOutput(seq_outputs, group_prompt_logprobs))
-    return sampler_output
+    return SamplerOutput(outputs=sampler_output)
 
 
 
 
 def _miro_store_args(seqids: List[int], mus: List[float],
 def _miro_store_args(seqids: List[int], mus: List[float],

+ 62 - 24
aphrodite/modeling/loader.py

@@ -2,18 +2,24 @@
 import contextlib
 import contextlib
 import gc
 import gc
 from contextlib import nullcontext
 from contextlib import nullcontext
-from typing import Optional, Type
+from typing import Type
 from loguru import logger
 from loguru import logger
 
 
 import torch
 import torch
 import torch.nn as nn
 import torch.nn as nn
 
 
-from aphrodite.common.config import DeviceConfig, ModelConfig, LoRAConfig
+from aphrodite.common.config import DeviceConfig, ModelConfig
 from aphrodite.modeling.models import ModelRegistry
 from aphrodite.modeling.models import ModelRegistry
-from aphrodite.modeling.hf_downloader import (get_quant_config,
-                                              initialize_dummy_weights)
+from aphrodite.modeling.hf_downloader import (
+    get_quant_config,
+    initialize_dummy_weights,
+)
 from aphrodite.modeling.layers.quantization.bitsandbytes import (
 from aphrodite.modeling.layers.quantization.bitsandbytes import (
-    BNBLinearMethod, replace_quant_params)
+    BNBLinearMethod,
+    replace_quant_params,
+)
+from aphrodite.modeling.megatron.parallel_state import (
+    get_tensor_model_parallel_world_size, )
 
 
 
 
 @contextlib.contextmanager
 @contextlib.contextmanager
@@ -32,6 +38,7 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:
     if (model_config.quantization is not None
     if (model_config.quantization is not None
             and "MixtralForCausalLM" in architectures):
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]
         architectures = ["QuantMixtralForCausalLM"]
+
     for arch in architectures:
     for arch in architectures:
         model_cls = ModelRegistry.load_model_cls(arch)
         model_cls = ModelRegistry.load_model_cls(arch)
         if model_cls is not None:
         if model_cls is not None:
@@ -41,9 +48,9 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:
         f"Supported architectures: {ModelRegistry.get_supported_archs()}")
         f"Supported architectures: {ModelRegistry.get_supported_archs()}")
 
 
 
 
-def get_model(model_config: ModelConfig,
-              device_config: DeviceConfig,
-              lora_config: Optional[LoRAConfig] = None) -> nn.Module:
+def get_model(model_config: ModelConfig, device_config: DeviceConfig,
+              **kwargs) -> nn.Module:
+    lora_config = kwargs.get("lora_config", None)
     model_class = _get_model_architecture(model_config)
     model_class = _get_model_architecture(model_config)
 
 
     # Get the (maybe quantized) linear method.
     # Get the (maybe quantized) linear method.
@@ -68,9 +75,9 @@ def get_model(model_config: ModelConfig,
     with _set_default_torch_dtype(model_config.dtype):
     with _set_default_torch_dtype(model_config.dtype):
         # Create a model instance.
         # Create a model instance.
         # The weights will be initialized as empty tensors.
         # The weights will be initialized as empty tensors.
-        with torch.device(device_config.device) if not \
-            (isinstance(linear_method, BNBLinearMethod) and
-             linear_method.quant_config.from_float) else nullcontext():
+        with torch.device(device_config.device) if not (
+                isinstance(linear_method, BNBLinearMethod)
+                and linear_method.quant_config.from_float) else nullcontext():
             if hasattr(model_class, "supported_lora_modules"):
             if hasattr(model_class, "supported_lora_modules"):
                 model = model_class(model_config.hf_config, linear_method,
                 model = model_class(model_config.hf_config, linear_method,
                                     lora_config)
                                     lora_config)
@@ -88,23 +95,54 @@ def get_model(model_config: ModelConfig,
             initialize_dummy_weights(model)
             initialize_dummy_weights(model)
         else:
         else:
             # Load the weights from the cached or downloaded files.
             # Load the weights from the cached or downloaded files.
-            model.load_weights(model_config.model, model_config.download_dir,
-                               model_config.load_format, model_config.revision)
+            model.load_weights(
+                model_config.model,
+                model_config.download_dir,
+                model_config.load_format,
+                model_config.revision,
+            )
         if isinstance(linear_method, BNBLinearMethod):
         if isinstance(linear_method, BNBLinearMethod):
-            replace_quant_params(model,
-                                 quant_config=linear_method.quant_config,
-                                 modules_to_not_convert="lm_head")
+            replace_quant_params(
+                model,
+                quant_config=linear_method.quant_config,
+                modules_to_not_convert="lm_head",
+            )
             torch.cuda.synchronize()
             torch.cuda.synchronize()
             if linear_method.quant_config.from_float:
             if linear_method.quant_config.from_float:
                 model = model.cuda()
                 model = model.cuda()
             gc.collect()
             gc.collect()
             torch.cuda.empty_cache()
             torch.cuda.empty_cache()
-            logger.info("Memory allocated for converted model: {} GiB".format(
-                round(
-                    torch.cuda.memory_allocated(torch.cuda.current_device()) /
-                    (1024 * 1024 * 1024), 2)))
-            logger.info("Memory reserved for converted model: {} GiB".format(
-                round(
-                    torch.cuda.memory_reserved(torch.cuda.current_device()) /
-                    (1024 * 1024 * 1024), 2)))
+            tp = get_tensor_model_parallel_world_size()
+            logger.info(
+                "Memory allocated for converted model: {} GiB x {} = {} "
+                "GiB".format(
+                    round(
+                        torch.cuda.memory_allocated(
+                            torch.cuda.current_device()) /
+                        (1024 * 1024 * 1024),
+                        2,
+                    ),
+                    tp,
+                    round(
+                        torch.cuda.memory_allocated(
+                            torch.cuda.current_device()) * tp /
+                        (1024 * 1024 * 1024),
+                        2,
+                    ),
+                ))
+            logger.info(
+                "Memory reserved for converted model: {} GiB x {} = {} "
+                "GiB".format(
+                    round(
+                        torch.cuda.memory_reserved(torch.cuda.current_device())
+                        / (1024 * 1024 * 1024),
+                        2,
+                    ),
+                    tp,
+                    round(
+                        torch.cuda.memory_reserved(torch.cuda.current_device())
+                        * tp / (1024 * 1024 * 1024),
+                        2,
+                    ),
+                ))
     return model.eval()
     return model.eval()
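
The new log lines estimate the total footprint of the converted model by scaling the per-rank figure by the tensor-parallel world size. A compact sketch of the same arithmetic; the helper name is illustrative, and the total is only an approximation since ranks need not allocate identically:

import torch

def converted_model_memory_gib(tp_world_size: int):
    per_gpu = torch.cuda.memory_allocated(
        torch.cuda.current_device()) / (1024 * 1024 * 1024)
    return round(per_gpu, 2), round(per_gpu * tp_world_size, 2)

# e.g. 11.07 GiB per GPU x 2 ranks = 22.14 GiB reported total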

+ 5 - 4
aphrodite/modeling/metadata.py

@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional
 
 
 import torch
 import torch
 
 
@@ -28,7 +28,7 @@ class InputMetadata:
         block_tables: Optional[torch.Tensor],
         block_tables: Optional[torch.Tensor],
         use_cuda_graph: bool,
         use_cuda_graph: bool,
         kv_cache_dtype: str,
         kv_cache_dtype: str,
-        kv_quant_params: List[List[float]],
+        # kv_quant_params: List[List[float]],
     ) -> None:
     ) -> None:
         self.is_prompt = is_prompt
         self.is_prompt = is_prompt
         self.prompt_lens = prompt_lens
         self.prompt_lens = prompt_lens
@@ -40,7 +40,7 @@ class InputMetadata:
         self.block_tables = block_tables
         self.block_tables = block_tables
         self.use_cuda_graph = use_cuda_graph
         self.use_cuda_graph = use_cuda_graph
         self.kv_cache_dtype = kv_cache_dtype
         self.kv_cache_dtype = kv_cache_dtype
-        self.kv_quant_params = kv_quant_params
+        # self.kv_quant_params = kv_quant_params
 
 
         # Set during the execution of the first attention op.
         # Set during the execution of the first attention op.
         # FIXME: This is a hack.
         # FIXME: This is a hack.
@@ -55,4 +55,5 @@ class InputMetadata:
                 f"block_tables={self.block_tables}, "
                 f"block_tables={self.block_tables}, "
                 f"use_cuda_graph={self.use_cuda_graph}, "
                 f"use_cuda_graph={self.use_cuda_graph}, "
                 f"kv_cache_dtype={self.kv_cache_dtype}, "
                 f"kv_cache_dtype={self.kv_cache_dtype}, "
-                f"kv_quant_params={self.kv_quant_params})")
+                # f"kv_quant_params={self.kv_quant_params})"
+                )

+ 3 - 4
aphrodite/modeling/models/baichuan.py

@@ -27,7 +27,7 @@ from torch import nn
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
     LinearMethodBase,
@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
             alibi_slopes = alibi_slopes[head_start:head_end].tolist()
             alibi_slopes = alibi_slopes[head_start:head_end].tolist()
 
 
             scaling = self.head_dim**-0.5
             scaling = self.head_dim**-0.5
-            self.attn = PagedAttention(
+            self.attn = Attention(
                 self.num_heads,
                 self.num_heads,
                 self.head_dim,
                 self.head_dim,
                 scaling,
                 scaling,
@@ -205,8 +205,7 @@ class BaiChuanAttention(nn.Module):
                 is_neox_style=is_neox_style,
                 is_neox_style=is_neox_style,
             )
             )
             self.scaling = self.head_dim**-0.5
             self.scaling = self.head_dim**-0.5
-            self.attn = PagedAttention(self.num_heads, self.head_dim,
-                                       self.scaling)
+            self.attn = Attention(self.num_heads, self.head_dim, self.scaling)
 
 
     def forward(
     def forward(
         self,
         self,

+ 5 - 5
aphrodite/modeling/models/bloom.py

@@ -26,7 +26,7 @@ from transformers import BloomConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               LinearMethodBase,
                                               QKVParallelLinear,
                                               QKVParallelLinear,
@@ -108,10 +108,10 @@ class BloomAttention(nn.Module):
         alibi_slopes = alibi_slopes[head_start:head_end].tolist()
         alibi_slopes = alibi_slopes[head_start:head_end].tolist()
 
 
         scaling = self.head_dim**-0.5
         scaling = self.head_dim**-0.5
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scaling,
-                                   alibi_slopes=alibi_slopes)
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              scaling,
+                              alibi_slopes=alibi_slopes)
 
 
     def forward(
     def forward(
         self,
         self,

+ 2 - 2
aphrodite/modeling/models/chatglm.py

@@ -10,7 +10,7 @@ from torch.nn import LayerNorm
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (LinearMethodBase,
 from aphrodite.modeling.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               MergedColumnParallelLinear,
@@ -87,7 +87,7 @@ class GLMAttention(nn.Module):
             base=10000 * rope_ratio,
             base=10000 * rope_ratio,
             is_neox_style=False,
             is_neox_style=False,
         )
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.num_heads,
             self.head_dim,
             self.head_dim,
             self.scaling,
             self.scaling,

+ 1 - 1
aphrodite/modeling/models/cohere.py

@@ -30,7 +30,7 @@ from transformers import CohereConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention as Attention
+from aphrodite.modeling.layers.attention import Attention as Attention
 from aphrodite.modeling.layers.linear import (
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
     LinearMethodBase,
     MergedColumnParallelLinear,
     MergedColumnParallelLinear,

+ 3 - 3
aphrodite/modeling/models/deepseek.py

@@ -31,8 +31,8 @@ from transformers import PretrainedConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
-from aphrodite.modeling.layers.triton_kernel.fused_moe import fused_moe
+from aphrodite.modeling.layers.attention import Attention
+from aphrodite.modeling.layers.fused_moe.fused_moe import fused_moe
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
     LinearMethodBase,
@@ -249,7 +249,7 @@ class DeepseekAttention(nn.Module):
             base=rope_theta,
             base=rope_theta,
             rope_scaling=rope_scaling,
             rope_scaling=rope_scaling,
         )
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.num_heads,
             self.head_dim,
             self.head_dim,
             self.scaling,
             self.scaling,

+ 14 - 14
aphrodite/modeling/models/falcon.py

@@ -29,7 +29,7 @@ from transformers import FalconConfig as HF_FalconConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               LinearMethodBase,
                                               QKVParallelLinear,
                                               QKVParallelLinear,
@@ -151,10 +151,10 @@ class FalconAttention(nn.Module):
                 max_position=max_position_embeddings,
                 max_position=max_position_embeddings,
                 base=rope_theta,
                 base=rope_theta,
             )
             )
-            self.attn = PagedAttention(self.num_heads,
-                                       self.head_dim,
-                                       self.inv_norm_factor,
-                                       num_kv_heads=self.num_kv_heads)
+            self.attn = Attention(self.num_heads,
+                                  self.head_dim,
+                                  self.inv_norm_factor,
+                                  num_kv_heads=self.num_kv_heads)
         elif self.use_alibi:
             tp_rank = get_tensor_model_parallel_rank()
             head_start = tp_rank * self.num_heads
@@ -162,16 +162,16 @@ class FalconAttention(nn.Module):
             alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
                             self.inv_norm_factor)
             alibi_slopes = alibi_slopes[head_start:head_end].tolist()
-            self.attn = PagedAttention(self.num_heads,
-                                       self.head_dim,
-                                       self.inv_norm_factor,
-                                       num_kv_heads=self.num_kv_heads,
-                                       alibi_slopes=alibi_slopes)
+            self.attn = Attention(self.num_heads,
+                                  self.head_dim,
+                                  self.inv_norm_factor,
+                                  num_kv_heads=self.num_kv_heads,
+                                  alibi_slopes=alibi_slopes)
         else:
-            self.attn = PagedAttention(self.num_heads,
-                                       self.head_dim,
-                                       scale=self.inv_norm_factor,
-                                       num_kv_heads=self.num_kv_heads)
+            self.attn = Attention(self.num_heads,
+                                  self.head_dim,
+                                  scale=self.inv_norm_factor,
+                                  num_kv_heads=self.num_kv_heads)
 
 
     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/gemma.py

@@ -24,7 +24,7 @@ from transformers import GemmaConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import GeluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -181,7 +181,7 @@ class GemmaAttention(nn.Module):
             base=self.rope_theta,
             is_neox_style=True,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,

+ 2 - 4
aphrodite/modeling/models/gpt2.py

@@ -26,7 +26,7 @@ from transformers import GPT2Config
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
@@ -74,9 +74,7 @@ class GPT2Attention(nn.Module):
             bias=True,
             linear_method=linear_method,
         )
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scale)
+        self.attn = Attention(self.num_heads, self.head_dim, scale=self.scale)
 
 
     def forward(
         self,

+ 5 - 5
aphrodite/modeling/models/gpt_bigcode.py

@@ -27,7 +27,7 @@ from transformers import GPTBigCodeConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
@@ -86,10 +86,10 @@ class GPTBigCodeAttention(nn.Module):
             bias=True,
             linear_method=linear_method,
         )
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scale,
-                                   num_kv_heads=self.num_kv_heads)
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              scale=self.scale,
+                              num_kv_heads=self.num_kv_heads)
 
 
     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/gpt_j.py

@@ -26,7 +26,7 @@ from transformers import GPTJConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (
     ColumnParallelLinear,
     LinearMethodBase,
@@ -117,7 +117,7 @@ class GPTJAttention(nn.Module):
             base=rope_theta,
             is_neox_style=False,
         )
-        self.attn = PagedAttention(self.num_heads, self.head_size, scaling)
+        self.attn = Attention(self.num_heads, self.head_size, scaling)

     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/gpt_neox.py

@@ -26,7 +26,7 @@ from transformers import GPTNeoXConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (
     ColumnParallelLinear,
     LinearMethodBase,
@@ -99,7 +99,7 @@ class GPTNeoXAttention(nn.Module):
             base=rope_theta,
             is_neox_style=is_neox_style,
         )
-        self.attn = PagedAttention(self.num_heads, self.head_size, scaling)
+        self.attn = Attention(self.num_heads, self.head_size, scaling)

     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/internlm2.py

@@ -7,7 +7,7 @@ from transformers import PretrainedConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -147,7 +147,7 @@ class InternLM2Attention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,

+ 16 - 9
aphrodite/modeling/models/llama.py

@@ -30,7 +30,7 @@ from transformers import LlamaConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -199,7 +199,7 @@ class LlamaAttention(nn.Module):
             rope_scaling=rope_scaling,
             is_neox_style=is_neox_style,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,
@@ -213,7 +213,7 @@ class LlamaAttention(nn.Module):
         hidden_states: torch.Tensor,
         kv_cache: KVCache,
         input_metadata: InputMetadata,
-        kv_quant_param: List[float],
+        # kv_quant_param: List[float],
     ) -> torch.Tensor:
         if self.merge_weight:
             qkv, _ = self.qkv_proj(hidden_states)
@@ -225,8 +225,15 @@ class LlamaAttention(nn.Module):
             v, _ = self.v_proj(hidden_states)
         q, k = self.rotary_emb(positions, q, k)
         k_cache, v_cache = kv_cache
-        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
-                                kv_quant_param)
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            k_cache,
+            v_cache,
+            input_metadata,
+            # kv_quant_param
+        )
         output, _ = self.o_proj(attn_output)
         return output

@@ -279,7 +286,7 @@ class LlamaDecoderLayer(nn.Module):
         kv_cache: KVCache,
         input_metadata: InputMetadata,
         residual: Optional[torch.Tensor],
-        kv_quant_param: List[float],
+        # kv_quant_param: List[float],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -293,7 +300,7 @@ class LlamaDecoderLayer(nn.Module):
             hidden_states=hidden_states,
             kv_cache=kv_cache,
             input_metadata=input_metadata,
-            kv_quant_param=kv_quant_param,
+            # kv_quant_param=kv_quant_param,
         )

         # Fully Connected
@@ -347,8 +354,8 @@ class LlamaModel(nn.Module):
                 kv_caches[i],
                 input_metadata,
                 residual,
-                input_metadata.kv_quant_params[i]
-                if input_metadata.kv_quant_params is not None else None,
+                # input_metadata.kv_quant_params[i]
+                # if input_metadata.kv_quant_params is not None else None,
             )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states

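Taken together, the model-file hunks above reduce to one mechanical pattern: swap PagedAttention for the backend-agnostic Attention wrapper and stop threading kv_quant_param through the call chain (it is commented out in this sync). Below is a minimal sketch of the resulting attention block, assuming only the constructor and call signature shown in the llama.py hunk; the class and variable names are illustrative, not an excerpt from the diff.

import torch
from torch import nn

from aphrodite.modeling.layers.attention import Attention
from aphrodite.modeling.metadata import InputMetadata


class ToyAttentionBlock(nn.Module):
    """Illustrative module mirroring the migrated pattern (not part of the diff)."""

    def __init__(self, num_heads: int, head_dim: int, num_kv_heads: int) -> None:
        super().__init__()
        scaling = head_dim**-0.5
        # The Attention wrapper now selects a backend internally; model code
        # no longer imports PagedAttention directly.
        self.attn = Attention(num_heads,
                              head_dim,
                              scaling,
                              num_kv_heads=num_kv_heads)

    def forward(self, q, k, v, kv_cache, input_metadata: InputMetadata):
        k_cache, v_cache = kv_cache
        # kv_quant_param is no longer passed through the call chain.
        return self.attn(q, k, v, k_cache, v_cache, input_metadata)
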
+ 3 - 3
aphrodite/modeling/models/mixtral.py

@@ -31,8 +31,8 @@ from transformers import MixtralConfig
 
 
 from aphrodite.common.config import LoRAConfig
 from aphrodite.modeling.metadata import InputMetadata
-from aphrodite.modeling.layers.attention import PagedAttention
-from aphrodite.modeling.layers.triton_kernel.fused_moe import fused_moe
+from aphrodite.modeling.layers.attention import Attention
+from aphrodite.modeling.layers.fused_moe.fused_moe import fused_moe
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -256,7 +256,7 @@ class MixtralAttention(nn.Module):
             base=int(self.rope_theta),
             is_neox_style=is_neox_style,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,

+ 2 - 2
aphrodite/modeling/models/mixtral_quant.py

@@ -34,7 +34,7 @@ from torch import nn
 from transformers import MixtralConfig

 from aphrodite.modeling.metadata import InputMetadata
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -259,7 +259,7 @@ class MixtralAttention(nn.Module):
             base=int(self.rope_theta),
             is_neox_style=True,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,

+ 6 - 6
aphrodite/modeling/models/mpt.py

@@ -8,7 +8,7 @@ import torch.nn as nn
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
@@ -105,11 +105,11 @@ class MPTAttention(nn.Module):
 
 
         self.head_dim = self.d_model // self.total_num_heads
         scaling = self.head_dim**-0.5
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scaling,
-                                   alibi_slopes=alibi_slopes,
-                                   num_kv_heads=self.num_kv_heads)
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              scaling,
+                              alibi_slopes=alibi_slopes,
+                              num_kv_heads=self.num_kv_heads)
 
 
     def forward(
         self,

+ 79 - 0
aphrodite/modeling/models/neuron/llama.py

@@ -0,0 +1,79 @@
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+import os
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from aphrodite.modeling.metadata import InputMetadata
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.common.sequence import SamplerOutput
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class LlamaForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        linear_method=None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = None
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        with torch.inference_mode():
+            block_size = self.model.context_buckets[-1]
+            if input_metadata.is_prompt:
+                seq_ids = input_metadata.slot_mapping[:, 0] // block_size
+            else:
+                seq_ids = input_metadata.block_tables
+            logits = self.model(input_ids,
+                                cache_ids=positions,
+                                start_ids=seq_ids.flatten())
+        return logits
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(self.model.chkpt_model.lm_head,
+                                   hidden_states, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     load_format: str = "auto",
+                     revision: Optional[str] = None,
+                     **kwargs):
+        from transformers_neuronx.llama.model import LlamaForSampling
+
+        split_model_dir = f"{model_name_or_path}-split"
+        if os.path.isdir(os.path.join(model_name_or_path,
+                                      "pytorch_model.bin")):
+            split_model_dir = model_name_or_path
+        elif not os.path.exists(f"{model_name_or_path}-split"):
+            from transformers.models.llama import LlamaForCausalLM
+            from transformers_neuronx.module import save_pretrained_split
+
+            hf_model = LlamaForCausalLM.from_pretrained(model_name_or_path,
+                                                        low_cpu_mem_usage=True)
+            save_pretrained_split(hf_model, f"{model_name_or_path}-split")
+
+        self.model = LlamaForSampling.from_pretrained(split_model_dir,
+                                                      **kwargs)
+        self.model.to_neuron()

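The load path above leans on transformers_neuronx's split-checkpoint convention: if no "<model>-split" directory exists, the HF checkpoint is first written out with save_pretrained_split and then compiled for the NeuronCores. A condensed sketch of that flow under the same assumptions, with a placeholder model path and example kwargs:

import os

from transformers.models.llama import LlamaForCausalLM as HFLlamaForCausalLM
from transformers_neuronx.llama.model import LlamaForSampling
from transformers_neuronx.module import save_pretrained_split

model_path = "/models/llama-7b"      # placeholder path
split_dir = f"{model_path}-split"

if not os.path.exists(split_dir):
    # One-time conversion of the HF checkpoint into the split layout.
    hf_model = HFLlamaForCausalLM.from_pretrained(model_path,
                                                  low_cpu_mem_usage=True)
    save_pretrained_split(hf_model, split_dir)

# Compile for Neuron; kwargs such as tp_degree/amp are supplied by the loader
# (see neuron_loader.py below) and are placeholders here.
neuron_model = LlamaForSampling.from_pretrained(split_dir, tp_degree=2, amp="f16")
neuron_model.to_neuron()
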
+ 4 - 4
aphrodite/modeling/models/olmo.py

@@ -45,7 +45,7 @@ import torch.nn.functional as F
 from torch import nn

 from aphrodite.modeling.metadata import InputMetadata
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (
     ColumnParallelLinear,
     LinearMethodBase,
@@ -131,9 +131,9 @@ class OlmoAttention(nn.Module):
                 base=rope_theta,
             )
         self.scaling = self.head_dim**-0.5
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scaling)
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              scale=self.scaling)
 
 
         # Attention output projection.
         self.attn_out = RowParallelLinear(

+ 4 - 4
aphrodite/modeling/models/opt.py

@@ -27,7 +27,7 @@ from transformers import OPTConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (
     ColumnParallelLinear,
     LinearMethodBase,
@@ -114,9 +114,9 @@ class OPTAttention(nn.Module):
             bias=bias,
             linear_method=linear_method,
         )
-        self.attn = PagedAttention(self.num_heads,
-                                   self.head_dim,
-                                   scale=self.scaling)
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              scale=self.scaling)
 
 
     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/phi.py

@@ -45,7 +45,7 @@ from transformers import PretrainedConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import get_act_fn
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (
     ColumnParallelLinear,
     LinearMethodBase,
@@ -145,7 +145,7 @@ class PhiAttention(nn.Module):
             base=rope_theta,
             is_neox_style=is_neox_style,
         )
-        self.attn = PagedAttention(self.num_heads, self.head_size, scaling)
+        self.attn = Attention(self.num_heads, self.head_size, scaling)

     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/qwen.py

@@ -12,7 +12,7 @@ from torch import nn
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -142,7 +142,7 @@ class QWenAttention(nn.Module):
             rope_scaling=rope_scaling,
             is_neox_style=is_neox_style,
         )
-        self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling)
+        self.attn = Attention(self.num_heads, self.head_dim, self.scaling)

     def forward(
         self,

+ 2 - 2
aphrodite/modeling/models/qwen2.py

@@ -32,7 +32,7 @@ from transformers import Qwen2Config
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
@@ -193,7 +193,7 @@ class Qwen2Attention(nn.Module):
             max_position=max_position,
             base=self.rope_theta,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,

+ 2 - 2
aphrodite/modeling/models/stablelm.py

@@ -28,7 +28,7 @@ from transformers import PretrainedConfig
 
 
 from aphrodite.modeling.metadata import InputMetadata
 from aphrodite.modeling.layers.activation import SiluAndMul
-from aphrodite.modeling.layers.attention import PagedAttention
+from aphrodite.modeling.layers.attention import Attention
 from aphrodite.modeling.layers.linear import (
     LinearMethodBase,
     MergedColumnParallelLinear,
@@ -188,7 +188,7 @@ class StablelmAttention(nn.Module):
             max_position=self.config.max_position_embeddings,
             base=self.config.rope_theta,
         )
-        self.attn = PagedAttention(
+        self.attn = Attention(
             self.num_heads,
             self.head_dim,
             self.scaling,

+ 70 - 0
aphrodite/modeling/neuron_loader.py

@@ -0,0 +1,70 @@
+"""Utilities for selecting and loading models."""
+from typing import Type
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from aphrodite.common.config import ModelConfig, DeviceConfig
+from aphrodite.modeling.models import ModelRegistry
+
+TORCH_DTYPE_TO_NEURON_AMP = {
+    "auto": "f32",
+    "half": "f16",
+    "float16": "f16",
+    "bfloat16": "bf16",
+    "float": "f32",
+    "float32": "f32",
+    torch.float16: "f16",
+    torch.bfloat16: "bf16",
+    torch.float32: "f32",
+}
+
+
+def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def get_model(model_config: ModelConfig, device_config: DeviceConfig,
+              **kwargs) -> nn.Module:
+    from transformers_neuronx.config import (
+        NeuronConfig,
+        ContinuousBatchingConfig,
+    )
+
+    parallel_config = kwargs.get("parallel_config")
+    scheduler_config = kwargs.get("scheduler_config")
+
+    model_class = _get_model_architecture(model_config.hf_config)
+    linear_method = None
+
+    # Create a model instance.
+    model = model_class(model_config.hf_config, linear_method)
+
+    continuous_batching_config = ContinuousBatchingConfig(
+        batch_size_for_shared_caches=scheduler_config.max_num_seqs)
+    neuron_config = NeuronConfig(
+        continuous_batching=continuous_batching_config)
+
+    # Load the weights from the cached or downloaded files.
+    model.load_weights(
+        model_config.model,
+        model_config.download_dir,
+        model_config.load_format,
+        model_config.revision,
+        tp_degree=parallel_config.neuron_tp_degree,
+        amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
+        neuron_config=neuron_config,
+        context_length_estimate=[scheduler_config.max_model_len],
+        n_positions=[scheduler_config.max_model_len],
+        batch_size=scheduler_config.max_num_seqs,
+    )
+
+    return model.eval()

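Two details in this loader are easy to miss: requested dtypes are translated through TORCH_DTYPE_TO_NEURON_AMP, and the continuous-batching cache size is pinned to scheduler_config.max_num_seqs. A rough illustration of the keyword set the loader forwards to load_weights, with placeholder values standing in for the config objects above:

from transformers_neuronx.config import ContinuousBatchingConfig, NeuronConfig

max_num_seqs = 8        # placeholder for scheduler_config.max_num_seqs
max_model_len = 4096    # placeholder for scheduler_config.max_model_len

neuron_config = NeuronConfig(
    continuous_batching=ContinuousBatchingConfig(
        batch_size_for_shared_caches=max_num_seqs))

load_kwargs = dict(
    tp_degree=2,                           # parallel_config.neuron_tp_degree
    amp="f16",                             # TORCH_DTYPE_TO_NEURON_AMP[dtype]
    neuron_config=neuron_config,
    context_length_estimate=[max_model_len],
    n_positions=[max_model_len],
    batch_size=max_num_seqs,
)
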
+ 2 - 2
aphrodite/modeling/sampling_metadata.py

@@ -5,7 +5,7 @@ import torch
 
 
 from aphrodite.common.sampling_params import SamplingParams, SamplingType
 from aphrodite.common.sequence import SequenceData
-from aphrodite.common.utils import in_wsl
+from aphrodite.common.utils import in_wsl, is_neuron

 _SAMPLING_EPS = 1e-5
 
 
@@ -292,7 +292,7 @@ class SamplingTensors:
                    dtype: torch.dtype) -> "SamplingTensors":
         # Note that the performance will be very bad without
         # pinned memory.
-        pin_memory = not in_wsl()
+        pin_memory = not in_wsl() and not is_neuron()
         prompt_max_len = max(len(tokens) for tokens in prompt_tokens)
         prompt_padded_tokens = [
             tokens + [vocab_size] * (prompt_max_len - len(tokens))

+ 17 - 0
aphrodite/modeling/utils.py

@@ -1,10 +1,18 @@
 """Utils for model executor."""
 import random
+import importlib
 from typing import Any, Dict, Optional

 import numpy as np
 import torch

+from aphrodite.common.config import DeviceConfig, ModelConfig
+
+DEVICE_TO_MODEL_LOADER_MAP = {
+    "cuda": "loader",
+    "neuron": "neuron_loader",
+}
+
 
 
 def set_random_seed(seed: int) -> None:
     random.seed(seed)
@@ -33,3 +41,12 @@ def set_weight_attrs(
         assert not hasattr(
             weight, key), (f"Overwriting existing tensor attribute: {key}")
         setattr(weight, key, value)
+
+
+def get_model(model_config: ModelConfig, device_config: DeviceConfig,
+              **kwargs) -> torch.nn.Module:
+    model_loader_module = DEVICE_TO_MODEL_LOADER_MAP[device_config.device_type]
+    imported_model_loader = importlib.import_module(
+        f"aphrodite.modeling.{model_loader_module}")
+    get_model_fn = imported_model_loader.get_model
+    return get_model_fn(model_config, device_config, **kwargs)

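Model loading is now resolved per device type rather than hard-coded: DEVICE_TO_MODEL_LOADER_MAP maps "cuda" to aphrodite.modeling.loader and "neuron" to aphrodite.modeling.neuron_loader, and the new get_model imports the chosen module lazily so CUDA-only installs never touch the Neuron stack. A hedged usage sketch follows; the wrapper and its keyword names are illustrative, and config construction is elided:

from aphrodite.common.config import DeviceConfig, ModelConfig
from aphrodite.modeling.utils import get_model


def load_for_device(model_config: ModelConfig, device_config: DeviceConfig,
                    **loader_kwargs):
    # device_config.device_type ("cuda" or "neuron") picks the loader module;
    # extra kwargs such as parallel_config/scheduler_config are what the
    # Neuron loader reads via kwargs.get(...).
    return get_model(model_config, device_config, **loader_kwargs)
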
+ 39 - 48
aphrodite/processing/block_manager.py

@@ -1,7 +1,6 @@
 """A block manager that manages token blocks."""
-
 import enum
-from itertools import count
+from itertools import count, takewhile
 from os.path import commonprefix
 from typing import Dict, List, Optional, Set, Tuple

@@ -19,14 +18,12 @@ class BlockAllocator:
     the reference count becomes zero, the block is added back to the free list.
     """

-    def __init__(
-        self,
-        device: Device,
-        block_size: int,
-        num_blocks: int,
-        eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
-        enable_caching: bool = False,
-    ) -> None:
+    def __init__(self,
+                 device: Device,
+                 block_size: int,
+                 num_blocks: int,
+                 eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
+                 enable_caching: bool = False) -> None:
         self.device = device
         self.block_size = block_size
         self.num_blocks = num_blocks
@@ -49,13 +46,11 @@ class BlockAllocator:
             block.block_hash = block_hash
             block.num_hashed_tokens = num_hashed_tokens
             return block
-        block = PhysicalTokenBlock(
-            device=self.device,
-            block_number=self.current_num_blocks,
-            block_size=self.block_size,
-            block_hash=block_hash,
-            num_hashed_tokens=num_hashed_tokens,
-        )
+        block = PhysicalTokenBlock(device=self.device,
+                                   block_number=self.current_num_blocks,
+                                   block_size=self.block_size,
+                                   block_hash=block_hash,
+                                   num_hashed_tokens=num_hashed_tokens)
         self.current_num_blocks += 1
         return block

@@ -126,7 +121,6 @@ class AllocStatus(enum.Enum):
     3. Never: seq_group can never be allocated.
       The seq_group is too large to allocated in GPU.
     """
-
     OK = enum.auto()
     LATER = enum.auto()
     NEVER = enum.auto()
@@ -150,10 +144,8 @@ class BlockSpaceManager:
 
 
         self.block_sliding_window = None
         if sliding_window is not None:
-            assert sliding_window % block_size == 0, (
-                sliding_window,
-                block_size,
-            )
+            assert sliding_window % block_size == 0, (sliding_window,
+                                                      block_size)
             self.block_sliding_window = sliding_window // block_size

         self.watermark = watermark
@@ -162,23 +154,19 @@ class BlockSpaceManager:
         self.enable_caching = enable_caching

         self.watermark_blocks = int(watermark * num_gpu_blocks)
-        self.gpu_allocator = BlockAllocator(
-            Device.GPU,
-            block_size,
-            num_gpu_blocks,
-            enable_caching=enable_caching,
-        )
-        self.cpu_allocator = BlockAllocator(
-            Device.CPU,
-            block_size,
-            num_cpu_blocks,
-            enable_caching=enable_caching,
-        )
+        self.gpu_allocator = BlockAllocator(Device.GPU,
+                                            block_size,
+                                            num_gpu_blocks,
+                                            enable_caching=enable_caching)
+        self.cpu_allocator = BlockAllocator(Device.CPU,
+                                            block_size,
+                                            num_cpu_blocks,
+                                            enable_caching=enable_caching)
         # Mapping: seq_id -> BlockTable.
         self.block_tables: Dict[int, BlockTable] = {}

     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
-        # FIXME(woosuk): Here we assume that all sequences in the group share
+        # FIXME: Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
         seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
         num_required_blocks = len(seq.logical_token_blocks)
@@ -213,8 +201,7 @@ class BlockSpaceManager:
             else:
                 block = self.gpu_allocator.allocate(
                     seq.hash_of_block(logical_idx),
-                    seq.num_hashed_tokens_of_block(logical_idx),
-                )
+                    seq.num_hashed_tokens_of_block(logical_idx))
                 block_table.append(block)

         # Assign the block table for each sequence.
@@ -444,23 +431,29 @@ class BlockSpaceManager:
         for block in block_table:
             block.last_accessed = access_time

-    def compute_last_full_block_in_seq(self, seq: Sequence):
+    def compute_full_blocks_in_seq(self, seq: Sequence):
         if seq.seq_id not in self.block_tables:
             return
         max_full_block = seq.get_len() // self.block_size - 1
         block_table = self.block_tables[seq.seq_id]
         if max_full_block == -1:
             return
-        block_table[max_full_block].computed = True
+        for i in reversed(range(max_full_block)):
+            if block_table[i].computed:
+                break
+            block_table[i].computed = True
 
 
-    def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]:
+    def get_all_computed_blocks(self, seq: Sequence) -> List[int]:
         if seq.seq_id not in self.block_tables:
             return []
         block_table = self.block_tables[seq.seq_id]
-        for block_idx in reversed(range(len(block_table))):
-            if block_table[block_idx].computed:
-                return [b.block_number for b in block_table[:block_idx + 1]]
-        return []
+        # NOTE We exclude the last block to avoid the case where the entire
+        # prompt is cached. This would cause erroneous behavior in model
+        # runner.
+        return [
+            b.block_number
+            for b in takewhile(lambda b: b.computed, block_table[:-1])
+        ]
 
 
     def get_common_computed_block_ids(self,
                                       seq_group: SequenceGroup) -> List[int]:
@@ -469,14 +462,12 @@ class BlockSpaceManager:
             return []

         ids_list = [
-            self.get_all_block_ids_till_computed(seq)
+            self.get_all_computed_blocks(seq)
             for seq in iter(seq_group.seqs_dict.values())
         ]
         return commonprefix([ids for ids in ids_list if ids != []])

     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
-        # NOTE: We only mark the last full block because with prefix caching,
-        # all blocks until the marked one are guaranteed to be computed.
         if self.enable_caching:
             for seq in seq_group.seqs_dict.values():
-                self.compute_last_full_block_in_seq(seq)
+                self.compute_full_blocks_in_seq(seq)

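The prefix-caching bookkeeping changes shape here: compute_full_blocks_in_seq now marks every full block (walking backwards and stopping at the first block already marked computed), and get_all_computed_blocks returns the leading run of computed blocks while deliberately excluding the last block, so a fully cached prompt still leaves work for the model runner. A toy illustration of the takewhile behaviour, using a stand-in block class rather than the real PhysicalTokenBlock:

from dataclasses import dataclass
from itertools import takewhile


@dataclass
class FakeBlock:  # stand-in for PhysicalTokenBlock
    block_number: int
    computed: bool


block_table = [FakeBlock(0, True), FakeBlock(1, True),
               FakeBlock(2, False), FakeBlock(3, True)]

# Mirror of get_all_computed_blocks: leading computed blocks, last block excluded.
computed_ids = [
    b.block_number for b in takewhile(lambda b: b.computed, block_table[:-1])
]
assert computed_ids == [0, 1]
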
+ 2 - 3
aphrodite/processing/evictor.py

@@ -7,8 +7,9 @@ from aphrodite.common.block import PhysicalTokenBlock
 
 
 class EvictionPolicy(enum.Enum):
     """Enum for eviction policy used by make_evictor to instantiate the correct
-       Evictor subclass.
+    Evictor subclass.
     """
+
     LRU = enum.auto()
     FIFO = enum.auto()
 
 
@@ -115,7 +116,6 @@ class LRUEvictor(Evictor):
         return block

     @property
-    # pylint: disable=invalid-overridden-method
     def num_blocks(self) -> int:
         return len(self.free_table)

@@ -149,7 +149,6 @@ class RandomEvictor(Evictor):
         return block

     @property
-    # pylint: disable=invalid-overridden-method
     def num_blocks(self) -> int:
         return len(self.free_table)


+ 1 - 4
aphrodite/processing/scheduler.py

@@ -65,10 +65,7 @@ class SchedulerOutputs:
     def _sort_by_lora_ids(self) -> bool:
         self.scheduled_seq_groups = sorted(
             self.scheduled_seq_groups,
-            key=lambda g: (
-                g.lora_request.lora_int_id if g.lora_request else 0,
-                g.request_id,
-            ),
+            key=lambda g: (g.lora_int_id, g.request_id),
         )

     @property

+ 398 - 0
aphrodite/spec_decode/batch_expansion.py

@@ -0,0 +1,398 @@
+from typing import Iterator, List, Tuple, Optional, Dict
+from itertools import chain, count
+
+import torch
+
+from aphrodite.common.sequence import (
+    SamplerOutput,
+    SequenceGroupMetadata,
+    SequenceData,
+)
+from aphrodite.task_handler.worker import Worker
+from aphrodite.spec_decode.util import (
+    nvtx_range,
+    sampler_output_to_torch,
+    get_all_seq_ids,
+    split_batch_by_proposal_len,
+)
+from aphrodite.spec_decode.interfaces import (
+    SpeculativeScorer,
+    SpeculativeProposals,
+    SpeculativeScores,
+)
+
+SeqId = int
+TargetSeqId = int
+TokenId = int
+
+
+class BatchExpansionTop1Scorer(SpeculativeScorer):
+    """Implements a speculative scorer that uses batch expansion to get
+    probabilities of speculative tokens according to the scoring model.
+
+    Batch expansion converts a list of sequences and multiple query positions
+    to a new batch of sequences, each with a single query position. This allows
+    for MQA-like scoring in speculative decoding without requiring an MQA
+    kernel.
+
+    It is strictly less efficient than MQA scoring.
+
+    It only supports scoring the top1 proposal tokens of the proposer, instead
+    of topk/tree.
+    """
+
+    def __init__(self, scorer_worker: Worker, device: str, vocab_size: int):
+        self._scorer_worker = scorer_worker
+        self._device = device
+        self._vocab_size = vocab_size
+
+    @nvtx_range("BatchExpansionTop1Scorer.score_proposals")
+    def score_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Optional[Dict[int, int]],
+        blocks_to_swap_out: Optional[Dict[int, int]],
+        blocks_to_copy: Optional[Dict[int, List[int]]],
+        k: int,
+        proposals: SpeculativeProposals,
+    ) -> SpeculativeScores:
+        """Score the proposed tokens via the scorer model.
+
+        This converts each input sequence to a set of k+1 target sequences. The
+        target sequences have the unique continuations to be scored and a
+        unique sequence ID that is different from all input sequence ids.
+
+        If a speculative sequence length would exceed the max model length, then
+        no speculation is produced for that sequence.
+
+        Args:
+            seq_group_metadata_list: The input sequence group metadata.
+            blocks_to_swap_in: This is passed to the worker during scoring.
+            blocks_to_swap_out: This is passed to the worker during scoring.
+            blocks_to_copy: This is passed to the worker during scoring.
+            k: The fixed proposal length.
+            proposals: The speculative proposals to score.
+        Returns:
+            SpeculativeScores: The scores of each speculative token, along with
+                which sequences were ignored during scoring.
+        """
+
+        # TODO: perform this on GPU to remove blocking call.
+        proposal_lens_list = proposals.proposal_lens.tolist()
+        proposal_token_ids_list = proposals.proposal_token_ids.tolist()
+
+        (
+            spec_indices,
+            non_spec_indices,
+            target_seq_group_metadata_list,
+            num_scoring_tokens,
+        ) = self._expand_batch(
+            seq_group_metadata_list=seq_group_metadata_list,
+            proposal_token_ids_list=proposal_token_ids_list,
+            proposal_lens_list=proposal_lens_list,
+        )
+
+        target_sampler_output = self._scorer_worker.execute_model(
+            seq_group_metadata_list=target_seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+            return_python_output=False,
+        )
+
+        all_tokens, all_probs = self._contract_batch(
+            original_bs=len(seq_group_metadata_list),
+            target_sampler_output=target_sampler_output,
+            proposals=proposals,
+            num_scoring_tokens=num_scoring_tokens,
+            non_spec_indices=non_spec_indices,
+            spec_indices=spec_indices,
+            k=k,
+        )
+
+        return SpeculativeScores(
+            probs=all_probs,
+            token_ids=all_tokens,
+        )
+
+    def _expand_batch(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        proposal_token_ids_list: List[TokenId],
+        proposal_lens_list: List[int],
+    ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]:
+        """Given the input sequences and potentially multiple corresponding
+        proposal tokens, create a new batch where each sequence has a single
+        query token.
+        """
+
+        # Aphrodite currently only supports proposal lens equal to zero or the
+        # batch proposal len. This adds some complexity (splitting the batch
+        # into spec and non spec sequences) and should be removed in the
+        # future. It can be done by supporting per-sequence proposal lens.
+        spec_seqs, spec_indices = split_batch_by_proposal_len(
+            seq_group_metadata_list,
+            proposal_lens_list,
+            select_proposal_len_zero=False,
+        )
+        non_spec_seqs, non_spec_indices = split_batch_by_proposal_len(
+            seq_group_metadata_list,
+            proposal_lens_list,
+            select_proposal_len_zero=True,
+        )
+
+        target_seq_group_metadata_list = self._create_scoring_model_input(
+            spec_seqs, proposal_token_ids_list)
+        num_scoring_tokens = len(target_seq_group_metadata_list)
+        target_seq_group_metadata_list.extend(non_spec_seqs)
+
+        return (
+            spec_indices,
+            non_spec_indices,
+            target_seq_group_metadata_list,
+            num_scoring_tokens,
+        )
+
+    def _contract_batch(
+        self,
+        original_bs: int,
+        target_sampler_output: List[SamplerOutput],
+        proposals: SpeculativeProposals,
+        num_scoring_tokens: int,
+        non_spec_indices: List[int],
+        spec_indices: List[int],
+        k: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Contract the expanded batch back into its original size.
+        This maps the scores of speculative tokens back to their original
+        sequences.
+        """
+        (
+            target_token_ids,
+            target_probs,
+            non_spec_target_token_ids,
+            non_spec_target_probs,
+        ) = self._split_scoring_output(target_sampler_output,
+                                       num_scoring_tokens)
+
+        # Map distinct sequences used to score each token
+        # of shape [batch_size * k + 1] back to [batch_size, k + 1].
+        batch_size, k = proposals.proposal_token_ids.shape
+
+        target_token_ids = target_token_ids.squeeze().reshape(
+            batch_size, k + 1)
+        target_probs = target_probs.squeeze().reshape(batch_size, k + 1,
+                                                      self._vocab_size)
+
+        all_tokens = torch.full(
+            size=(original_bs, k + 1),
+            fill_value=-1,
+            device=self._device,
+            dtype=torch.long,
+        )
+        all_probs = torch.zeros(
+            original_bs,
+            k + 1,
+            self._vocab_size,
+            device=self._device,
+            dtype=torch.float32,
+        )
+
+        if non_spec_indices:
+            all_tokens[non_spec_indices, 0] = non_spec_target_token_ids
+            all_probs[non_spec_indices, :1, :] = non_spec_target_probs
+
+        if spec_indices:
+            all_tokens[spec_indices] = target_token_ids
+            all_probs[spec_indices] = target_probs
+
+        return all_tokens, all_probs
+
+    def _create_scoring_model_input(
+            self,
+            seq_group_metadata_list: List[SequenceGroupMetadata],
+            proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
+    ) -> List[SequenceGroupMetadata]:
+        """Given the original input sequences and proposed tokens from the draft
+        model, create a list of target sequences that can be used for scoring.
+        """
+
+        if not seq_group_metadata_list:
+            return []
+
+        target_seq_ids_iter = self._create_target_seq_id_iterator(
+            get_all_seq_ids(seq_group_metadata_list))
+
+        target_seq_group_metadata = list(
+            chain.from_iterable(
+                self._create_target_seq_group_metadata(
+                    seq_group_metadata,
+                    proposal_token_ids,
+                    i,
+                    target_seq_ids_iter,
+                ) for i, seq_group_metadata in enumerate(
+                    seq_group_metadata_list)))
+
+        return target_seq_group_metadata
+
+    def _create_target_seq_group_metadata(
+        self,
+        input_seq_group_metadata: SequenceGroupMetadata,
+        proposal_token_ids: List[TokenId],  # shape: [batch_size, k]
+        batch_index: int,
+        target_seq_ids_iter: Iterator[TargetSeqId],
+    ) -> List[SequenceGroupMetadata]:
+        """Given an input sequence group metadata and a list of draft tokens,
+        create a list of target SequenceGroupMetadata, one for each
+        token id that needs to be scored.
+
+        Naive speculative decoding requires K target model scores, one for each
+        draft model token. However one can add a bonus token such that if each
+        token is accepted, then a final token may be sampled from the model.
+        This function creates K+1 target SequenceGroupMetadata to take
+        advantage of the bonus token.
+        """
+        assert not input_seq_group_metadata.is_prompt, (
+            "Speculating on "
+            "prompts not yet supported")
+        assert len(input_seq_group_metadata.seq_data) == 1, (
+            "Beam search "
+            "not supported in speculative decoding")
+        input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys()))
+
+        token_ids_to_score = self._get_token_ids_to_score(
+            proposal_token_ids[batch_index])
+
+        target_seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        for token_ids in token_ids_to_score:
+            target_seq_group_metadata_list.append(
+                self._create_single_target_seq_group_metadata(
+                    input_seq_group_metadata,
+                    input_seq_id,
+                    next(target_seq_ids_iter),
+                    token_ids,
+                ))
+
+        return target_seq_group_metadata_list
+
+    def _create_single_target_seq_group_metadata(
+        self,
+        seq_group_metadata: SequenceGroupMetadata,
+        seq_id: SeqId,
+        target_seq_id: TargetSeqId,
+        token_ids: List[TokenId],
+    ) -> SequenceGroupMetadata:
+        """Create a single target SequenceGroupMetadata.
+
+        Args:
+            seq_group_metadata: The metadata for the input sequence.
+            seq_id: The input sequence ID.
+            target_seq_id: The corresponding target sequence ID.
+            token_ids: The list of token ids that are to be appended to the
+                input sequence.
+        """
+        seq_data = seq_group_metadata.seq_data[seq_id]
+        prompt_token_ids = seq_data.get_prompt_token_ids()
+        new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids]
+
+        return SequenceGroupMetadata(
+            request_id=seq_group_metadata.request_id,
+            is_prompt=seq_group_metadata.is_prompt,
+            seq_data={
+                target_seq_id:
+                SequenceData(
+                    prompt_token_ids=prompt_token_ids,
+                    output_token_ids=new_output_token_ids,
+                ),
+            },
+            sampling_params=seq_group_metadata.sampling_params,
+            block_tables={
+                target_seq_id: seq_group_metadata.block_tables[seq_id],
+            },
+            lora_request=None,
+            persistent_data={},
+        )
+
+    def _split_scoring_output(
+        self, sampler_output: SamplerOutput, num_scoring_tokens: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Split the target model output into speculative and non-speculative
+        output.
+        """
+
+        # Aphrodite currently only supports proposal lens equal to zero or the
+        # batch proposal len. This adds some complexity (splitting the batch
+        # into spec and non spec sequences) and should be removed in the
+        # future. It can be done by supporting per-sequence proposal lens.
+        # First samples are from speculative scoring, latter samples are non-
+        # speculative samples.
+        split_sizes = [
+            num_scoring_tokens,
+            sampler_output.sampled_token_ids.numel() - num_scoring_tokens,
+        ]
+        (spec_probs, non_spec_probs
+         ) = sampler_output.sampled_token_probs.split(split_sizes)
+        (
+            spec_sampled_tokens,
+            non_spec_sampled_tokens,
+        ) = sampler_output.sampled_token_ids.flatten().split(split_sizes)
+
+        # Convert scores to tensors.
+        sampler_output.sampled_token_probs = spec_probs
+        sampler_output.sampled_token_ids = spec_sampled_tokens
+        target_token_ids, target_probs = sampler_output_to_torch(
+            [sampler_output])
+
+        # Convert non-speculative output tokens to tensors.
+        sampler_output.sampled_token_probs = non_spec_probs
+        sampler_output.sampled_token_ids = non_spec_sampled_tokens
+        (
+            non_spec_target_token_ids,
+            non_spec_target_probs,
+        ) = sampler_output_to_torch([sampler_output])
+
+        return (
+            target_token_ids,
+            target_probs,
+            non_spec_target_token_ids,
+            non_spec_target_probs,
+        )
+
+    def _create_target_seq_id_iterator(
+            self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]:
+        """Create an iterator for creating target sequence ids.
+        Target sequence ids are distinct from sequence ids because we create a
+        distinct target sequence id for each proposal token to be scored.
+
+        This implementation increments a counter starting at 1 + max of all
+        provided input sequence ids.
+        """
+        return count(start=max(seq_ids) + 1)
+
+    def _get_token_ids_to_score(
+            self,
+            full_spec_token_ids: List[TokenId],  # shape: [k]
+    ) -> List[List[TokenId]]:
+        """Given the list of proposal token ids for a sequence, return the
+        lists of token ids that should be scored.
+
+        Returns k+1 output lists. The additional one is used for generating the
+        bonus token.
+
+        Example:
+            Input: [0, 1, 2, 3] (k=4)
+            Output: (k+1 lists)
+                []
+                [0]
+                [0, 1]
+                [0, 1, 2]
+                [0, 1, 2, 3]
+        """
+        empty_token_ids = []
+
+        token_ids_to_score = [empty_token_ids]
+        token_ids_to_score.extend([
+            full_spec_token_ids[:i + 1]
+            for i in range(len(full_spec_token_ids))
+        ])
+        return token_ids_to_score
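
A minimal standalone sketch of the expansion above, independent of the Aphrodite types: given K proposal tokens for one sequence, the scorer sees K+1 continuations (the empty prefix plus every proposal prefix), which is what makes the bonus token possible. The helper name expand_for_scoring is hypothetical and only for illustration.

    # Sketch: expand K proposal tokens into the K+1 prefixes that get scored.
    # `expand_for_scoring` is a hypothetical helper, not part of this diff.
    from typing import List

    def expand_for_scoring(output_token_ids: List[int],
                           proposal_token_ids: List[int]) -> List[List[int]]:
        continuations = []
        for i in range(len(proposal_token_ids) + 1):
            # Prefix of length i: empty prefix first, full proposal last.
            continuations.append(output_token_ids + proposal_token_ids[:i])
        return continuations

    # K=3 proposals for a sequence whose decoded tokens so far are [11, 12]:
    print(expand_for_scoring([11, 12], [7, 8, 9]))
    # [[11, 12], [11, 12, 7], [11, 12, 7, 8], [11, 12, 7, 8, 9]]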

+ 77 - 0
aphrodite/spec_decode/interfaces.py

@@ -0,0 +1,77 @@
+from typing import List, Tuple, Optional, Dict
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+
+import torch
+
+from aphrodite.common.sequence import SequenceGroupMetadata
+
+
+@dataclass
+class SpeculativeProposals:
+    """Data structure used to represent proposal tokens from some proposer. It
+    also tracks how many speculative tokens each sequence has.
+    """
+
+    # Speculative proposal tokens.
+    proposal_token_ids: torch.Tensor
+
+    # Probabilities of the proposal tokens according to the proposer.
+    proposal_probs: torch.Tensor
+
+    # The valid length of each proposal; can be zero.
+    proposal_lens: torch.Tensor
+
+    def __repr__(self):
+        return (f"SpeculativeProposals("
+                f"proposal_token_ids={self.proposal_token_ids.shape}, "
+                f"proposal_probs={self.proposal_probs.shape}, "
+                f"proposal_lens={self.proposal_lens.shape})")
+
+
+@dataclass
+class SpeculativeScores:
+    """Data structure used to represent the scores of speculative tokens
+    according to the scoring model.
+    """
+
+    # Probabilities of the speculative tokens according to the scoring model.
+    probs: torch.Tensor
+
+    # Token ids sampled from the scoring model. Used for speculative bonus
+    # tokens and also non-speculative normal decoding.
+    token_ids: torch.Tensor
+
+    def __repr__(self):
+        return (f"SpeculativeScores("
+                f"probs={self.probs.shape}, "
+                f"token_ids={self.token_ids.shape})")
+
+
+class SpeculativeProposer(ABC):
+
+    @abstractmethod
+    def get_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        max_proposal_len: int,
+    ) -> SpeculativeProposals:
+        raise NotImplementedError
+
+
+class SpeculativeScorer(ABC):
+
+    @abstractmethod
+    def score_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Optional[Dict[int, int]],
+        blocks_to_swap_out: Optional[Dict[int, int]],
+        blocks_to_copy: Optional[Dict[int, List[int]]],
+        k: int,
+        proposals: SpeculativeProposals,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError

+ 175 - 0
aphrodite/spec_decode/metrics.py

@@ -0,0 +1,175 @@
+import torch
+from dataclasses import dataclass
+from typing import Optional
+import time
+from typing import Callable
+
+from aphrodite.modeling.layers.rejection import RejectionSampler
+from aphrodite.common.utils import in_wsl
+
+
+@dataclass
+class SpecDecodeWorkerMetrics:
+    """Dataclass holding metrics emitted from the spec decode worker."""
+
+    # The empirical acceptance rate of the proposal method on a per-token basis.
+    # This is useful for evaluating how well the proposal method aligns with the
+    # scoring method.
+    draft_acceptance_rate: float
+
+    # The empirical efficiency, measured as the number of tokens emitted by the
+    # system divided by the number of tokens that could be emitted by the system
+    # if the proposal method were perfect.
+    system_efficiency: float
+
+    # The number of speculative tokens produced by the proposal method.
+    draft_tokens: int
+
+    # The number of tokens emitted by the entire system.
+    emitted_tokens: int
+
+    # The number of tokens accepted by the scoring model and verification
+    # routine, e.g. Llama2-70B and lossless rejection sampling.
+    #
+    # NOTE: Any token accepted by the verification routine is considered
+    # accepted (regardless of if the speculative prefix is also accepted). The
+    # user will usually see fewer accepted tokens. This metric is helpful when
+    # evaluating alignment of the proposal method with the scoring model.
+    accepted_tokens: int
+
+    # The number of speculative tokens per sequence.
+    num_spec_tokens: int
+
+
+Timer = Callable[[], float]
+
+
+class AsyncMetricsCollector:
+    """Class which copies rejection sampler metrics from the device to CPU on a
+    non-default Torch stream.
+    """
+
+    def __init__(
+        self,
+        rejection_sampler: RejectionSampler,
+        timer: Optional[Timer] = None,
+        collect_interval_s: float = 5.0,
+    ):
+        self._rejection_sampler = rejection_sampler
+        self._timer = time.time if timer is None else timer
+
+        self._rank: Optional[int] = None
+
+        # We don't have a device set yet.
+        self._copy_stream: Optional[torch.cuda.Stream] = None
+
+        self._in_flight_copy: Optional[torch.cuda.Event] = None
+
+        pin_memory = not in_wsl()
+        self._aggregate_num_accepted_tokens = torch.tensor(
+            0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
+        self._aggregate_num_emitted_tokens = torch.tensor(
+            0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
+        self._aggregate_num_draft_tokens = 0
+
+        self._rejsample_metrics_collect_interval_s = collect_interval_s
+        self._last_metrics_collect_time = self._timer()
+
+    def init_gpu_tensors(self, rank: int) -> None:
+        self._rank = rank
+        self._copy_stream = torch.cuda.Stream()
+
+    def maybe_collect_rejsample_metrics(
+            self, k: int) -> Optional[SpecDecodeWorkerMetrics]:
+        # If a copy was initiated in the previous call, collect and return.
+        if self._in_flight_copy is not None:
+            ready_event = self._in_flight_copy
+            self._in_flight_copy = None
+            return self._collect_rejsample_metrics(k, ready_event)
+
+        # Otherwise, check if we should start a new copy.
+        if self._should_collect_rejsample_metrics(self._timer()):
+            assert self._in_flight_copy is None
+            self._in_flight_copy = self._copy_rejsample_metrics_async()
+
+        return None
+
+    def _should_collect_rejsample_metrics(self, now: float) -> bool:
+        """Return whether or not this iteration should collect rejection
+        sampling metrics.
+        """
+        if self._rank != 0:
+            return False
+
+        if (now - self._last_metrics_collect_time <
+                self._rejsample_metrics_collect_interval_s):
+            return False
+        return True
+
+    def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
+        """Copy rejection sampling metrics (number of accepted tokens, etc) to
+        CPU asynchronously.
+
+        Returns a CUDA event recording when the copy is complete.
+        """
+        self._copy_stream.wait_stream(torch.cuda.current_stream())
+
+        with torch.cuda.stream(self._copy_stream):
+            self._aggregate_num_accepted_tokens.copy_(
+                self._rejection_sampler.num_accepted_tokens, non_blocking=True)
+            self._aggregate_num_emitted_tokens.copy_(
+                self._rejection_sampler.num_emitted_tokens, non_blocking=True)
+            # Number of draft tokens is calculated on CPU, so no copy is
+            # required.
+            self._aggregate_num_draft_tokens = (
+                self._rejection_sampler.num_draft_tokens)
+
+        aggregate_metrics_ready = torch.cuda.Event()
+        aggregate_metrics_ready.record(self._copy_stream)
+
+        return aggregate_metrics_ready
+
+    def _collect_rejsample_metrics(
+            self, k: int,
+            ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics:
+        """Create metrics object from statistics copied asynchronously.
+
+        Args:
+            k: int. The number of speculative tokens; used to determine system
+                efficiency.
+            ready_event: torch.cuda.Event. The CUDA event recording when the
+                async GPU->CPU copy is complete.
+        """
+
+        ready_event.synchronize()
+        accepted_tokens = self._aggregate_num_accepted_tokens.item()
+        emitted_tokens = self._aggregate_num_emitted_tokens.item()
+        draft_tokens = self._aggregate_num_draft_tokens
+
+        num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k)
+
+        if draft_tokens > 0:
+            draft_acceptance_rate = accepted_tokens / draft_tokens
+        else:
+            draft_acceptance_rate = float("nan")
+
+        if num_possible_tokens > 0:
+            system_efficiency = emitted_tokens / num_possible_tokens
+        else:
+            system_efficiency = float("nan")
+
+        return SpecDecodeWorkerMetrics(
+            num_spec_tokens=k,
+            draft_acceptance_rate=draft_acceptance_rate,
+            system_efficiency=system_efficiency,
+            accepted_tokens=accepted_tokens,
+            draft_tokens=draft_tokens,
+            emitted_tokens=emitted_tokens,
+        )
+
+    @staticmethod
+    def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int:
+        # Divide by k since batch size can be variable.
+        total_num_spec_seqs = draft_tokens / k
+        num_accepted_per_seq_if_all_accepted = k + 1
+        return int(total_num_spec_seqs * num_accepted_per_seq_if_all_accepted)
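
A worked example of the two headline metrics, with illustrative numbers only; the arithmetic mirrors _collect_rejsample_metrics and get_max_num_accepted_tokens above.

    # k=4 proposals per sequence, 10 sequences => 40 draft tokens per step.
    k = 4
    draft_tokens = 40
    accepted_tokens = 30   # accepted by the rejection sampler
    emitted_tokens = 38    # accepted prefixes plus bonus/recovery tokens

    max_possible = int((draft_tokens / k) * (k + 1))        # 50
    draft_acceptance_rate = accepted_tokens / draft_tokens  # 0.75
    system_efficiency = emitted_tokens / max_possible       # 0.76

    print(max_possible, draft_acceptance_rate, system_efficiency)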

+ 392 - 0
aphrodite/spec_decode/multi_step_worker.py

@@ -0,0 +1,392 @@
+from typing import List, Dict, Optional, Tuple
+import copy
+
+import torch
+
+from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+from aphrodite.task_handler.worker import Worker
+from aphrodite.spec_decode.interfaces import (
+    SpeculativeProposals,
+    SpeculativeProposer,
+)
+from aphrodite.spec_decode.util import sampler_output_to_torch
+
+
+class MultiStepWorker(Worker):
+    """The MultiStepWorker is equivalent to a Worker except that it allows
+    multiple forward passes in a single call, assuming the scheduler has
+    allocated enough space to store the additional KV. This reduces overhead
+    by invoking the scheduler less.
+
+    The MultiStepWorker does not support cache swap operations, or beam search.
+    Cache swap operations do not require large modifications. On the other hand,
+    beam search requires memory allocations during sequence forks and thus
+    requires more thought for MultiStepWorker support.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._proposer: Optional[DraftModelTop1Proposer] = None
+
+    def init_model(self):
+        super().init_model()
+
+        self._proposer = DraftModelTop1Proposer(
+            self,
+            self.device,
+            self.max_model_len,
+            self.vocab_size,
+        )
+
+    @torch.inference_mode()
+    def execute_model_multi_step(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        num_steps: int,
+    ) -> List[SamplerOutput]:
+        """Run the model forward pass num_steps times. Returns the list of
+        sampler output, one per model forward pass.
+        """
+        self._raise_if_unsupported(
+            seq_group_metadata_list,
+            blocks_to_swap_in,
+            blocks_to_swap_out,
+            blocks_to_copy,
+        )
+
+        # Shallow copy input data so modifications (such as appending tokens)
+        # do not cause side-effects.
+        copied_seq_group_metadata_list = self._shallow_copy_inputs(
+            seq_group_metadata_list)
+
+        # Assert enough KV space for num_steps tokens per sequence.
+        self._assert_enough_kv_space(seq_group_metadata_list, num_steps)
+
+        # Run model num_steps times.
+        model_outputs = []
+        for _ in range(num_steps):
+            model_output = super().execute_model(
+                seq_group_metadata_list=copied_seq_group_metadata_list,
+                blocks_to_swap_in=blocks_to_swap_in,
+                blocks_to_swap_out=blocks_to_swap_out,
+                blocks_to_copy=blocks_to_copy,
+            )
+
+            self._append_new_tokens(model_output,
+                                    copied_seq_group_metadata_list)
+            model_outputs.append(model_output)
+
+        return model_outputs
+
+    def get_spec_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        max_proposal_len: int,
+    ) -> SpeculativeProposals:
+        """Produce speculations given an input batch of sequences. The number of
+        speculative tokens per sequence is determined by max_proposal_len.
+        """
+
+        return self._proposer.get_proposals(
+            seq_group_metadata_list,
+            blocks_to_swap_in,
+            blocks_to_swap_out,
+            blocks_to_copy,
+            max_proposal_len,
+        )
+
+    def _append_new_tokens(
+        self,
+        model_output: SamplerOutput,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> None:
+        """Given model output from a single run, append the tokens to the
+        sequences. This is normally done outside of the worker, but it is
+        required if the worker is to perform multiple forward passes.
+        """
+        for seq_group_metadata, sequence_group_outputs in zip(
+                seq_group_metadata_list, model_output):
+            seq_group_metadata.is_prompt = False
+
+            for seq_output in sequence_group_outputs.samples:
+                # NOTE: Beam search is not supported, so we can assume that
+                # parent_seq_id == seq_id.
+                seq = seq_group_metadata.seq_data[seq_output.parent_seq_id]
+
+                token_id = seq_output.output_token
+                token_logprob = seq_output.logprobs[token_id]
+
+                seq.append_token_id(token_id, token_logprob.logprob)
+
+    def _shallow_copy_inputs(
+        self, seq_group_metadata_list: List[SequenceGroupMetadata]
+    ) -> List[SequenceGroupMetadata]:
+        """Copy input data structures to remove side-effects when input data
+        structures are shared with other modules.
+
+        Helpful when the Aphrodite scheduler runs in the same process as the
+        worker. The alternative is deep-copying (or other form of deep copy);
+        this has performance downsides.
+        """
+
+        # Shallow-copy the list of SequenceGroupMetadata. This allows us to
+        # append tokens and change is_prompt without external side-effects.
+        new_seq_group_metadata_list = []
+
+        for old_seq_group_metadata in seq_group_metadata_list:
+            # We must shallow-copy seq_group_metadata as is_prompt could change.
+            seq_group_metadata = copy.copy(old_seq_group_metadata)
+            new_seq_group_metadata_list.append(seq_group_metadata)
+
+            # We must shallow-copy seq_data as we will append token ids
+            new_seq_data = {}
+            for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
+                new_seq_data[seq_id] = copy.copy(old_seq_data)
+                new_seq_data[
+                    seq_id].output_token_ids = old_seq_data.output_token_ids[:]
+
+            seq_group_metadata.seq_data = new_seq_data
+
+        return new_seq_group_metadata_list
+
+    def _assert_enough_kv_space(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        num_steps: int,
+    ) -> None:
+        """Assert there are enough physical blocks per sequence to store the
+        current KV plus additional KV from num_steps tokens.
+        """
+        assert self.model_runner.block_size is not None
+        for seq_group_metadata in seq_group_metadata_list:
+            # Only one seq_id is guaranteed because there is no beam search.
+            seq_id = list(seq_group_metadata.seq_data.keys())[0]
+            seq = seq_group_metadata.seq_data[seq_id]
+
+            # After num_steps, the seq len will be the current seq len
+            # plus one token per step.
+            final_seq_len = seq.get_len() + num_steps
+
+            # We will have final_seq_len - 1 KV because Aphrodite saves KV for a
+            # token in the iteration after the token was generated.
+            required_num_kv_slots = final_seq_len - 1
+
+            # The allocated number of kv slots is the number of allocated blocks
+            # times the number of slots per block.
+            number_physical_blocks = len(
+                seq_group_metadata.block_tables[seq_id])
+            allocated_kv_slots = (number_physical_blocks *
+                                  self.model_runner.block_size)
+
+            if required_num_kv_slots > allocated_kv_slots:
+                request_id = seq_group_metadata.request_id
+                raise ValueError(
+                    "The worker attempted to run "
+                    f"{num_steps} steps but found insufficient KV space for "
+                    f"{request_id=} {seq_id=}. ({allocated_kv_slots=} "
+                    f"{required_num_kv_slots=}).")
+
+    def _raise_if_unsupported(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> None:
+        """MultiStepWorker does not yet implement support for cache swap
+        operations or beam search.
+        """
+        if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]):
+            raise NotImplementedError(
+                "MultiStepWorker does not support cache operations")
+
+        if any(
+                len(seq_group_metadata.seq_data.keys()) != 1
+                for seq_group_metadata in seq_group_metadata_list):
+            raise NotImplementedError(
+                "MultiStepWorker does not support beam search.")
+
+
+class DraftModelTop1Proposer(SpeculativeProposer):
+    """Helper class which separates out sequences which would exceed the max
+    model length when speculated upon.
+
+    This allows combinations of models such as JackFram/llama-68m draft with
+    meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of
+    2048 while Llama2-13b has max_position_embeddings of 4096.
+
+    We treat the sequences which exceed the proposal draft model length as
+    "non-spec sequences". Essentially they skip the draft model and go through
+    normal decoding in the target model.
+
+    Currently, only proposal_lens of 0 and k are supported, where k is a global
+    batch proposal length. In the future Aphrodite should support per-sequence
+    proposal lengths.
+    """
+
+    def __init__(
+        self,
+        draft_worker: MultiStepWorker,
+        device: str,
+        max_model_len: int,
+        vocab_size: int,
+    ):
+        self._draft_worker = draft_worker
+        self._device = device
+        self._max_model_len = max_model_len
+        self._vocab_size = vocab_size
+
+    def get_proposals(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        max_proposal_len: int,
+    ) -> SpeculativeProposals:
+        """Get speculative proposals given the input batch.
+
+        Sequences which would exceed the max model length are skipped during
+        speculation.
+        """
+
+        # Split speculative- and non-speculative- sequences.
+        (
+            proposal_lens,
+            nonzero_proposal_len_seqs,
+            nonzero_proposal_len_indices,
+        ) = self._split_by_max_model_len(seq_group_metadata_list,
+                                         max_proposal_len)
+
+        if nonzero_proposal_len_seqs:
+            # Speculate tokens using the draft worker for the speculative
+            # sequences.
+            maybe_sampler_output = self._draft_worker.execute_model_multi_step(
+                seq_group_metadata_list=nonzero_proposal_len_seqs,
+                blocks_to_swap_in=blocks_to_swap_in,
+                blocks_to_swap_out=blocks_to_swap_out,
+                blocks_to_copy=blocks_to_copy,
+                num_steps=max_proposal_len,
+            )
+        else:
+            # If no sequences can be speculated, set sampler output to None.
+            maybe_sampler_output = None
+
+        # Combine speculative- and non-speculative sequences into the same
+        # representation.
+        proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs(
+            batch_size=len(seq_group_metadata_list),
+            max_proposal_len=max_proposal_len,
+            maybe_sampler_output=maybe_sampler_output,
+            proposal_lens=proposal_lens,
+            nonzero_proposal_len_indices=nonzero_proposal_len_indices,
+        )
+
+        proposals = SpeculativeProposals(
+            proposal_token_ids=proposal_tokens,
+            proposal_probs=proposal_probs,
+            proposal_lens=proposal_lens,
+        )
+
+        return proposals
+
+    def _split_by_max_model_len(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        max_proposal_len: int,
+    ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]:
+        """Determine which sequences would exceed the max model length."""
+
+        proposal_lens: List[int] = []
+        nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = []
+        nonzero_proposal_len_indices: List[int] = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            seq_data = next(iter(seq_group_metadata.seq_data.values()))
+            seq_len = seq_data.get_len()
+
+            # Currently only proposal lens of 0 or the global batch proposal len
+            # are supported.
+            if seq_len + max_proposal_len < self._max_model_len:
+                proposal_lens.append(max_proposal_len)
+                nonzero_proposal_len_seqs.append(seq_group_metadata)
+                nonzero_proposal_len_indices.append(i)
+            else:
+                proposal_lens.append(0)
+
+        return (
+            proposal_lens,
+            nonzero_proposal_len_seqs,
+            nonzero_proposal_len_indices,
+        )
+
+    def _merge_outputs(
+        self,
+        batch_size: int,
+        max_proposal_len: int,
+        maybe_sampler_output: Optional[SamplerOutput],
+        proposal_lens: List[int],
+        nonzero_proposal_len_indices: List[int],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """After speculations are produced, merge the speculation results with
+        the skipped sequences.
+        """
+        if maybe_sampler_output is None:
+            # If no speculative tokens, the sampler output will be None.
+            # In this case we return empty tensors.
+            proposal_tokens = torch.zeros(0,
+                                          max_proposal_len,
+                                          dtype=torch.long,
+                                          device=self._device)
+            proposal_probs = torch.zeros(
+                0,
+                max_proposal_len,
+                self._vocab_size,
+                dtype=torch.float32,
+                device=self._device,
+            )
+            proposal_lens = torch.zeros(len(proposal_lens),
+                                        dtype=torch.long,
+                                        device=self._device)
+            return proposal_tokens, proposal_probs, proposal_lens
+
+        sampler_output = maybe_sampler_output
+
+        proposal_tokens, proposal_probs = sampler_output_to_torch(
+            sampler_output)
+
+        # Now, reformat the output GPU tensors such that each sequence has
+        # a proposal. The proposal can be empty, e.g. [-1, -1, -1].
+
+        entire_proposal_tokens = torch.full(
+            size=(batch_size, *proposal_tokens.shape[1:]),
+            fill_value=-1,
+            dtype=torch.long,
+            device=self._device,
+        )
+        entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens
+        entire_proposal_probs = torch.zeros(
+            batch_size,
+            *proposal_probs.shape[1:],
+            dtype=torch.float32,
+            device=self._device,
+        )
+        entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs
+
+        proposal_tokens, proposal_probs = (
+            entire_proposal_tokens,
+            entire_proposal_probs,
+        )
+
+        proposal_lens = torch.zeros(batch_size,
+                                    dtype=torch.long,
+                                    device=self._device)
+        proposal_lens[nonzero_proposal_len_indices] = max_proposal_len
+
+        return proposal_tokens, proposal_probs, proposal_lens
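
The split performed by _split_by_max_model_len reduces to a single length check per sequence; a standalone sketch with made-up numbers (a draft model with a 2048-token context and k=5 proposals per step):

    # Sequences whose current length plus k would reach the draft model's
    # max length get a proposal length of 0 and skip speculation entirely.
    max_model_len = 2048
    k = 5
    seq_lens = [100, 2040, 2043, 512]

    proposal_lens = [k if seq_len + k < max_model_len else 0
                     for seq_len in seq_lens]
    nonzero_indices = [i for i, plen in enumerate(proposal_lens) if plen > 0]

    print(proposal_lens)    # [5, 5, 0, 5]
    print(nonzero_indices)  # [0, 1, 3]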

+ 394 - 0
aphrodite/spec_decode/spec_decode_worker.py

@@ -0,0 +1,394 @@
+from typing import List, Tuple, Optional, Dict
+from functools import cached_property
+
+import torch
+
+from aphrodite.spec_decode.metrics import AsyncMetricsCollector
+from aphrodite.common.sequence import (
+    SamplerOutput,
+    SequenceGroupMetadata,
+    SequenceGroupOutput,
+    SequenceOutput,
+)
+from aphrodite.task_handler.worker import Worker
+from aphrodite.spec_decode.multi_step_worker import MultiStepWorker
+from aphrodite.modeling.layers.rejection import RejectionSampler
+from aphrodite.common.config import CacheConfig
+from aphrodite.spec_decode.util import (
+    nvtx_range,
+    get_all_seq_ids,
+    split_batch_by_proposal_len,
+)
+from aphrodite.spec_decode.interfaces import (
+    SpeculativeProposals,
+    SpeculativeScores,
+)
+from aphrodite.spec_decode.batch_expansion import BatchExpansionTop1Scorer
+from aphrodite.spec_decode.interfaces import SpeculativeScorer
+
+
+class SpecDecodeWorker:
+    """Worker which implements speculative decoding.
+
+    Speculative decoding reduces decoding per-token latency by using a proposal
+    method, such as a small draft model, to speculate ahead of a larger LLM. The
+    probabilities of the speculative tokens are then determined by the larger
+    LLM, after which some verification routine determines which (if any) of the
+    speculative tokens are accepted by the larger LLM.
+
+    The current implementation has the following limitations:
+    * Only draft-model proposal is implemented (contributions for more forms are
+        welcome!).
+    * Only top-1 proposal and scoring are implemented. Tree-attention is left as
+        future work.
+    * Only lossless rejection sampling is supported. Contributions adding lossy
+        verification routines are welcome (e.g. Medusa's typical acceptance).
+    * All sequences in a batch must have the same proposal length, or zero. This
+        can be improved by having per-sequence speculation in the future.
+    * The scoring forward pass is done without an MQA kernel, which is
+        suboptimal especially as the batch size, proposal length, and sequence
+        lengths grow. Contributions to add MQA scoring are welcome once
+        correctness tests pass.
+    """
+
+    def __init__(
+        self,
+        proposer_worker: MultiStepWorker,
+        scorer_worker: Worker,
+        rejection_sampler: RejectionSampler,
+        metrics_collector: Optional[AsyncMetricsCollector] = None,
+    ):
+        """
+        Create a SpecDecodeWorker.
+
+        Args:
+            proposer_worker: A worker that can produce speculative tokens for
+                sequences.
+            scorer_worker: A worker that produces probabilities of speculative
+                tokens according to some base model. Typically a vanilla
+                Aphrodite Worker.
+            rejection_sampler: A Torch module used to perform modified rejection
+                sampling for speculative decoding.
+            metrics_collector: Helper class for collecting metrics; can be set
+                for testing purposes.
+        """
+        self.proposer_worker = proposer_worker
+        self.scorer_worker = scorer_worker
+        self.rejection_sampler = rejection_sampler
+
+        self._metrics = (AsyncMetricsCollector(rejection_sampler)
+                         if metrics_collector is None else metrics_collector)
+
+        self.probs_dtype = self.rejection_sampler.probs_dtype
+        self.token_id_dtype = self.rejection_sampler.token_id_dtype
+
+        self.scorer: SpeculativeScorer = None
+
+    def init_model(self) -> None:
+        """Initialize both scorer and proposer models."""
+        # The scorer worker model is initialized first in case the proposer
+        # model has a smaller TP degree than the target worker.
+        self.scorer_worker.init_model()
+        self.proposer_worker.init_model()
+
+        self._metrics.init_gpu_tensors(self.rank)
+        self.rejection_sampler.init_gpu_tensors(self.rank)
+        self.scorer = BatchExpansionTop1Scorer(
+            scorer_worker=self.scorer_worker,
+            device=self.device,
+            vocab_size=self._vocab_size,
+        )
+
+    def profile_num_available_blocks(
+        self,
+        block_size: int,
+        gpu_memory_utilization: float,
+        cpu_swap_space: int,
+        cache_dtype: str,
+    ) -> Tuple[int, int]:
+        """Determine the number of cache blocks to use.
+
+        This is done by profiling the scorer model (which is typically the
+        larger of the two). Then the total memory which would be used by the
+        scorer cache is divided evenly between the proposer and scorer model KV,
+        such that the number of blocks is equal in both KV caches.
+        """
+        (
+            num_gpu_blocks,
+            num_cpu_blocks,
+        ) = self.scorer_worker.profile_num_available_blocks(
+            block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype)
+
+        scorer_cache_block_size_bytes = (
+            self.scorer_worker.get_cache_block_size_bytes(
+                block_size, cache_dtype))
+        proposer_cache_block_size_bytes = (
+            self.proposer_worker.get_cache_block_size_bytes(
+                block_size, cache_dtype))
+
+        new_num_gpu_blocks = split_num_cache_blocks_evenly(
+            scorer_cache_block_size_bytes,
+            proposer_cache_block_size_bytes,
+            num_gpu_blocks,
+        )
+        return new_num_gpu_blocks, num_cpu_blocks
+
+    def init_cache_engine(self, cache_config: CacheConfig):
+        """Initialize the cache engine of the scorer and proposer workers."""
+        self.scorer_worker.init_cache_engine(cache_config)
+        self.proposer_worker.init_cache_engine(cache_config)
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Optional[Dict[int, int]],
+        blocks_to_swap_out: Optional[Dict[int, int]],
+        blocks_to_copy: Optional[Dict[int, List[int]]],
+        num_spec_tokens: int,
+    ) -> List[SamplerOutput]:
+        """Perform speculative decoding on the input batch."""
+
+        assert seq_group_metadata_list is not None, (
+            "speculative decoding "
+            "requires non-None seq_group_metadata_list")
+
+        # If no spec tokens, call the proposer and scorer workers normally.
+        # Used for prefill.
+        if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0:
+            return self._run_no_spec(
+                seq_group_metadata_list=seq_group_metadata_list,
+                blocks_to_swap_in=blocks_to_swap_in,
+                blocks_to_swap_out=blocks_to_swap_out,
+                blocks_to_copy=blocks_to_copy,
+            )
+
+        return self._run_speculative_decoding_step(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+            k=num_spec_tokens,
+        )
+
+    @nvtx_range("spec_decode_worker._run_no_spec")
+    def _run_no_spec(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Optional[Dict[int, int]],
+        blocks_to_swap_out: Optional[Dict[int, int]],
+        blocks_to_copy: Optional[Dict[int, List[int]]],
+    ) -> List[SamplerOutput]:
+        """Run a prefill step, without any speculation. The input is sent to the
+        proposer and scorer model so that the KV cache is consistent between the
+        two.
+        """
+
+        self.proposer_worker.execute_model(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+            return_python_output=False,
+        )
+
+        sampler_output = self.scorer_worker.execute_model(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy,
+        )
+
+        # Clear device tensors from sampler output. This reduces communication
+        # overhead when the engine runs in a different process than the workers.
+        sampler_output.probs = None
+        sampler_output.sampled_tokens = None
+        return [sampler_output]
+
+    @nvtx_range("spec_decode_worker._run_speculative_decoding_step")
+    def _run_speculative_decoding_step(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Optional[Dict[int, int]],
+        blocks_to_swap_out: Optional[Dict[int, int]],
+        blocks_to_copy: Optional[Dict[int, List[int]]],
+        k: int,
+    ) -> List[SamplerOutput]:
+        """Execute a single step of speculative decoding.
+
+        This invokes the proposer worker to get k speculative tokens for each
+        sequence, then scores each speculative token using the scoring worker.
+
+        Returns a list of SamplerOutput, each containing a single token per
+        sequence.
+        """
+
+        # Generate proposals using draft worker.
+        proposals = self.proposer_worker.get_spec_proposals(
+            seq_group_metadata_list,
+            blocks_to_swap_in,
+            blocks_to_swap_out,
+            blocks_to_copy,
+            k,
+        )
+
+        proposal_scores = self.scorer.score_proposals(
+            seq_group_metadata_list,
+            blocks_to_swap_in,
+            blocks_to_swap_out,
+            blocks_to_copy,
+            k,
+            proposals,
+        )
+
+        accepted_token_ids = self._verify_tokens(seq_group_metadata_list,
+                                                 proposal_scores, proposals, k)
+
+        return self._create_output_sampler_list(seq_group_metadata_list,
+                                                accepted_token_ids, k)
+
+    @nvtx_range("spec_decode_worker._verify_tokens")
+    def _verify_tokens(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        proposal_scores: SpeculativeScores,
+        proposals: SpeculativeProposals,
+        max_proposal_len: int,
+    ) -> torch.Tensor:
+        """Determine which speculative tokens are accepted using the
+        probabilities of each token according to the proposer and scorer models.
+        """
+        proposal_lens_list = proposals.proposal_lens.tolist()
+
+        # Aphrodite currently only supports proposal lens equal to zero or the
+        # batch proposal len. This adds some complexity (splitting the batch
+        # into spec and non spec sequences) and should be removed in the
+        # future. It can be done by supporting per-sequence proposal lens.
+        _, spec_indices = split_batch_by_proposal_len(
+            seq_group_metadata_list,
+            proposal_lens_list,
+            select_proposal_len_zero=False,
+        )
+        _, non_spec_indices = split_batch_by_proposal_len(
+            seq_group_metadata_list,
+            proposal_lens_list,
+            select_proposal_len_zero=True,
+        )
+        original_indices = spec_indices + non_spec_indices
+
+        proposal_probs = proposal_scores.probs[spec_indices, :-1]
+        bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:]
+        non_spec_token_ids = proposal_scores.token_ids[non_spec_indices]
+
+        accepted_token_ids = self.rejection_sampler(
+            proposal_probs,
+            bonus_token_ids,
+            proposals.proposal_probs,
+            proposals.proposal_token_ids,
+        )
+
+        # Append output tokens from non-speculative sequences to
+        # the accepted token ids tensor.
+        non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len +
+                                                       1).clone()
+        non_spec_token_ids[:, 1:] = -1
+        accepted_token_ids = torch.cat(
+            [accepted_token_ids, non_spec_token_ids])
+
+        # Rearrange so that results are in the order of the original seq group
+        # metadata.
+        accepted_token_ids[original_indices] = accepted_token_ids.clone()
+
+        return accepted_token_ids
+
+    def _create_output_sampler_list(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        accepted_token_ids: torch.Tensor,  # shape: [batch_size, k+1]
+        k: int,
+    ) -> List[SamplerOutput]:
+        """Given the accepted token ids, create a list of SamplerOutput.
+
+        The output is padded with -1 tokens such that each sequence has
+        the same number of outputs.
+        """
+        seq_ids = get_all_seq_ids(seq_group_metadata_list)
+
+        # shape: [k+1, batch_size]
+        accepted_token_ids_by_step = accepted_token_ids.transpose(0,
+                                                                  1).tolist()
+        sampler_output_list = []
+        for token_ids_by_step in accepted_token_ids_by_step:
+            if all(token_id == -1 for token_id in token_ids_by_step):
+                break
+
+            step_output_token_ids = []
+            for token_id, seq_id in zip(token_ids_by_step, seq_ids):
+                step_output_token_ids.append(
+                    SequenceGroupOutput(
+                        samples=[
+                            SequenceOutput(
+                                parent_seq_id=seq_id,
+                                output_token=token_id,
+                                # TODO Add verifier logprobs.
+                                logprobs={token_id: 0.0},
+                                persistent_data={},
+                            )
+                        ],
+                        prompt_logprobs=None,
+                    ))
+            sampler_output_list.append(
+                SamplerOutput(outputs=step_output_token_ids))
+
+        maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics(
+            k)
+        if maybe_rejsample_metrics is not None:
+            sampler_output_list[
+                0].spec_decode_worker_metrics = maybe_rejsample_metrics
+
+        return sampler_output_list
+
+    @cached_property
+    def _vocab_size(self) -> int:
+        """Get the vocab size of the model and make sure it's consistent between
+        draft and target workers.
+        """
+        vocab_sizes = [
+            worker.vocab_size
+            for worker in [self.proposer_worker, self.scorer_worker]
+        ]
+        assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes)
+        return vocab_sizes[0]
+
+    @property
+    def rank(self):
+        return self.scorer_worker.rank
+
+    @property
+    def device(self):
+        return self.scorer_worker.device
+
+
+def split_num_cache_blocks_evenly(
+    scorer_cache_block_size_bytes: int,
+    proposer_cache_block_size_bytes: int,
+    total_num_gpu_blocks: int,
+) -> int:
+    """Given total_num_gpu_blocks, the number of GPU blocks that could be
+    allocated to the target model, this function calculates how many blocks
+    should be given to the draft and target model.
+
+    Note that usually the block size, in bytes, of each model is different,
+    as it's a function of number of KV/layer, number of heads, and hidden
+    dimension size.
+
+    Since the target and draft models allocate the same number of blocks, we
+    simply calculate the largest block count such that, if allocated by both
+    models, the total KV cache memory usage is no larger than the memory
+    available to the target model alone.
+    """
+    new_num_gpu_blocks = int(
+        total_num_gpu_blocks * scorer_cache_block_size_bytes /
+        (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes))
+
+    return new_num_gpu_blocks
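
A quick numerical check of split_num_cache_blocks_evenly, with illustrative sizes: if a scorer cache block is 2 MiB, a proposer cache block is 0.5 MiB, and 1000 blocks fit when only the scorer allocates, both models get 800 blocks, and 800 * 2.5 MiB equals the 2000 MiB the scorer alone would have used.

    # Illustrative numbers; mirrors split_num_cache_blocks_evenly above.
    scorer_block_bytes = 2 * 1024 * 1024    # 2 MiB per scorer cache block
    proposer_block_bytes = 512 * 1024       # 0.5 MiB per proposer cache block
    total_num_gpu_blocks = 1000             # blocks if only the scorer allocated

    new_num_gpu_blocks = int(
        total_num_gpu_blocks * scorer_block_bytes /
        (proposer_block_bytes + scorer_block_bytes))

    print(new_num_gpu_blocks)  # 800
    # Both caches together use no more memory than the scorer alone would.
    assert new_num_gpu_blocks * (scorer_block_bytes + proposer_block_bytes) \
        <= total_num_gpu_blocks * scorer_block_bytes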

+ 101 - 0
aphrodite/spec_decode/util.py

@@ -0,0 +1,101 @@
+import torch
+from typing import List, Tuple
+from contextlib import contextmanager
+from itertools import chain
+
+from aphrodite.common.sequence import SequenceGroupMetadata, SamplerOutput
+
+SeqId = int
+
+
+def get_all_seq_ids(
+        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]:
+    """Given a list of SequenceGroupMetadata, create a list of all
+    sequence ids.
+    """
+    return list(
+        chain.from_iterable([
+            seq_group_metadata.seq_data.keys()
+            for seq_group_metadata in seq_group_metadata_list
+        ]))
+
+
+def split_batch_by_proposal_len(
+    seq_group_metadata_list: List[SequenceGroupMetadata],
+    proposal_lens: List[int],
+    select_proposal_len_zero: bool,
+) -> Tuple[List[SequenceGroupMetadata], List[int]]:
+    """Utility function that splits a batch based on whether the proposal len is
+    zero or not. We should remove this once Aphrodite supports per-sequence
+    proposal lens in a batch.
+    """
+
+    if select_proposal_len_zero:
+        predicate = lambda proposal_len: proposal_len == 0
+    else:
+        predicate = lambda proposal_len: proposal_len != 0
+
+    indices = [
+        i for i, (_, proposal_len
+                  ) in enumerate(zip(seq_group_metadata_list, proposal_lens))
+        if predicate(proposal_len)
+    ]
+    seq_groups = [
+        seq_group for seq_group, proposal_len in zip(
+            seq_group_metadata_list, proposal_lens) if predicate(proposal_len)
+    ]
+
+    return seq_groups, indices
+
+
+def sampler_output_to_torch(
+    sampler_output_list: List[SamplerOutput],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Utility function which converts a list of SamplerOutput to tensors.
+
+    Returns:
+        sampled_token_ids: torch.Tensor
+            shape: [batch_size, len(sampler_output_list)]
+
+        sampled_token_probs: torch.Tensor
+            shape: [batch_size, len(sampler_output_list), vocab_size]
+    """
+
+    # shape: [batch_size, num_sampler_output, vocab_size]
+    sampled_token_probs = torch.stack(
+        [
+            sampler_output.sampled_token_probs
+            for sampler_output in sampler_output_list
+        ],
+        dim=0,
+    ).transpose(0, 1)
+
+    # shape: [batch_size, num_sampler_output]
+    sampled_token_ids = torch.stack(
+        [
+            sampler_output.sampled_token_ids.flatten()
+            for sampler_output in sampler_output_list
+        ],
+        dim=0,
+    ).transpose(0, 1)
+
+    return sampled_token_ids, sampled_token_probs
+
+
+@contextmanager
+def nvtx_range(msg, *args, **kwargs):
+    """
+    Context manager / decorator that pushes an NVTX range at the beginning
+    of its scope, and pops it at the end. If extra arguments are given,
+    they are passed as arguments to msg.format().
+
+    If running with cuda graphs, you must enable nsys cuda graph profiling.
+
+    Arguments:
+        msg (string): message to associate with the range
+    """
+    torch.cuda.nvtx.range_push(msg.format(*args, **kwargs))
+    try:
+        yield
+    finally:
+        torch.cuda.nvtx.range_pop()
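
Typical usage of nvtx_range as a context manager; this assumes a CUDA-enabled PyTorch build so that torch.cuda.nvtx is functional and the range shows up in an Nsight Systems (nsys) trace.

    import torch
    from aphrodite.spec_decode.util import nvtx_range

    with nvtx_range("decode iteration {}", 3):
        x = torch.ones(1024, device="cuda")
        y = x * 2
        torch.cuda.synchronize()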

+ 9 - 2
aphrodite/task_handler/cache_engine.py

@@ -4,9 +4,8 @@ from typing import Dict, List, Tuple
 import torch
 from loguru import logger

-from aphrodite._C import cache_ops
 from aphrodite.common.config import CacheConfig, ModelConfig, ParallelConfig
-from aphrodite.common.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE
+from aphrodite.common.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@@ -37,6 +36,10 @@ class CacheEngine:
         self.num_gpu_blocks = cache_config.num_gpu_blocks
         self.num_cpu_blocks = cache_config.num_cpu_blocks

+        # Skip initializing CUDA stream and buffer for Neuron backend.
+        if is_neuron():
+            return
+
         if cache_config.cache_dtype == "auto":
             self.dtype = model_config.dtype
         else:
@@ -119,6 +122,8 @@ class CacheEngine:
         dst: List[KVCache],
         src_to_dst: Dict[int, int],
     ) -> None:
+        from aphrodite._C import cache_ops
+
         with torch.cuda.stream(self.cache_stream):
             for i in range(self.num_layers):
                 src_key_cache, src_value_cache = src[i]
@@ -138,6 +143,8 @@ class CacheEngine:
         self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)

     def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
+        from aphrodite._C import cache_ops
+
         key_caches = [key_cache for key_cache, _ in self.gpu_cache]
         value_caches = [value_cache for _, value_cache in self.gpu_cache]
         # NOTE: This operation implicitly synchronizes the CPU and GPU.

+ 74 - 56
aphrodite/task_handler/model_runner.py

@@ -53,7 +53,7 @@ class ModelRunner:
         device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
-        kv_quant_params_path: Optional[str] = None,
+        # kv_quant_params_path: Optional[str] = None,
         is_driver_worker: bool = False,
     ):
         self.model_config = model_config
@@ -69,6 +69,7 @@ class ModelRunner:
         self.device_config = (device_config
                               if device_config is not None else DeviceConfig())
         self.device = self.device_config.device
+
         self.model = None
         self.block_size = None  # Set after initial profiling.
         self.lora_manager = None
@@ -89,37 +90,52 @@ class ModelRunner:
         # cache in_wsl result
         self.in_wsl = in_wsl()
         self.kv_cache_dtype = kv_cache_dtype
-        self.kv_quant_params = (self.load_kv_quant_params(
-            model_config, kv_quant_params_path)
-                                if self.kv_cache_dtype == "int8" else None)
-
-    def load_kv_quant_params(self, model_config: ModelConfig,
-                             kv_quant_params_path: str) -> List[List[float]]:
-        if model_config is None:
-            return None
-        # Remove it when all models support kv cache int8.
-        architectures = model_config.hf_config.architectures
-        for arch in architectures:
-            if arch not in ["LlamaForCausalLM", "LLaMAForCausalLM"]:
-                raise ValueError(
-                    "KV CACHE INT8 is not supported for model architectures "
-                    f"{arch} for now. "
-                    "Supported architectures: LlamaForCausalLM and "
-                    "LLaMAForCausalLM.")
-        num_layers = model_config.hf_config.num_hidden_layers
-        kv_quant_params = []
-        for i in range(num_layers):
-            if kv_quant_params_path is not None:
-                path = (kv_quant_params_path +
-                        f"/layers.{i}.past_kv_scale.0.weight")
-                kv_quant_param = list(np.fromfile(path, dtype=np.float32))
-            kv_quant_params.append(kv_quant_param)
-        return kv_quant_params
+        # self.kv_quant_params = (
+        #     self.load_kv_quant_params(model_config, kv_quant_params_path)
+        #     if self.kv_cache_dtype == "int8"
+        #     else None
+        # )
+
+        # Set enforce_eager to True for Neuron backend, to avoid capturing graph
+        if self.device_config.is_neuron:
+            self.model_config.enforce_eager = True
+
+    # def load_kv_quant_params(
+    #     self, model_config: ModelConfig, kv_quant_params_path: str
+    # ) -> List[List[float]]:
+    #     if model_config is None:
+    #         return None
+    #     # Remove it when all models support kv cache int8.
+    #     architectures = model_config.hf_config.architectures
+    #     for arch in architectures:
+    #         if arch not in ["LlamaForCausalLM", "LLaMAForCausalLM"]:
+    #             raise ValueError(
+    #                 "KV CACHE INT8 is not supported for model architectures "
+    #                 f"{arch} for now. "
+    #                 "Supported architectures: LlamaForCausalLM and "
+    #                 "LLaMAForCausalLM."
+    #             )
+    #     num_layers = model_config.hf_config.num_hidden_layers
+    #     kv_quant_params = []
+    #     for i in range(num_layers):
+    #         if kv_quant_params_path is not None:
+    #             path = (
+    #                 kv_quant_params_path + f"/layers.{i}.past_kv_scale.0.weight"  # noqa: E501
+    #             )
+    #             kv_quant_param = list(np.fromfile(path, dtype=np.float32))
+    #         kv_quant_params.append(kv_quant_param)
+    #     return kv_quant_params

     def load_model(self) -> None:
         with measure_cuda_memory() as m:
-            self.model = get_model(self.model_config, self.device_config,
-                                   self.lora_config)
+            self.model = get_model(
+                self.model_config,
+                self.device_config,
+                lora_config=self.lora_config,
+                parallel_config=self.parallel_config,
+                scheduler_config=self.scheduler_config,
+            )
+
         self.model_memory_usage = m.consumed_memory
         tp = get_tensor_model_parallel_world_size()
         logger.info(
@@ -127,8 +143,6 @@ class ModelRunner:
             f"{self.model_memory_usage / float(2**30):.2f} GiB x {tp} = "
             f"{self.model_memory_usage * tp / float(2**30):.2f} GiB")

-        vocab_size = self.model.config.vocab_size
-
         if self.lora_config:
             assert (hasattr(self.model, "supported_lora_modules")
                     and self.model.supported_lora_modules
@@ -142,7 +156,7 @@ class ModelRunner:
                 self.scheduler_config.max_num_seqs,
                 self.scheduler_config.max_num_batched_tokens +
                 self.scheduler_config.max_paddings,
-                vocab_size,
+                self.vocab_size,
                 self.lora_config,
                 self.device,
                 self.model.embedding_modules,
@@ -250,6 +264,7 @@ class ModelRunner:
                 slot_mapping[-1].append(slot)

         max_prompt_len = max(subquery_lens)
+        assert max_prompt_len > 0
         input_tokens = _make_tensor_with_pad(
             input_tokens,
             max_prompt_len,
@@ -309,7 +324,7 @@ class ModelRunner:
             block_tables=block_tables,
             use_cuda_graph=False,
             kv_cache_dtype=self.kv_cache_dtype,
-            kv_quant_params=self.kv_quant_params,
+            # kv_quant_params=self.kv_quant_params,
         )
         return (
             input_tokens,
@@ -449,7 +464,7 @@ class ModelRunner:
             block_tables=block_tables,
             use_cuda_graph=use_captured_graph,
             kv_cache_dtype=self.kv_cache_dtype,
-            kv_quant_params=self.kv_quant_params,
+            # kv_quant_params=self.kv_quant_params,
         )
         return (
             input_tokens,
@@ -472,6 +487,7 @@ class ModelRunner:
         selected_token_start_idx = 0
         categorized_sample_indices = {t: [] for t in SamplingType}
         categorized_sample_indices_start_idx = 0
+        pin_memory = not self.in_wsl and not self.device_config.is_neuron

         max_subquery_len = max(subquery_lens) if subquery_lens else 1
         for i, seq_group_metadata in enumerate(seq_group_metadata_list):
@@ -501,8 +517,8 @@ class ModelRunner:
                 selected_token_indices.append(selected_token_start_idx +
                                               subquery_len - 1)
                 selected_token_start_idx += max_subquery_len
-                if (sampling_params.sampling_type == SamplingType.RANDOM_SEED):
-                    assert sampling_params.seed is not None
+
+                if sampling_params.seed is not None:
                     seq_group_metadata.state.generator = torch.Generator(
                         device="cuda").manual_seed(sampling_params.seed)
             else:
@@ -522,21 +538,21 @@ class ModelRunner:
                         ))
                 categorized_sample_indices_start_idx += num_seqs

-            if (seq_group_metadata.state.generator is not None):
+            if sampling_params.seed is not None:
                 generators.append(seq_group_metadata.state.generator)

         selected_token_indices = _async_h2d(
             selected_token_indices,
             dtype=torch.long,
             target_device=self.device,
-            pin_memory=not self.in_wsl,
+            pin_memory=pin_memory,
         )
         categorized_sample_indices = {
             t: _async_h2d(
                 seq_ids,
                 dtype=torch.int,
                 target_device=self.device,
-                pin_memory=not self.in_wsl,
+                pin_memory=pin_memory,
             )
             for t, seq_ids in categorized_sample_indices.items()
         }
@@ -621,9 +637,9 @@ class ModelRunner:
                 "block_tables": input_metadata.block_tables,
                 "block_tables": input_metadata.block_tables,
                 "use_cuda_graph": input_metadata.use_cuda_graph,
                 "use_cuda_graph": input_metadata.use_cuda_graph,
                 "kv_cache_dtype": input_metadata.kv_cache_dtype,
                 "kv_cache_dtype": input_metadata.kv_cache_dtype,
-                "kv_quant_params": input_metadata.kv_quant_params,
+                # "kv_quant_params": input_metadata.kv_quant_params,
                 "selected_token_indices":
                 "selected_token_indices":
-                sampling_metadata.selected_token_indices,  # noqa
+                sampling_metadata.selected_token_indices,
                 "lora_requests": lora_requests,
                 "lora_requests": lora_requests,
                 "lora_mapping": lora_mapping,
                 "lora_mapping": lora_mapping,
             }
             }
@@ -645,7 +661,7 @@ class ModelRunner:
                 block_tables=metadata_dict["block_tables"],
                 use_cuda_graph=metadata_dict["use_cuda_graph"],
                 kv_cache_dtype=metadata_dict["kv_cache_dtype"],
-                kv_quant_params=metadata_dict["kv_quant_params"],
+                # kv_quant_params=metadata_dict["kv_quant_params"],
             )
             sampling_metadata = SamplingMetadata(
                 seq_groups=None,
@@ -707,8 +723,7 @@ class ModelRunner:
     @torch.inference_mode()
     def profile_run(self) -> None:
         # Enable top-k sampling to reflect the accurate memory usage.
-        vocab_size = self.model_config.get_vocab_size()
-        sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
+        sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
         max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
         max_num_seqs = self.scheduler_config.max_num_seqs

@@ -789,8 +804,9 @@ class ModelRunner:
     @torch.inference_mode()
     def capture_model(self, kv_caches: List[KVCache]) -> None:
         # NOTE: This is a hack to ensure that the NCCL backend is never
-        # deleted before the CUDA graph
+        # deleted before the CUDA graphs.
         self.cupy_nccl_backend = cupy_utils.get_nccl_backend()
+
         assert not self.model_config.enforce_eager
         logger.info("Capturing the model for CUDA graphs. This may lead to "
                     "unexpected consequences if the model is not static. To "
@@ -818,8 +834,6 @@ class ModelRunner:
             bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
         ]

-        # NOTE: Capturing the largest batch size first may help reduce the
-        # memory usage of CUDA graph.
         # NOTE: There are 3 backends for all-reduce: custom all-reduce
         # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use
         # either custom all-reduce kernel or CuPy NCCL. When not using CUDA
@@ -847,7 +861,7 @@ class ModelRunner:
                     block_tables=block_tables[:batch_size],
                     use_cuda_graph=True,
                     kv_cache_dtype=self.kv_cache_dtype,
-                    kv_quant_params=self.kv_quant_params,
+                    # kv_quant_params=self.kv_quant_params,
                 )

                 if self.lora_config:
@@ -882,6 +896,10 @@ class ModelRunner:
         self.graph_runners.clear()
         self.cupy_nccl_backend = None

+    @property
+    def vocab_size(self) -> int:
+        return self.model_config.get_vocab_size()
+

 class CUDAGraphRunner:

@@ -916,14 +934,14 @@ class CUDAGraphRunner:
         # NOTE: Python 3.8 does not support multi-line with statements.
         # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
         self.graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(self.graph,
-                              pool=memory_pool), _maybe_cupy_nccl():
-            hidden_states = self.model(
-                input_ids,
-                positions,
-                kv_caches,
-                input_metadata,
-            )
+        with torch.cuda.graph(self.graph, pool=memory_pool):  # noqa: SIM117
+            with _maybe_cupy_nccl():
+                hidden_states = self.model(
+                    input_ids,
+                    positions,
+                    kv_caches,
+                    input_metadata,
+                )
         torch.cuda.synchronize()

         # Save the input and output buffers.
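
The CUDAGraphRunner hunk above only splits the `with` statement for Python 3.8 compatibility; the capture/replay pattern itself is standard PyTorch. A minimal sketch of that pattern, with a hypothetical `net` and `static_input` that are not part of this commit:

import torch

net = torch.nn.Linear(32, 32).cuda()
static_input = torch.randn(8, 32, device="cuda")

# Warm up on a side stream before capture, as PyTorch recommends.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_output = net(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture one forward pass into a CUDA graph.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = net(static_input)

# Replay: refill the captured input buffer in place, then launch the graph.
static_input.copy_(torch.randn(8, 32, device="cuda"))
graph.replay()
torch.cuda.synchronize()

Because replay reuses the captured buffers, callers only copy new inputs into those buffers and read results out of the saved output tensor, which is exactly why the runner saves its input and output buffers after capture.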

+ 204 - 0
aphrodite/task_handler/neuron_worker.py

@@ -0,0 +1,204 @@
+"""A Neuron worker class."""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.distributed
+
+from aphrodite.common.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    LoRAConfig,
+)
+from aphrodite.modeling import set_random_seed
+from aphrodite.modeling.megatron.communication_op import broadcast_tensor_dict
+from aphrodite.modeling.megatron.parallel_state import (
+    ensure_model_parallel_initialized, )
+from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+from aphrodite.task_handler.cache_engine import CacheEngine
+from aphrodite.task_handler.model_runner import ModelRunner
+
+
+class Worker:
+    """A worker class that executes the model on a group of neuron cores."""
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        lora_config: Optional[LoRAConfig] = None,
+        kv_cache_dtype: Optional[str] = "auto",
+        # kv_quant_params_path: Optional[str] = None,
+        is_driver_worker: bool = False,
+    ) -> None:
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.lora_config = lora_config
+        self.is_driver_worker = is_driver_worker
+        if self.is_driver_worker:
+            assert self.rank == 0, "The driver worker must have rank 0."
+
+        self.model_runner = ModelRunner(
+            model_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            lora_config=self.lora_config,
+            is_driver_worker=is_driver_worker,
+        )
+        # Uninitialized cache engine. Will be initialized by
+        # self.init_cache_engine().
+        self.cache_config = None
+        self.cache_engine = None
+        self.cache_events = None
+        self.gpu_cache = None
+
+    def init_model(self) -> None:
+        # Initialize the distributed environment.
+        _init_distributed_environment(
+            self.parallel_config,
+            self.rank,
+            self.distributed_init_method,
+            distributed_backend="gloo",
+        )
+
+        # Initialize the model.
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    @torch.inference_mode()
+    def profile_num_available_blocks(
+        self,
+        block_size: int = 128,
+        gpu_memory_utilization: float = 0.9,
+        cpu_swap_space: int = 0,
+        cache_dtype: str = "float16",
+    ) -> Tuple[int, int]:
+        """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.
+        """
+        num_gpu_blocks = self.scheduler_config.max_num_seqs
+        num_cpu_blocks = 0
+        return num_gpu_blocks, num_cpu_blocks
+
+    def init_cache_engine(self, cache_config: CacheConfig) -> None:
+        self.cache_config = cache_config
+        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
+                                        self.parallel_config)
+        self.model_runner.set_block_size(self.cache_engine.block_size)
+
+    def warm_up_model(self) -> None:
+        # Warm up is maintained in transformers-neuronx
+        pass
+
+    def cache_swap(
+        self,
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> None:
+        # Issue cache operations.
+        issued_cache_op = False
+        if blocks_to_swap_in:
+            self.cache_engine.swap_in(blocks_to_swap_in)
+            issued_cache_op = True
+        if blocks_to_swap_out:
+            self.cache_engine.swap_out(blocks_to_swap_out)
+            issued_cache_op = True
+        if blocks_to_copy:
+            self.cache_engine.copy(blocks_to_copy)
+            issued_cache_op = True
+
+        cache_events = self.cache_events if issued_cache_op else None
+
+        # Wait for cache operations to finish.
+        if cache_events is not None:
+            raise NotImplementedError(
+                "cache operations are not implemented for neuron backend.")
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
+        blocks_to_swap_in: Optional[Dict[int, int]] = None,
+        blocks_to_swap_out: Optional[Dict[int, int]] = None,
+        blocks_to_copy: Optional[Dict[int, List[int]]] = None,
+    ) -> Optional[SamplerOutput]:
+        if self.is_driver_worker:
+            assert seq_group_metadata_list is not None
+            num_seq_groups = len(seq_group_metadata_list)
+            assert blocks_to_swap_in is not None
+            assert blocks_to_swap_out is not None
+            assert blocks_to_copy is not None
+            data = {
+                "num_seq_groups": num_seq_groups,
+                "blocks_to_swap_in": blocks_to_swap_in,
+                "blocks_to_swap_out": blocks_to_swap_out,
+                "blocks_to_copy": blocks_to_copy,
+            }
+            broadcast_tensor_dict(data, src=0)
+        else:
+            data = broadcast_tensor_dict(src=0)
+            num_seq_groups = data["num_seq_groups"]
+            blocks_to_swap_in = data["blocks_to_swap_in"]
+            blocks_to_swap_out = data["blocks_to_swap_out"]
+            blocks_to_copy = data["blocks_to_copy"]
+
+        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
+
+        # If there is no input, we don't need to execute the model.
+        if num_seq_groups == 0:
+            return {}
+
+        output = self.model_runner.execute_model(seq_group_metadata_list,
+                                                 self.gpu_cache)
+        return output
+
+
+def _init_distributed_environment(
+    parallel_config: ParallelConfig,
+    rank: int,
+    distributed_init_method: Optional[str] = None,
+    distributed_backend: Optional[str] = None,
+) -> None:
+    """Initialize the distributed environment."""
+    if torch.distributed.is_initialized():
+        torch_world_size = torch.distributed.get_world_size()
+        if torch_world_size != parallel_config.world_size:
+            raise RuntimeError(
+                "torch.distributed is already initialized but the torch world "
+                "size does not match parallel_config.world_size "
+                f"({torch_world_size} vs. {parallel_config.world_size}).")
+    elif not distributed_init_method:
+        raise ValueError(
+            "distributed_init_method must be set if torch.distributed "
+            "is not already initialized")
+    else:
+        distributed_backend = (distributed_backend
+                               if distributed_backend else "nccl")
+        torch.distributed.init_process_group(
+            backend=distributed_backend,
+            world_size=parallel_config.world_size,
+            rank=rank,
+            init_method=distributed_init_method,
+        )
+
+    # A small all_reduce for warmup.
+    torch.distributed.all_reduce(torch.zeros(1))
+    ensure_model_parallel_initialized(
+        parallel_config.tensor_parallel_size,
+        parallel_config.pipeline_parallel_size,
+    )
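
For context on the control-plane pattern in `execute_model` above (the driver rank broadcasts the swap/copy metadata, the other ranks receive it), here is a minimal sketch using only standard `torch.distributed` calls. `broadcast_tensor_dict` is Aphrodite's own helper, so the object-list broadcast below is only an analogous stand-in, and the single-rank TCP rendezvous is a hypothetical example setup:

import torch.distributed as dist

# Gloo is the CPU-side backend the Neuron worker initializes above.
dist.init_process_group(backend="gloo",
                        init_method="tcp://127.0.0.1:29500",
                        world_size=1,
                        rank=0)

payload = [{"num_seq_groups": 2, "blocks_to_copy": {}}]
# Rank 0 (the driver) fills the list; other ranks receive it in place.
dist.broadcast_object_list(payload, src=0)
print(payload[0]["num_seq_groups"])

dist.destroy_process_group()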

+ 30 - 14
aphrodite/task_handler/worker.py

@@ -6,9 +6,9 @@ from typing import Dict, List, Tuple, Set, Optional
 import torch
 import torch.distributed

-from aphrodite.common.config import (CacheConfig, ModelConfig, ParallelConfig,
-                                     SchedulerConfig, LoRAConfig, DeviceConfig)
-from aphrodite.common.utils import in_wsl
+from aphrodite.common.config import (CacheConfig, DeviceConfig, ModelConfig,
+                                     ParallelConfig, SchedulerConfig,
+                                     LoRAConfig)
 from aphrodite.modeling import set_random_seed
 from aphrodite.modeling.megatron import cupy_utils
 from aphrodite.modeling.megatron.communication_op import (broadcast_tensor_dict
@@ -20,7 +20,7 @@ from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
 from aphrodite.task_handler.cache_engine import CacheEngine
 from aphrodite.task_handler.model_runner import ModelRunner
 from aphrodite.lora.request import LoRARequest
-from aphrodite.common.utils import is_hip
+from aphrodite.common.utils import in_wsl


 class Worker:
@@ -42,7 +42,7 @@ class Worker:
         distributed_init_method: str,
         lora_config: Optional[LoRAConfig] = None,
         kv_cache_dtype: Optional[str] = "auto",
-        kv_quant_params_path: Optional[str] = None,
+        # kv_quant_params_path: Optional[str] = None,
         is_driver_worker: bool = False,
     ) -> None:
         self.model_config = model_config
@@ -64,7 +64,7 @@ class Worker:
             device_config,
             lora_config=self.lora_config,
             kv_cache_dtype=kv_cache_dtype,
-            kv_quant_params_path=kv_quant_params_path,
+            # kv_quant_params_path=kv_quant_params_path,
             is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # self.init_cache_engine().
@@ -99,12 +99,9 @@ class Worker:
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
-
         # Initialize the distributed environment.
         init_distributed_environment(self.parallel_config, self.rank,
                                      cupy_port, self.distributed_init_method)
-        if not self.parallel_config.disable_custom_all_reduce:
-            init_custom_ar()
         # Initialize the model.
         set_random_seed(self.model_config.seed)

@@ -143,8 +140,8 @@ class Worker:
         # GPU did not change their memory usage during the profiling.
         peak_memory = self.init_gpu_memory - free_gpu_memory

-        cache_block_size = CacheEngine.get_cache_block_size(
-            block_size, cache_dtype, self.model_config, self.parallel_config)
+        cache_block_size = self.get_cache_block_size_bytes(
+            block_size, cache_dtype)
         num_gpu_blocks = int(
             (total_gpu_memory * gpu_memory_utilization - peak_memory) //
             cache_block_size)
@@ -195,7 +192,7 @@ class Worker:
         # Wait for cache operations to finish.
         # TODO: Profile swapping overhead and optimize if needed.
         if cache_events is not None:
-            for event in cache_events:  # pylint: disable=not-an-iterable
+            for event in cache_events:
                 event.wait()

     @torch.inference_mode()
@@ -245,6 +242,22 @@ class Worker:
     def list_loras(self) -> Set[int]:
         return self.model_runner.list_loras()

+    @property
+    def max_model_len(self) -> int:
+        return self.model_config.max_model_len
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_runner.vocab_size
+
+    def get_cache_block_size_bytes(self, block_size: int,
+                                   cache_dtype: str) -> int:
+        """Get the size of the KV cache block size in bytes.
+        """
+        return CacheEngine.get_cache_block_size(block_size, cache_dtype,
+                                                self.model_config,
+                                                self.parallel_config)
+

 def init_distributed_environment(
     parallel_config: ParallelConfig,
@@ -279,8 +292,7 @@ def init_distributed_environment(
                 "cupy.distributed is already initialized but the cupy world "
                 "cupy.distributed is already initialized but the cupy world "
                 "size does not match parallel_config.world_size "
                 "size does not match parallel_config.world_size "
                 f"({cupy_world_size} vs. {parallel_config.world_size}).")
                 f"({cupy_world_size} vs. {parallel_config.world_size}).")
-    elif (parallel_config.world_size > 1 and cupy_port is not None
-          and not is_hip()):
+    elif (parallel_config.world_size > 1 and cupy_port is not None):
         # NOTE: We don't initialize CuPy process group when world size
         # NOTE: We don't initialize CuPy process group when world size
         # is 1.
         # is 1.
         # TODO: Support multi-node connection.
         # TODO: Support multi-node connection.
@@ -298,6 +310,10 @@ def init_distributed_environment(
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)

+    # Initialize a custom fast all-reduce implementation.
+    if not parallel_config.disable_custom_all_reduce:
+        init_custom_ar()
+

 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.
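
The new `get_cache_block_size_bytes` helper above simply forwards to `CacheEngine.get_cache_block_size`; the sketch below shows the usual arithmetic behind such a helper and how it feeds the `num_gpu_blocks` computation in the profiling hunk. The shapes, dtype width, and memory figures are illustrative assumptions, not values taken from this repository:

def kv_cache_block_bytes(block_size: int, num_layers: int, num_kv_heads: int,
                         head_size: int, dtype_bytes: int = 2) -> int:
    # One block holds `block_size` tokens of both K and V for every layer.
    per_token = num_kv_heads * head_size * dtype_bytes
    return 2 * num_layers * block_size * per_token

total_gpu_memory = 24 * 2**30          # e.g. a 24 GiB GPU
gpu_memory_utilization = 0.9
peak_memory = 14 * 2**30               # e.g. weights + activations from profiling
block_bytes = kv_cache_block_bytes(block_size=16, num_layers=32,
                                    num_kv_heads=32, head_size=128)
num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory)
                     // block_bytes)
print(f"{block_bytes} bytes/block -> {num_gpu_blocks} GPU blocks")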

+ 1 - 2
kernels/attention/attention_dtypes.h

@@ -4,5 +4,4 @@
 #include "dtype_float16.cuh"
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_bfloat16.cuh"
 #include "dtype_bfloat16.cuh"
-#include "dtype_fp8_e5m2.cuh"
-#include "dtype_int8.cuh"
+#include "dtype_fp8_e5m2.cuh"

+ 936 - 1014
kernels/attention/attention_kernels.cu

@@ -16,1017 +16,939 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifdef USE_ROCM
-#include <hip/hip_runtime.h>
-#endif
-
-#include <torch/extension.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "attention_dtypes.h"
-#include "attention_utils.cuh"
-#include "../quantization/int8_kvcache/quant_utils.cuh"
-#ifdef ENABLE_FP8_E5M2
-#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
-#endif
-
-#include <algorithm>
-
-#ifndef USE_ROCM
-#define WARP_SIZE 32
-#else
-#define WARP_SIZE warpSize
-#endif
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
-
-enum kv_cache_dtype {
-  AUTO,
-#ifdef ENABLE_FP8_E5M2
-  FP8_E5M2,
-#endif
-  INT8};
-
-namespace aphrodite {
-
-// Utility function for attention softmax.
-template<int NUM_WARPS>
-inline __device__ float block_sum(float* red_smem, float sum) {
-  // Decompose the thread index into warp / lane.
-  int warp = threadIdx.x / WARP_SIZE;
-  int lane = threadIdx.x % WARP_SIZE;
-
-  // Compute the sum per warp.
-#pragma unroll
-  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
-    sum += APHRODITE_SHFL_XOR_SYNC(sum, mask);
-  }
-
-  // Warp leaders store the data to shared memory.
-  if (lane == 0) {
-    red_smem[warp] = sum;
-  }
-
-  // Make sure the data is in shared memory.
-  __syncthreads();
-
-  // The warps compute the final sums.
-  if (lane < NUM_WARPS) {
-    sum = red_smem[lane];
-  }
-
-  // Parallel reduction inside the warp.
-#pragma unroll
-  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-    sum += APHRODITE_SHFL_XOR_SYNC(sum, mask);
-  }
-
-  // Broadcast to other threads.
-  return APHRODITE_SHFL_SYNC(sum, 0);
-}
-
-// TODO: Merge the last two dimensions of the grid.
-// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
-  typename scalar_t,
-  typename cache_t,
-  int HEAD_SIZE,
-  int BLOCK_SIZE,
-  int NUM_THREADS,
-  kv_cache_dtype KV_CACHE_DTYPE,
-  int PARTITION_SIZE = 0> // Zero means no partitioning.
-__device__ void paged_attention_kernel(
-  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
-  float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
-  scalar_t* __restrict__ out,             // [num_seqs, num_heads, max_num_partitions, head_size]
-  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-  const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-  const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
-  const int num_kv_heads,                 // [num_heads]
-  const float scale,
-  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
-  const int max_num_blocks_per_seq,
-  const float* __restrict__ alibi_slopes, // [num_heads]
-  const int q_stride,
-  const int kv_block_stride,
-  const int kv_head_stride,
-  const float k_scale = 1.0f,
-  const float k_zp = 0.0f,
-  const float v_scale = 1.0f,
-  const float v_zp = 0.0f) {
-  const int seq_idx = blockIdx.y;
-  const int partition_idx = blockIdx.z;
-  const int max_num_partitions = gridDim.z;
-  constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
-  const int context_len = context_lens[seq_idx];
-  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
-    // No work to do. Terminate the thread block.
-    return;
-  }
-
-  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
-  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
-
-  // [start_block_idx, end_block_idx) is the range of blocks to process.
-  const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
-  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
-  const int num_blocks = end_block_idx - start_block_idx;
-
-  // [start_token_idx, end_token_idx) is the range of tokens to process.
-  const int start_token_idx = start_block_idx * BLOCK_SIZE;
-  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
-  const int num_tokens = end_token_idx - start_token_idx;
-
-  constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
-  assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
-  constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  const int thread_idx = threadIdx.x;
-  const int warp_idx = thread_idx / WARP_SIZE;
-  const int lane = thread_idx % WARP_SIZE;
-
-  const int head_idx = blockIdx.x;
-  const int num_heads = gridDim.x;
-  const int num_queries_per_kv = num_heads / num_kv_heads;
-  const int kv_head_idx = head_idx / num_queries_per_kv;
-  const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
-
-  // A vector type to store a part of a key or a query.
-  // The vector size is configured in such a way that the threads in a thread group
-  // fetch or compute 16 bytes at a time.
-  // For example, if the size of a thread group is 4 and the data type is half,
-  // then the vector size is 16 / (4 * sizeof(half)) == 2.
-  constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
-  using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
-  using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
-  using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type;
-
-  constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
-  constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
-
-  const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE;
-  const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE;
-
-  // Load the query to registers.
-  // Each thread in a thread group has a different part of the query.
-  // For example, if the the thread group size is 4, then the first thread in the group
-  // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
-  // th vectors of the query, and so on.
-  // NOTE: Because q is split from a qkv tensor, it may not be contiguous.
-  const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-  __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
-#pragma unroll
-  for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
-    const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
-    q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
-  }
-  __syncthreads(); // TODO: possible speedup if this is replaced with a memory wall right before we use q_vecs
-
-  // Memory planning.
-  extern __shared__ char shared_mem[];
-  // NOTE: We use FP32 for the softmax logits for better accuracy.
-  float* logits = reinterpret_cast<float*>(shared_mem);
-  // Workspace for reduction.
-  __shared__ float red_smem[2 * NUM_WARPS];
-
-  // x == THREAD_GROUP_SIZE * VEC_SIZE
-  // Each thread group fetches x elements from the key at a time.
-  constexpr int x = 16 / sizeof(cache_t);
-  float qk_max = -FLT_MAX;
-
-  // Iterate over the key blocks.
-  // Each warp fetches a block of keys for each iteration.
-  // Each thread group in a warp fetches a key from the block, and computes
-  // dot product with the query.
-  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
-  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
-    // NOTE: The block number is stored in int32. However, we cast it to int64
-    // because int32 can lead to overflow when this variable is multiplied by large numbers
-    // (e.g., kv_block_stride).
-    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
-
-    // Load a key to registers.
-    // Each thread in a thread group has a different part of the key.
-    // For example, if the the thread group size is 4, then the first thread in the group
-    // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
-    // vectors of the key, and so on.
-    for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
-      const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
-      const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
-      K_vec k_vecs[NUM_VECS_PER_THREAD];
-
-#pragma unroll
-      for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
-        const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
-                                       + kv_head_idx * kv_head_stride
-                                       + physical_block_offset * x;
-        const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
-        const int offset1 = (vec_idx * VEC_SIZE) / x;
-        const int offset2 = (vec_idx * VEC_SIZE) % x;
-        if constexpr (KV_CACHE_DTYPE == INT8) {
-          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
-          using Dequant_vec = typename FloatVec<Quant_vec>::Type;
-          Dequant_vec k_vec_dequant = int8::dequant(k_vec_quant, k_scale, k_zp);
-          k_vecs[j] = int8::vec_conversion<K_vec, Dequant_vec>(k_vec_dequant);
-#ifdef ENABLE_FP8_E5M2
-        } else if constexpr (KV_CACHE_DTYPE == FP8_E5M2) {
-          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
-          // Vector conversion from Quant_vec to K_vec.
-          k_vecs[j] = fp8_e5m2_unscaled::vec_conversion<K_vec, Quant_vec>(k_vec_quant);
-#endif
-        } else {
-          k_vecs[j] = *reinterpret_cast<const K_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
-        }
-      }
-
-      // Compute dot product.
-      // This includes a reduction across the threads in the same thread group.
-      float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
-      // Add the ALiBi bias if slopes are given.
-      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
-
-      if (thread_group_offset == 0) {
-        // Store the partial reductions to shared memory.
-        // NOTE: It is required to zero out the masked logits.
-        const bool mask = token_idx >= context_len;
-        logits[token_idx - start_token_idx] = mask ? 0.f : qk;
-        // Update the max value.
-        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-      }
-    }
-  }
-
-  // Perform reduction across the threads in the same warp to get the
-  // max qk value for each "warp" (not across the thread block yet).
-  // The 0-th thread of each thread group already has its max qk value.
-#pragma unroll
-  for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
-    qk_max = fmaxf(qk_max, APHRODITE_SHFL_XOR_SYNC(qk_max, mask));
-  }
-  if (lane == 0) {
-    red_smem[warp_idx] = qk_max;
-  }
-  __syncthreads();
-
-  // TODO: Refactor this part.
-  // Get the max qk value for the sequence.
-  qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
-#pragma unroll
-  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-    qk_max = fmaxf(qk_max, APHRODITE_SHFL_XOR_SYNC(qk_max, mask));
-  }
-  // Broadcast the max qk value to all threads.
-  qk_max = APHRODITE_SHFL_SYNC(qk_max, 0);
-
-  // Get the sum of the exp values.
-  float exp_sum = 0.f;
-  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-    float val = __expf(logits[i] - qk_max);
-    logits[i] = val;
-    exp_sum += val;
-  }
-  exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
-
-  // Compute softmax.
-  const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
-  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-    logits[i] *= inv_sum;
-  }
-  __syncthreads();
-
-  // If partitioning is enabled, store the max logit and exp_sum.
-  if (USE_PARTITIONING && thread_idx == 0) {
-    float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
-                                       + head_idx * max_num_partitions
-                                       + partition_idx;
-    *max_logits_ptr = qk_max;
-    float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
-                                   + head_idx * max_num_partitions
-                                   + partition_idx;
-    *exp_sums_ptr = exp_sum;
-  }
-
-  // Each thread will fetch 16 bytes from the value cache at a time.
-  constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
-  using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
-  using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
-  using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
-  using Float_L_vec = typename FloatVec<L_vec>::Type;
-
-  constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
-  constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
-  constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
-
-  // NOTE: We use FP32 for the accumulator for better accuracy.
-  float accs[NUM_ROWS_PER_THREAD];
-#pragma unroll
-  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-    accs[i] = 0.f;
-  }
-
-  scalar_t zero_value;
-  zero(zero_value);
-  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
-    // NOTE: The block number is stored in int32. However, we cast it to int64
-    // because int32 can lead to overflow when this variable is multiplied by large numbers
-    // (e.g., kv_block_stride).
-    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
-    const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
-    const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
-    L_vec logits_vec;
-    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx));
-
-    const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
-                                   + kv_head_idx * kv_head_stride;
-#pragma unroll
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-      if (row_idx < HEAD_SIZE) {
-        const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
-        V_vec v_vec;
-        if constexpr (KV_CACHE_DTYPE == INT8) {
-          // dequant and conversion
-          V_quant_vec v_vec_quant = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
-          using V_dequant_vec = typename FloatVec<V_quant_vec>::Type;
-          V_dequant_vec v_vec_dequant = int8::dequant(v_vec_quant, v_scale, v_zp);
-          v_vec = int8::vec_conversion<V_vec, V_dequant_vec>(v_vec_dequant);
-#ifdef ENABLE_FP8_E5M2
-        } else if constexpr (KV_CACHE_DTYPE == FP8_E5M2) {
-          V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
-          // Vector conversion from V_quant_vec to V_vec.
-          v_vec = fp8_e5m2_unscaled::vec_conversion<V_vec, V_quant_vec>(v_quant_vec);
-#endif
-        } else {
-          v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
-        }
-        if (block_idx == num_context_blocks - 1) {
-          // NOTE: When v_vec contains the tokens that are out of the context,
-          // we should explicitly zero out the values since they may contain NaNs.
-          scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
-#pragma unroll
-          for (int j = 0; j < V_VEC_SIZE; j++) {
-            v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
-          }
-        }
-        accs[i] += dot(logits_vec, v_vec);
-      }
-    }
-  }
-
-  // Perform reduction within each warp.
-#pragma unroll
-  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-    float acc = accs[i];
-#pragma unroll
-    for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
-      acc += APHRODITE_SHFL_XOR_SYNC(acc, mask);
-    }
-    accs[i] = acc;
-  }
-
-  // NOTE: A barrier is required because the shared memory space for logits
-  // is reused for the output.
-  __syncthreads();
-
-  // Perform reduction across warps.
-  float* out_smem = reinterpret_cast<float*>(shared_mem);
-#pragma unroll
-  for (int i = NUM_WARPS; i > 1; i /= 2) {
-    int mid = i / 2;
-    // Upper warps write to shared memory.
-    if (warp_idx >= mid && warp_idx < i) {
-      float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-#pragma unroll
-      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
-          dst[row_idx] = accs[i];
-        }
-      }
-    }
-    __syncthreads();
-
-    // Lower warps update the output.
-    if (warp_idx < mid) {
-      const float* src = &out_smem[warp_idx * HEAD_SIZE];
-#pragma unroll
-      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
-          accs[i] += src[row_idx];
-        }
-      }
-    }
-    __syncthreads();
-  }
-
-  // Write the final output.
-  if (warp_idx == 0) {
-    scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
-                            + head_idx * max_num_partitions * HEAD_SIZE
-                            + partition_idx * HEAD_SIZE;
-#pragma unroll
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-      if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
-        from_float(*(out_ptr + row_idx), accs[i]);
-      }
-    }
-  }
-}
-
-// Grid: (num_heads, num_seqs, 1).
-template<
-  typename scalar_t,
-  typename cache_t,
-  int HEAD_SIZE,
-  int BLOCK_SIZE,
-  int NUM_THREADS,
-  kv_cache_dtype KV_CACHE_DTYPE>
-__global__ void paged_attention_v1_kernel(
-  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
-  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-  const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-  const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
-  const int num_kv_heads,                 // [num_heads]
-  const float scale,
-  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
-  const int max_num_blocks_per_seq,
-  const float* __restrict__ alibi_slopes, // [num_heads]
-  const int q_stride,
-  const int kv_block_stride,
-  const int kv_head_stride,
-  const float k_scale,
-  const float k_zp,
-  const float v_scale,
-  const float v_zp) {
-  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_CACHE_DTYPE>(
-    /* exp_sums */ nullptr, /* max_logits */ nullptr,
-    out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
-    max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, k_scale, k_zp, v_scale, v_zp);
-}
-
-// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
-  typename scalar_t,
-  typename cache_t,
-  int HEAD_SIZE,
-  int BLOCK_SIZE,
-  int NUM_THREADS,
-  kv_cache_dtype KV_CACHE_DTYPE,
-  int PARTITION_SIZE>
-__global__ void paged_attention_v2_kernel(
-  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
-  float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
-  scalar_t* __restrict__ tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
-  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-  const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-  const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
-  const int num_kv_heads,                 // [num_heads]
-  const float scale,
-  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
-  const int* __restrict__ context_lens,   // [num_seqs]
-  const int max_num_blocks_per_seq,
-  const float* __restrict__ alibi_slopes, // [num_heads]
-  const int q_stride,
-  const int kv_block_stride,
-  const int kv_head_stride,
-  const float k_scale,
-  const float k_zp,
-  const float v_scale,
-  const float v_zp) {
-  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_CACHE_DTYPE, PARTITION_SIZE>(
-    exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
-    block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
-    q_stride, kv_block_stride, kv_head_stride, k_scale, k_zp, v_scale, v_zp);
-}
-
-// Grid: (num_heads, num_seqs).
-template<
-  typename scalar_t,
-  int HEAD_SIZE,
-  int NUM_THREADS,
-  int PARTITION_SIZE>
-__global__ void paged_attention_v2_reduce_kernel(
-  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
-  const float* __restrict__ exp_sums,     // [num_seqs, num_heads, max_num_partitions]
-  const float* __restrict__ max_logits,   // [num_seqs, num_heads, max_num_partitions]
-  const scalar_t* __restrict__ tmp_out,   // [num_seqs, num_heads, max_num_partitions, head_size]
-  const int* __restrict__ context_lens,   // [num_seqs]
-  const int max_num_partitions) {
-  const int num_heads = gridDim.x;
-  const int head_idx = blockIdx.x;
-  const int seq_idx = blockIdx.y;
-  const int context_len = context_lens[seq_idx];
-  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  if (num_partitions == 1) {
-    // No need to reduce. Only copy tmp_out to out.
-    scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
-    const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
-                                          + head_idx * max_num_partitions * HEAD_SIZE;
-    for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
-      out_ptr[i] = tmp_out_ptr[i];
-    }
-    // Terminate the thread block.
-    return;
-  }
-
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  const int warp_idx = threadIdx.x / WARP_SIZE;
-  const int lane = threadIdx.x % WARP_SIZE;
-
-  // Size: 2 * num_partitions.
-  extern __shared__ char shared_mem[];
-  // Workspace for reduction.
-  __shared__ float red_smem[2 * NUM_WARPS];
-
-  // Load max logits to shared memory.
-  float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
-  const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
-                                           + head_idx * max_num_partitions;
-  float max_logit = -FLT_MAX;
-  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
-    const float l = max_logits_ptr[i];
-    shared_max_logits[i] = l;
-    max_logit = fmaxf(max_logit, l);
-  }
-  __syncthreads();
-
-  // Get the global max logit.
-  // Reduce within the warp.
-#pragma unroll
-  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
-    max_logit = fmaxf(max_logit, APHRODITE_SHFL_XOR_SYNC(max_logit, mask));
-  }
-  if (lane == 0) {
-    red_smem[warp_idx] = max_logit;
-  }
-  __syncthreads();
-  // Reduce across warps.
-  max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
-#pragma unroll
-  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-    max_logit = fmaxf(max_logit, APHRODITE_SHFL_XOR_SYNC(max_logit, mask));
-  }
-  // Broadcast the max value to all threads.
-  max_logit = APHRODITE_SHFL_SYNC(max_logit, 0);
-
-  // Load rescaled exp sums to shared memory.
-  float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
-  const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
-                                       + head_idx * max_num_partitions;
-  float global_exp_sum = 0.0f;
-  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
-    float l = shared_max_logits[i];
-    float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit);
-    global_exp_sum += rescaled_exp_sum;
-    shared_exp_sums[i] = rescaled_exp_sum;
-  }
-  __syncthreads();
-  global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum);
-  const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
-
-  // Aggregate tmp_out to out.
-  const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
-                                        + head_idx * max_num_partitions * HEAD_SIZE;
-  scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
-#pragma unroll
-  for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
-    float acc = 0.0f;
-    for (int j = 0; j < num_partitions; ++j) {
-      acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
-    }
-    from_float(out_ptr[i], acc);
-  }
-}
-
-} // namespace aphrodite
-
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                                        \
-  APHRODITE_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                                        \
-    ((void*)aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,    \
-      KV_CACHE_DTYPE>), shared_mem_size);                                                           \
-  aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,              \
-  KV_CACHE_DTYPE><<<grid, block, shared_mem_size, stream>>>(                                        \
-    out_ptr,                                                                                        \
-    query_ptr,                                                                                      \
-    key_cache_ptr,                                                                                  \
-    value_cache_ptr,                                                                                \
-    num_kv_heads,                                                                                   \
-    scale,                                                                                          \
-    block_tables_ptr,                                                                               \
-    context_lens_ptr,                                                                               \
-    max_num_blocks_per_seq,                                                                         \
-    alibi_slopes_ptr,                                                                               \
-    q_stride,                                                                                       \
-    kv_block_stride,                                                                                \
-    kv_head_stride,                                                                                 \
-    k_scale,                                                                                        \
-    k_zp,                                                                                           \
-    v_scale,                                                                                        \
-    v_zp);
-
-// TODO: Tune NUM_THREADS.
-template<
-  typename T,
-  typename CACHE_T,
-  int BLOCK_SIZE,
-  kv_cache_dtype KV_CACHE_DTYPE,
-  int NUM_THREADS = 128>
-void paged_attention_v1_launcher(
-  torch::Tensor& out,
-  torch::Tensor& query,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  int num_kv_heads,
-  float scale,
-  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
-  const float k_scale,
-  const float k_zp,
-  const float v_scale,
-  const float v_zp) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-
-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr = alibi_slopes ?
-    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-    : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
-  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
-
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
-  int logits_size = padded_max_context_len * sizeof(float);
-  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
-  // Python-side check in aphrodite.task_handler.worker._check_if_can_support_max_seq_len
-  // Keep that in sync with the logic here!
-  int shared_mem_size = std::max(logits_size, outputs_size);
-
-  dim3 grid(num_heads, num_seqs, 1);
-  dim3 block(NUM_THREADS);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  switch (head_size) {
-    // NOTE: To reduce the compilation time, we only compile for the
-    // head sizes that we use in the model. However, we can easily extend this
-    // to support any head size which is a multiple of 16.
-    case 64:
-      LAUNCH_PAGED_ATTENTION_V1(64);
-      break;
-    case 80:
-      LAUNCH_PAGED_ATTENTION_V1(80);
-      break;
-    case 96:
-      LAUNCH_PAGED_ATTENTION_V1(96);
-      break;
-    case 112:
-      LAUNCH_PAGED_ATTENTION_V1(112);
-      break;
-    case 128:
-      LAUNCH_PAGED_ATTENTION_V1(128);
-      break;
-    case 256:
-      LAUNCH_PAGED_ATTENTION_V1(256);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE)             \
-  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE>(       \
-    out,                                                                     \
-    query,                                                                   \
-    key_cache,                                                               \
-    value_cache,                                                             \
-    num_kv_heads,                                                            \
-    scale,                                                                   \
-    block_tables,                                                            \
-    context_lens,                                                            \
-    max_context_len,                                                         \
-    alibi_slopes,                                                            \
-    k_scale,                                                                 \
-    k_zp,                                                                    \
-    v_scale,                                                                 \
-    v_zp);
-
-// NOTE: To reduce the compilation time, we omitted block sizes
-// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_CACHE_DTYPE)       \
-  switch (block_size) {                                               \
-    case 8:                                                           \
-      CALL_V1_LAUNCHER(T, CACHE_T, 8, KV_CACHE_DTYPE);                \
-      break;                                                          \
-    case 16:                                                          \
-      CALL_V1_LAUNCHER(T, CACHE_T, 16, KV_CACHE_DTYPE);               \
-      break;                                                          \
-    case 32:                                                          \
-      CALL_V1_LAUNCHER(T, CACHE_T, 32, KV_CACHE_DTYPE);               \
-      break;                                                          \
-    default:                                                          \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size);     \
-      break;                                                          \
-  }
-
-void paged_attention_v1(
-  torch::Tensor& out,             // [num_seqs, num_heads, head_size]
-  torch::Tensor& query,           // [num_seqs, num_heads, head_size]
-  torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
-  torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
-  int num_kv_heads,               // [num_heads]
-  float scale,
-  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
-  torch::Tensor& context_lens,    // [num_seqs]
-  int block_size,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype,
-  const float k_scale = 1.0f,
-  const float k_zp = 0.0f,
-  const float v_scale = 1.0f,
-  const float v_zp = 0.0f) {
-  if (kv_cache_dtype == "auto") {
-    if (query.dtype() == at::ScalarType::Float) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, AUTO);
-    } else if (query.dtype() == at::ScalarType::Half) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, AUTO);
-    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, AUTO);
-    } else {
-      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
-    }
-#ifdef ENABLE_FP8_E5M2
-  } else if (kv_cache_dtype == "fp8_e5m2") {
-    if (query.dtype() == at::ScalarType::Float) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, FP8_E5M2);
-    } else if (query.dtype() == at::ScalarType::Half) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, FP8_E5M2);
-    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, FP8_E5M2);
-    } else {
-      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
-    }
-#endif
-  } else if (kv_cache_dtype == "int8") {
-    if (query.dtype() == at::ScalarType::Float) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(float, int8_t, INT8);
-    } else if (query.dtype() == at::ScalarType::Half) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, int8_t, INT8);
-    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, int8_t, INT8);
-    } else {
-      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
-    }
-  } else {
-    TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
-  }
-}
-
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                                  \
-  aphrodite::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,        \
-  KV_CACHE_DTYPE, PARTITION_SIZE>                                                             \
-  <<<grid, block, shared_mem_size, stream>>>(                                                 \
-    exp_sums_ptr,                                                                             \
-    max_logits_ptr,                                                                           \
-    tmp_out_ptr,                                                                              \
-    query_ptr,                                                                                \
-    key_cache_ptr,                                                                            \
-    value_cache_ptr,                                                                          \
-    num_kv_heads,                                                                             \
-    scale,                                                                                    \
-    block_tables_ptr,                                                                         \
-    context_lens_ptr,                                                                         \
-    max_num_blocks_per_seq,                                                                   \
-    alibi_slopes_ptr,                                                                         \
-    q_stride,                                                                                 \
-    kv_block_stride,                                                                          \
-    kv_head_stride,                                                                           \
-    k_scale,                                                                                  \
-    k_zp,                                                                                     \
-    v_scale,                                                                                  \
-    v_zp);                                                                                    \
-  aphrodite::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE>           \
-  <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(                                   \
-    out_ptr,                                                                                  \
-    exp_sums_ptr,                                                                             \
-    max_logits_ptr,                                                                           \
-    tmp_out_ptr,                                                                              \
-    context_lens_ptr,                                                                         \
-    max_num_partitions);
-
-template<
-  typename T,
-  typename CACHE_T,
-  int BLOCK_SIZE,
-  kv_cache_dtype KV_CACHE_DTYPE,
-  int NUM_THREADS = 128,
-  int PARTITION_SIZE = 512>
-void paged_attention_v2_launcher(
-  torch::Tensor& out,
-  torch::Tensor& exp_sums,
-  torch::Tensor& max_logits,
-  torch::Tensor& tmp_out,
-  torch::Tensor& query,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  int num_kv_heads,
-  float scale,
-  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
-  const float k_scale,
-  const float k_zp,
-  const float v_scale,
-  const float v_zp) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-
-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr = alibi_slopes ?
-    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-    : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
-  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
-  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
-  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
-
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
-  int logits_size = PARTITION_SIZE * sizeof(float);
-  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
-
-  // For paged attention v2 kernel.
-  dim3 grid(num_heads, num_seqs, max_num_partitions);
-  int shared_mem_size = std::max(logits_size, outputs_size);
-  // For paged attention v2 reduce kernel.
-  dim3 reduce_grid(num_heads, num_seqs);
-  int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
-
-  dim3 block(NUM_THREADS);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  switch (head_size) {
-    // NOTE: To reduce the compilation time, we only compile for the
-    // head sizes that we use in the model. However, we can easily extend this
-    // to support any head size which is a multiple of 16.
-    case 64:
-      LAUNCH_PAGED_ATTENTION_V2(64);
-      break;
-    case 80:
-      LAUNCH_PAGED_ATTENTION_V2(80);
-      break;
-    case 96:
-      LAUNCH_PAGED_ATTENTION_V2(96);
-      break;
-    case 112:
-      LAUNCH_PAGED_ATTENTION_V2(112);
-      break;
-    case 128:
-      LAUNCH_PAGED_ATTENTION_V2(128);
-      break;
-    case 256:
-      LAUNCH_PAGED_ATTENTION_V2(256);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE)                 \
-  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE>(           \
-    out,                                                                         \
-    exp_sums,                                                                    \
-    max_logits,                                                                  \
-    tmp_out,                                                                     \
-    query,                                                                       \
-    key_cache,                                                                   \
-    value_cache,                                                                 \
-    num_kv_heads,                                                                \
-    scale,                                                                       \
-    block_tables,                                                                \
-    context_lens,                                                                \
-    max_context_len,                                                             \
-    alibi_slopes,                                                                \
-    k_scale,                                                                     \
-    k_zp,                                                                        \
-    v_scale,                                                                     \
-    v_zp);
-
-// NOTE: To reduce the compilation time, we omitted block sizes
-// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_CACHE_DTYPE)             \
-  switch (block_size) {                                                     \
-    case 8:                                                                 \
-      CALL_V2_LAUNCHER(T, CACHE_T, 8, KV_CACHE_DTYPE);                      \
-      break;                                                                \
-    case 16:                                                                \
-      CALL_V2_LAUNCHER(T, CACHE_T, 16, KV_CACHE_DTYPE);                     \
-      break;                                                                \
-    case 32:                                                                \
-      CALL_V2_LAUNCHER(T, CACHE_T, 32, KV_CACHE_DTYPE);                     \
-      break;                                                                \
-    default:                                                                \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size);           \
-      break;                                                                \
-  }
-
-void paged_attention_v2(
-  torch::Tensor& out,             // [num_seqs, num_heads, head_size]
-  torch::Tensor& exp_sums,        // [num_seqs, num_heads, max_num_partitions]
-  torch::Tensor& max_logits,      // [num_seqs, num_heads, max_num_partitions]
-  torch::Tensor& tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
-  torch::Tensor& query,           // [num_seqs, num_heads, head_size]
-  torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
-  torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
-  int num_kv_heads,               // [num_heads]
-  float scale,
-  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
-  torch::Tensor& context_lens,    // [num_seqs]
-  int block_size,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
-  const std::string& kv_cache_dtype,
-  const float k_scale = 1.0f,
-  const float k_zp = 0.0f,
-  const float v_scale = 1.0f,
-  const float v_zp = 0.0f) {
-  if (kv_cache_dtype == "auto") {
-    if (query.dtype() == at::ScalarType::Float) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, AUTO);
-    } else if (query.dtype() == at::ScalarType::Half) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, AUTO);
-    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, AUTO);
-    } else {
-      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
-    }
-#ifdef ENABLE_FP8_E5M2
-  } else if (kv_cache_dtype == "fp8_e5m2") {
-    if (query.dtype() == at::ScalarType::Float) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, FP8_E5M2);
-    } else if (query.dtype() == at::ScalarType::Half) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, FP8_E5M2);
-    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, FP8_E5M2);
-    } else {
-      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
-    }
-#endif
-  } else if (kv_cache_dtype == "int8") {
-    if (query.dtype() == at::ScalarType::Float) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(float, int8_t, INT8);
-    } else if (query.dtype() == at::ScalarType::Half) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, int8_t, INT8);
-    } else if (query.dtype() == at::ScalarType::BFloat16) {
-      CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, int8_t, INT8);
-    } else {
-      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
-    }
-  } else {
-    TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
-  }
-}
-
-#undef WARP_SIZE
-#undef MAX
-#undef MIN
-#undef DIVIDE_ROUND_UP
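For orientation, a minimal sketch (not part of the commit) of how the host entry point changes in the rewritten attention_kernels.cu that follows: the per-tensor quantization arguments (k_scale, k_zp, v_scale, v_zp) and the "int8" KV-cache branch are dropped, and the kv_cache_dtype template enum becomes a boolean IS_FP8_E5M2_KV_CACHE flag. Default argument values are omitted here for brevity.

// Old entry point: cache dtype string plus quantization scale/zero-point args.
void paged_attention_v1(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        int num_kv_heads, float scale,
                        torch::Tensor& block_tables, torch::Tensor& context_lens,
                        int block_size, int max_context_len,
                        const c10::optional<torch::Tensor>& alibi_slopes,
                        const std::string& kv_cache_dtype,
                        float k_scale, float k_zp, float v_scale, float v_zp);

// New entry point: only the cache dtype string remains; "fp8_e5m2" vs. "auto"
// is dispatched through the bool IS_FP8_E5M2_KV_CACHE template parameter.
void paged_attention_v1(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        int num_kv_heads, float scale,
                        torch::Tensor& block_tables, torch::Tensor& context_lens,
                        int block_size, int max_context_len,
                        const c10::optional<torch::Tensor>& alibi_slopes,
                        const std::string& kv_cache_dtype);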
+ #ifdef USE_ROCM
+ #include <hip/hip_runtime.h>
+ #endif
+ 
+ #include <torch/extension.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <c10/cuda/CUDAGuard.h>
+ 
+ #include "attention_dtypes.h"
+ #include "attention_utils.cuh"
+ #ifdef ENABLE_FP8_E5M2
+ #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+ #endif
+ 
+ #include <algorithm>
+ 
+ #ifndef USE_ROCM
+ #define WARP_SIZE 32
+ #else
+ #define WARP_SIZE warpSize
+ #endif
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
+ #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+ 
+ namespace aphrodite {
+ 
+ // Utility function for attention softmax.
+ template<int NUM_WARPS>
+ inline __device__ float block_sum(float* red_smem, float sum) {
+   // Decompose the thread index into warp / lane.
+   int warp = threadIdx.x / WARP_SIZE;
+   int lane = threadIdx.x % WARP_SIZE;
+ 
+   // Compute the sum per warp.
+ #pragma unroll
+   for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+     sum += APHRODITE_SHFL_XOR_SYNC(sum, mask);
+   }
+ 
+   // Warp leaders store the data to shared memory.
+   if (lane == 0) {
+     red_smem[warp] = sum;
+   }
+ 
+   // Make sure the data is in shared memory.
+   __syncthreads();
+ 
+   // The warps compute the final sums.
+   if (lane < NUM_WARPS) {
+     sum = red_smem[lane];
+   }
+ 
+   // Parallel reduction inside the warp.
+ #pragma unroll
+   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+     sum += APHRODITE_SHFL_XOR_SYNC(sum, mask);
+   }
+ 
+   // Broadcast to other threads.
+   return APHRODITE_SHFL_SYNC(sum, 0);
+ }
+ 
+ // TODO: Merge the last two dimensions of the grid.
+ // Grid: (num_heads, num_seqs, max_num_partitions).
+ template<
+   typename scalar_t,
+   typename cache_t,
+   int HEAD_SIZE,
+   int BLOCK_SIZE,
+   int NUM_THREADS,
+   bool IS_FP8_E5M2_KV_CACHE,
+   int PARTITION_SIZE = 0> // Zero means no partitioning.
+ __device__ void paged_attention_kernel(
+   float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
+   float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
+   scalar_t* __restrict__ out,             // [num_seqs, num_heads, max_num_partitions, head_size]
+   const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+   const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+   const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+   const int num_kv_heads,                 // [num_heads]
+   const float scale,
+   const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+   const int* __restrict__ context_lens,   // [num_seqs]
+   const int max_num_blocks_per_seq,
+   const float* __restrict__ alibi_slopes, // [num_heads]
+   const int q_stride,
+   const int kv_block_stride,
+   const int kv_head_stride) {
+   const int seq_idx = blockIdx.y;
+   const int partition_idx = blockIdx.z;
+   const int max_num_partitions = gridDim.z;
+   constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
+   const int context_len = context_lens[seq_idx];
+   if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+     // No work to do. Terminate the thread block.
+     return;
+   }
+ 
+   const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
+   const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+ 
+   // [start_block_idx, end_block_idx) is the range of blocks to process.
+   const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+   const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+   const int num_blocks = end_block_idx - start_block_idx;
+ 
+   // [start_token_idx, end_token_idx) is the range of tokens to process.
+   const int start_token_idx = start_block_idx * BLOCK_SIZE;
+   const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+   const int num_tokens = end_token_idx - start_token_idx;
+ 
+   constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+   constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
+   assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
+   constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+   const int thread_idx = threadIdx.x;
+   const int warp_idx = thread_idx / WARP_SIZE;
+   const int lane = thread_idx % WARP_SIZE;
+ 
+   const int head_idx = blockIdx.x;
+   const int num_heads = gridDim.x;
+   const int num_queries_per_kv = num_heads / num_kv_heads;
+   const int kv_head_idx = head_idx / num_queries_per_kv;
+   const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+ 
+   // A vector type to store a part of a key or a query.
+   // The vector size is configured in such a way that the threads in a thread group
+   // fetch or compute 16 bytes at a time.
+   // For example, if the size of a thread group is 4 and the data type is half,
+   // then the vector size is 16 / (4 * sizeof(half)) == 2.
+   constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
+   using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
+   using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
+ #ifdef ENABLE_FP8_E5M2
+   using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type;
+ #endif
+ 
+   constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
+   constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
+ 
+   const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE;
+   const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE;
+ 
+   // Load the query to registers.
+   // Each thread in a thread group has a different part of the query.
+   // For example, if the thread group size is 4, then the first thread in the group
+   // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
+   // th vectors of the query, and so on.
+   // NOTE: Because q is split from a qkv tensor, it may not be contiguous.
+   const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
+   __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
+ #pragma unroll
+   for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
+     const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
+     q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
+   }
+   __syncthreads(); // TODO: possible speedup if this is replaced with a memory fence right before we use q_vecs
+ 
+   // Memory planning.
+   extern __shared__ char shared_mem[];
+   // NOTE: We use FP32 for the softmax logits for better accuracy.
+   float* logits = reinterpret_cast<float*>(shared_mem);
+   // Workspace for reduction.
+   __shared__ float red_smem[2 * NUM_WARPS];
+ 
+   // x == THREAD_GROUP_SIZE * VEC_SIZE
+   // Each thread group fetches x elements from the key at a time.
+   constexpr int x = 16 / sizeof(cache_t);
+   float qk_max = -FLT_MAX;
+ 
+   // Iterate over the key blocks.
+   // Each warp fetches a block of keys for each iteration.
+   // Each thread group in a warp fetches a key from the block, and computes
+   // dot product with the query.
+   const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+   for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+     // NOTE: The block number is stored in int32. However, we cast it to int64
+     // because int32 can lead to overflow when this variable is multiplied by large numbers
+     // (e.g., kv_block_stride).
+     const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
+ 
+     // Load a key to registers.
+     // Each thread in a thread group has a different part of the key.
+     // For example, if the thread group size is 4, then the first thread in the group
+     // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
+     // vectors of the key, and so on.
+     for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+       const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+       const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+       K_vec k_vecs[NUM_VECS_PER_THREAD];
+ 
+ #pragma unroll
+       for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
+         const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+                                        + kv_head_idx * kv_head_stride
+                                        + physical_block_offset * x;
+         const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
+         const int offset1 = (vec_idx * VEC_SIZE) / x;
+         const int offset2 = (vec_idx * VEC_SIZE) % x;
+         if constexpr (IS_FP8_E5M2_KV_CACHE) {
+ #ifdef ENABLE_FP8_E5M2
+           Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+           // Vector conversion from Quant_vec to K_vec.
+           k_vecs[j] = fp8_e5m2_unscaled::vec_conversion<K_vec, Quant_vec>(k_vec_quant);
+ #else
+           assert(false);
+ #endif
+         } else {
+           k_vecs[j] = *reinterpret_cast<const K_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+         }
+       }
+ 
+       // Compute dot product.
+       // This includes a reduction across the threads in the same thread group.
+       float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
+       // Add the ALiBi bias if slopes are given.
+       qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
+ 
+       if (thread_group_offset == 0) {
+         // Store the partial reductions to shared memory.
+         // NOTE: It is required to zero out the masked logits.
+         const bool mask = token_idx >= context_len;
+         logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+         // Update the max value.
+         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+       }
+     }
+   }
+ 
+   // Perform reduction across the threads in the same warp to get the
+   // max qk value for each "warp" (not across the thread block yet).
+   // The 0-th thread of each thread group already has its max qk value.
+ #pragma unroll
+   for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+     qk_max = fmaxf(qk_max, APHRODITE_SHFL_XOR_SYNC(qk_max, mask));
+   }
+   if (lane == 0) {
+     red_smem[warp_idx] = qk_max;
+   }
+   __syncthreads();
+ 
+   // TODO: Refactor this part.
+   // Get the max qk value for the sequence.
+   qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
+ #pragma unroll
+   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+     qk_max = fmaxf(qk_max, APHRODITE_SHFL_XOR_SYNC(qk_max, mask));
+   }
+   // Broadcast the max qk value to all threads.
+   qk_max = APHRODITE_SHFL_SYNC(qk_max, 0);
+ 
+   // Get the sum of the exp values.
+   float exp_sum = 0.f;
+   for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+     float val = __expf(logits[i] - qk_max);
+     logits[i] = val;
+     exp_sum += val;
+   }
+   exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
+ 
+   // Compute softmax.
+   const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+   for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+     logits[i] *= inv_sum;
+   }
+   __syncthreads();
+ 
+   // If partitioning is enabled, store the max logit and exp_sum.
+   if (USE_PARTITIONING && thread_idx == 0) {
+     float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+                                        + head_idx * max_num_partitions
+                                        + partition_idx;
+     *max_logits_ptr = qk_max;
+     float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+                                    + head_idx * max_num_partitions
+                                    + partition_idx;
+     *exp_sums_ptr = exp_sum;
+   }
+ 
+   // Each thread will fetch 16 bytes from the value cache at a time.
+   constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
+   using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
+   using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
+ #ifdef ENABLE_FP8_E5M2
+   using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
+ #endif
+   using Float_L_vec = typename FloatVec<L_vec>::Type;
+ 
+   constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
+   constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
+   constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
+ 
+   // NOTE: We use FP32 for the accumulator for better accuracy.
+   float accs[NUM_ROWS_PER_THREAD];
+ #pragma unroll
+   for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+     accs[i] = 0.f;
+   }
+ 
+   scalar_t zero_value;
+   zero(zero_value);
+   for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+     // NOTE: The block number is stored in int32. However, we cast it to int64
+     // because int32 can lead to overflow when this variable is multiplied by large numbers
+     // (e.g., kv_block_stride).
+     const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
+     const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
+     const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+     L_vec logits_vec;
+     from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx));
+ 
+     const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
+                                    + kv_head_idx * kv_head_stride;
+ #pragma unroll
+     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+       const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+       if (row_idx < HEAD_SIZE) {
+         const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
+         V_vec v_vec;
+         if constexpr (IS_FP8_E5M2_KV_CACHE) {
+ #ifdef ENABLE_FP8_E5M2
+           V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
+           // Vector conversion from V_quant_vec to V_vec.
+           v_vec = fp8_e5m2_unscaled::vec_conversion<V_vec, V_quant_vec>(v_quant_vec);
+ #else
+           assert(false);
+ #endif
+         } else {
+           v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
+         }
+         if (block_idx == num_context_blocks - 1) {
+           // NOTE: When v_vec contains the tokens that are out of the context,
+           // we should explicitly zero out the values since they may contain NaNs.
+           scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
+ #pragma unroll
+           for (int j = 0; j < V_VEC_SIZE; j++) {
+             v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
+           }
+         }
+         accs[i] += dot(logits_vec, v_vec);
+       }
+     }
+   }
+ 
+   // Perform reduction within each warp.
+ #pragma unroll
+   for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+     float acc = accs[i];
+ #pragma unroll
+     for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+       acc += APHRODITE_SHFL_XOR_SYNC(acc, mask);
+     }
+     accs[i] = acc;
+   }
+ 
+   // NOTE: A barrier is required because the shared memory space for logits
+   // is reused for the output.
+   __syncthreads();
+ 
+   // Perform reduction across warps.
+   float* out_smem = reinterpret_cast<float*>(shared_mem);
+ #pragma unroll
+   for (int i = NUM_WARPS; i > 1; i /= 2) {
+     int mid = i / 2;
+     // Upper warps write to shared memory.
+     if (warp_idx >= mid && warp_idx < i) {
+       float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+ #pragma unroll
+       for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+         const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+         if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+           dst[row_idx] = accs[i];
+         }
+       }
+     }
+     __syncthreads();
+ 
+     // Lower warps update the output.
+     if (warp_idx < mid) {
+       const float* src = &out_smem[warp_idx * HEAD_SIZE];
+ #pragma unroll
+       for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+         const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+         if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+           accs[i] += src[row_idx];
+         }
+       }
+     }
+     __syncthreads();
+   }
+ 
+   // Write the final output.
+   if (warp_idx == 0) {
+     scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                             + head_idx * max_num_partitions * HEAD_SIZE
+                             + partition_idx * HEAD_SIZE;
+ #pragma unroll
+     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+       const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+       if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+         from_float(*(out_ptr + row_idx), accs[i]);
+       }
+     }
+   }
+ }
+ 
+ // Grid: (num_heads, num_seqs, 1).
+ template<
+   typename scalar_t,
+   typename cache_t,
+   int HEAD_SIZE,
+   int BLOCK_SIZE,
+   int NUM_THREADS,
+   bool IS_FP8_E5M2_KV_CACHE>
+ __global__ void paged_attention_v1_kernel(
+   scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+   const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+   const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+   const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+   const int num_kv_heads,                 // [num_heads]
+   const float scale,
+   const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+   const int* __restrict__ context_lens,   // [num_seqs]
+   const int max_num_blocks_per_seq,
+   const float* __restrict__ alibi_slopes, // [num_heads]
+   const int q_stride,
+   const int kv_block_stride,
+   const int kv_head_stride) {
+   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE>(
+     /* exp_sums */ nullptr, /* max_logits */ nullptr,
+     out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
+     max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride);
+ }
+ 
+ // Grid: (num_heads, num_seqs, max_num_partitions).
+ template<
+   typename scalar_t,
+   typename cache_t,
+   int HEAD_SIZE,
+   int BLOCK_SIZE,
+   int NUM_THREADS,
+   bool IS_FP8_E5M2_KV_CACHE,
+   int PARTITION_SIZE>
+ __global__ void paged_attention_v2_kernel(
+   float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
+   float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
+   scalar_t* __restrict__ tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
+   const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+   const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+   const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+   const int num_kv_heads,                 // [num_heads]
+   const float scale,
+   const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+   const int* __restrict__ context_lens,   // [num_seqs]
+   const int max_num_blocks_per_seq,
+   const float* __restrict__ alibi_slopes, // [num_heads]
+   const int q_stride,
+   const int kv_block_stride,
+   const int kv_head_stride) {
+   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE>(
+     exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
+     block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
+     q_stride, kv_block_stride, kv_head_stride);
+ }
+ 
+ // Grid: (num_heads, num_seqs).
+ template<
+   typename scalar_t,
+   int HEAD_SIZE,
+   int NUM_THREADS,
+   int PARTITION_SIZE>
+ __global__ void paged_attention_v2_reduce_kernel(
+   scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+   const float* __restrict__ exp_sums,     // [num_seqs, num_heads, max_num_partitions]
+   const float* __restrict__ max_logits,   // [num_seqs, num_heads, max_num_partitions]
+   const scalar_t* __restrict__ tmp_out,   // [num_seqs, num_heads, max_num_partitions, head_size]
+   const int* __restrict__ context_lens,   // [num_seqs]
+   const int max_num_partitions) {
+   const int num_heads = gridDim.x;
+   const int head_idx = blockIdx.x;
+   const int seq_idx = blockIdx.y;
+   const int context_len = context_lens[seq_idx];
+   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+   if (num_partitions == 1) {
+     // No need to reduce. Only copy tmp_out to out.
+     scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+     const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                                           + head_idx * max_num_partitions * HEAD_SIZE;
+     for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
+       out_ptr[i] = tmp_out_ptr[i];
+     }
+     // Terminate the thread block.
+     return;
+   }
+ 
+   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+   const int warp_idx = threadIdx.x / WARP_SIZE;
+   const int lane = threadIdx.x % WARP_SIZE;
+ 
+   // Size: 2 * num_partitions.
+   extern __shared__ char shared_mem[];
+   // Workspace for reduction.
+   __shared__ float red_smem[2 * NUM_WARPS];
+ 
+   // Load max logits to shared memory.
+   float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
+   const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+                                            + head_idx * max_num_partitions;
+   float max_logit = -FLT_MAX;
+   for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+     const float l = max_logits_ptr[i];
+     shared_max_logits[i] = l;
+     max_logit = fmaxf(max_logit, l);
+   }
+   __syncthreads();
+ 
+   // Get the global max logit.
+   // Reduce within the warp.
+ #pragma unroll
+   for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+     max_logit = fmaxf(max_logit, APHRODITE_SHFL_XOR_SYNC(max_logit, mask));
+   }
+   if (lane == 0) {
+     red_smem[warp_idx] = max_logit;
+   }
+   __syncthreads();
+   // Reduce across warps.
+   max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
+ #pragma unroll
+   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+     max_logit = fmaxf(max_logit, APHRODITE_SHFL_XOR_SYNC(max_logit, mask));
+   }
+   // Broadcast the max value to all threads.
+   max_logit = APHRODITE_SHFL_SYNC(max_logit, 0);
+ 
+   // Load rescaled exp sums to shared memory.
+   float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
+   const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+                                        + head_idx * max_num_partitions;
+   float global_exp_sum = 0.0f;
+   for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+     float l = shared_max_logits[i];
+     float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit);
+     global_exp_sum += rescaled_exp_sum;
+     shared_exp_sums[i] = rescaled_exp_sum;
+   }
+   __syncthreads();
+   global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum);
+   const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
+ 
+   // Aggregate tmp_out to out.
+   const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                                         + head_idx * max_num_partitions * HEAD_SIZE;
+   scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ #pragma unroll
+   for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
+     float acc = 0.0f;
+     for (int j = 0; j < num_partitions; ++j) {
+       acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
+     }
+     from_float(out_ptr[i], acc);
+   }
+ }
+ 
+ } // namespace aphrodite
+ 
+ #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                                  \
+   APHRODITE_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                                       \
+     ((void*)aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,   \
+       IS_FP8_E5M2_KV_CACHE>), shared_mem_size);                                               \
+   aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,             \
+   IS_FP8_E5M2_KV_CACHE><<<grid, block, shared_mem_size, stream>>>(                            \
+     out_ptr,                                                                                  \
+     query_ptr,                                                                                \
+     key_cache_ptr,                                                                            \
+     value_cache_ptr,                                                                          \
+     num_kv_heads,                                                                             \
+     scale,                                                                                    \
+     block_tables_ptr,                                                                         \
+     context_lens_ptr,                                                                         \
+     max_num_blocks_per_seq,                                                                   \
+     alibi_slopes_ptr,                                                                         \
+     q_stride,                                                                                 \
+     kv_block_stride,                                                                          \
+     kv_head_stride);
+ 
+ // TODO: Tune NUM_THREADS.
+ template<
+   typename T,
+   typename CACHE_T,
+   int BLOCK_SIZE,
+   bool IS_FP8_E5M2_KV_CACHE,
+   int NUM_THREADS = 128>
+ void paged_attention_v1_launcher(
+   torch::Tensor& out,
+   torch::Tensor& query,
+   torch::Tensor& key_cache,
+   torch::Tensor& value_cache,
+   int num_kv_heads,
+   float scale,
+   torch::Tensor& block_tables,
+   torch::Tensor& context_lens,
+   int max_context_len,
+   const c10::optional<torch::Tensor>& alibi_slopes) {
+   int num_seqs = query.size(0);
+   int num_heads = query.size(1);
+   int head_size = query.size(2);
+   int max_num_blocks_per_seq = block_tables.size(1);
+   int q_stride = query.stride(0);
+   int kv_block_stride = key_cache.stride(0);
+   int kv_head_stride = key_cache.stride(1);
+ 
+   int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+   assert(head_size % thread_group_size == 0);
+ 
+   // NOTE: alibi_slopes is optional.
+   const float* alibi_slopes_ptr = alibi_slopes ?
+     reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+     : nullptr;
+ 
+   T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+   T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+   CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+   CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+   int* block_tables_ptr = block_tables.data_ptr<int>();
+   int* context_lens_ptr = context_lens.data_ptr<int>();
+ 
+   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+   int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
+   int logits_size = padded_max_context_len * sizeof(float);
+   int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+   // Python-side check in aphrodite.task_handler.worker._check_if_can_support_max_seq_len
+   // Keep that in sync with the logic here!
+   int shared_mem_size = std::max(logits_size, outputs_size);
+ 
+   dim3 grid(num_heads, num_seqs, 1);
+   dim3 block(NUM_THREADS);
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+   switch (head_size) {
+     // NOTE: To reduce the compilation time, we only compile for the
+     // head sizes that we use in the model. However, we can easily extend this
+     // to support any head size which is a multiple of 16.
+     case 64:
+       LAUNCH_PAGED_ATTENTION_V1(64);
+       break;
+     case 80:
+       LAUNCH_PAGED_ATTENTION_V1(80);
+       break;
+     case 96:
+       LAUNCH_PAGED_ATTENTION_V1(96);
+       break;
+     case 112:
+       LAUNCH_PAGED_ATTENTION_V1(112);
+       break;
+     case 128:
+       LAUNCH_PAGED_ATTENTION_V1(128);
+       break;
+     case 256:
+       LAUNCH_PAGED_ATTENTION_V1(256);
+       break;
+     default:
+       TORCH_CHECK(false, "Unsupported head size: ", head_size);
+       break;
+   }
+ }
+ 
+ #define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE)       \
+   paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE>( \
+     out,                                                                     \
+     query,                                                                   \
+     key_cache,                                                               \
+     value_cache,                                                             \
+     num_kv_heads,                                                            \
+     scale,                                                                   \
+     block_tables,                                                            \
+     context_lens,                                                            \
+     max_context_len,                                                         \
+     alibi_slopes);
+ 
+ // NOTE: To reduce the compilation time, we omitted block sizes
+ // 1, 2, 4, 64, 128, 256.
+ #define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \
+   switch (block_size) {                                               \
+     case 8:                                                           \
+       CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE);          \
+       break;                                                          \
+     case 16:                                                          \
+       CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE);         \
+       break;                                                          \
+     case 32:                                                          \
+       CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE);         \
+       break;                                                          \
+     default:                                                          \
+       TORCH_CHECK(false, "Unsupported block size: ", block_size);     \
+       break;                                                          \
+   }
+ 
+ void paged_attention_v1(
+   torch::Tensor& out,             // [num_seqs, num_heads, head_size]
+   torch::Tensor& query,           // [num_seqs, num_heads, head_size]
+   torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
+   torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
+   int num_kv_heads,               // [num_heads]
+   float scale,
+   torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
+   torch::Tensor& context_lens,    // [num_seqs]
+   int block_size,
+   int max_context_len,
+   const c10::optional<torch::Tensor>& alibi_slopes,
+   const std::string& kv_cache_dtype) {
+   if (kv_cache_dtype == "auto") {
+     if (query.dtype() == at::ScalarType::Float) {
+       CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false);
+     } else if (query.dtype() == at::ScalarType::Half) {
+       CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
+     } else if (query.dtype() == at::ScalarType::BFloat16) {
+       CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
+     } else {
+       TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+     }
+   } else if (kv_cache_dtype == "fp8_e5m2") {
+     if (query.dtype() == at::ScalarType::Float) {
+       CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
+     } else if (query.dtype() == at::ScalarType::Half) {
+       CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
+     } else if (query.dtype() == at::ScalarType::BFloat16) {
+       CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
+     } else {
+       TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+     }
+   } else {
+     TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+   }
+ }
+ 
+ #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                                  \
+   aphrodite::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,             \
+   IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE>                                                       \
+   <<<grid, block, shared_mem_size, stream>>>(                                                 \
+     exp_sums_ptr,                                                                             \
+     max_logits_ptr,                                                                           \
+     tmp_out_ptr,                                                                              \
+     query_ptr,                                                                                \
+     key_cache_ptr,                                                                            \
+     value_cache_ptr,                                                                          \
+     num_kv_heads,                                                                             \
+     scale,                                                                                    \
+     block_tables_ptr,                                                                         \
+     context_lens_ptr,                                                                         \
+     max_num_blocks_per_seq,                                                                   \
+     alibi_slopes_ptr,                                                                         \
+     q_stride,                                                                                 \
+     kv_block_stride,                                                                          \
+     kv_head_stride);                                                                          \
+   aphrodite::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE>           \
+   <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(                                   \
+     out_ptr,                                                                                  \
+     exp_sums_ptr,                                                                             \
+     max_logits_ptr,                                                                           \
+     tmp_out_ptr,                                                                              \
+     context_lens_ptr,                                                                         \
+     max_num_partitions);
+ 
+ template<
+   typename T,
+   typename CACHE_T,
+   int BLOCK_SIZE,
+   bool IS_FP8_E5M2_KV_CACHE,
+   int NUM_THREADS = 128,
+   int PARTITION_SIZE = 512>
+ void paged_attention_v2_launcher(
+   torch::Tensor& out,
+   torch::Tensor& exp_sums,
+   torch::Tensor& max_logits,
+   torch::Tensor& tmp_out,
+   torch::Tensor& query,
+   torch::Tensor& key_cache,
+   torch::Tensor& value_cache,
+   int num_kv_heads,
+   float scale,
+   torch::Tensor& block_tables,
+   torch::Tensor& context_lens,
+   int max_context_len,
+   const c10::optional<torch::Tensor>& alibi_slopes) {
+   int num_seqs = query.size(0);
+   int num_heads = query.size(1);
+   int head_size = query.size(2);
+   int max_num_blocks_per_seq = block_tables.size(1);
+   int q_stride = query.stride(0);
+   int kv_block_stride = key_cache.stride(0);
+   int kv_head_stride = key_cache.stride(1);
+ 
+   int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+   assert(head_size % thread_group_size == 0);
+ 
+   // NOTE: alibi_slopes is optional.
+   const float* alibi_slopes_ptr = alibi_slopes ?
+     reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+     : nullptr;
+ 
+   T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+   float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
+   float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
+   T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
+   T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+   CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+   CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+   int* block_tables_ptr = block_tables.data_ptr<int>();
+   int* context_lens_ptr = context_lens.data_ptr<int>();
+ 
+   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+   int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+   int logits_size = PARTITION_SIZE * sizeof(float);
+   int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+ 
+   // For paged attention v2 kernel.
+   dim3 grid(num_heads, num_seqs, max_num_partitions);
+   int shared_mem_size = std::max(logits_size, outputs_size);
+   // For paged attention v2 reduce kernel.
+   dim3 reduce_grid(num_heads, num_seqs);
+   int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
+ 
+   dim3 block(NUM_THREADS);
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+   switch (head_size) {
+     // NOTE: To reduce the compilation time, we only compile for the
+     // head sizes that we use in the model. However, we can easily extend this
+     // to support any head size which is a multiple of 16.
+     case 64:
+       LAUNCH_PAGED_ATTENTION_V2(64);
+       break;
+     case 80:
+       LAUNCH_PAGED_ATTENTION_V2(80);
+       break;
+     case 96:
+       LAUNCH_PAGED_ATTENTION_V2(96);
+       break;
+     case 112:
+       LAUNCH_PAGED_ATTENTION_V2(112);
+       break;
+     case 128:
+       LAUNCH_PAGED_ATTENTION_V2(128);
+       break;
+     case 256:
+       LAUNCH_PAGED_ATTENTION_V2(256);
+       break;
+     default:
+       TORCH_CHECK(false, "Unsupported head size: ", head_size);
+       break;
+   }
+ }
+ 
+ #define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE)           \
+   paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE>(     \
+     out,                                                                         \
+     exp_sums,                                                                    \
+     max_logits,                                                                  \
+     tmp_out,                                                                     \
+     query,                                                                       \
+     key_cache,                                                                   \
+     value_cache,                                                                 \
+     num_kv_heads,                                                                \
+     scale,                                                                       \
+     block_tables,                                                                \
+     context_lens,                                                                \
+     max_context_len,                                                             \
+     alibi_slopes);
+ 
+ // NOTE: To reduce the compilation time, we omitted block sizes
+ // 1, 2, 4, 64, 128, 256.
+ #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE)       \
+   switch (block_size) {                                                     \
+     case 8:                                                                 \
+       CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE);                \
+       break;                                                                \
+     case 16:                                                                \
+       CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE);               \
+       break;                                                                \
+     case 32:                                                                \
+       CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE);               \
+       break;                                                                \
+     default:                                                                \
+       TORCH_CHECK(false, "Unsupported block size: ", block_size);           \
+       break;                                                                \
+   }
+ 
+ void paged_attention_v2(
+   torch::Tensor& out,             // [num_seqs, num_heads, head_size]
+   torch::Tensor& exp_sums,        // [num_seqs, num_heads, max_num_partitions]
+   torch::Tensor& max_logits,      // [num_seqs, num_heads, max_num_partitions]
+   torch::Tensor& tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
+   torch::Tensor& query,           // [num_seqs, num_heads, head_size]
+   torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
+   torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
+   int num_kv_heads,               // [num_heads]
+   float scale,
+   torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
+   torch::Tensor& context_lens,    // [num_seqs]
+   int block_size,
+   int max_context_len,
+   const c10::optional<torch::Tensor>& alibi_slopes,
+   const std::string& kv_cache_dtype) {
+   if (kv_cache_dtype == "auto") {
+     if (query.dtype() == at::ScalarType::Float) {
+       CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false);
+     } else if (query.dtype() == at::ScalarType::Half) {
+       CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
+     } else if (query.dtype() == at::ScalarType::BFloat16) {
+       CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
+     } else {
+       TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+     }
+   } else if (kv_cache_dtype == "fp8_e5m2") {
+     if (query.dtype() == at::ScalarType::Float) {
+       CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
+     } else if (query.dtype() == at::ScalarType::Half) {
+       CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
+     } else if (query.dtype() == at::ScalarType::BFloat16) {
+       CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
+     } else {
+       TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+     }
+   } else {
+     TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+   }
+ }
+ 
+ #undef WARP_SIZE
+ #undef MAX
+ #undef MIN
+ #undef DIVIDE_ROUND_UP
+ 

+ 255 - 262
kernels/attention/dtype_float32.cuh

@@ -16,265 +16,258 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
-
-#include "attention_generic.cuh"
-
-#include <stdint.h>
-
-namespace aphrodite {
-
-// Define custom FP32 vector data types.
-struct Float4_ {
-  float2 x;
-  float2 y;
-};
-
-struct Float8_ {
-  float2 x;
-  float2 y;
-  float2 z;
-  float2 w;
-};
-
-// FP32 vector types for Q, K, V.
-template<>
-struct Vec<float, 1> {
-  using Type = float;
-};
-template<>
-struct Vec<float, 2> {
-  using Type = float2;
-};
-template<>
-struct Vec<float, 4> {
-  using Type = float4;
-};
-
-// FP32 accumulator vector types corresponding to Vec.
-template<>
-struct FloatVec<float> {
-  using Type = float;
-};
-template<>
-struct FloatVec<float2> {
-  using Type = float2;
-};
-template<>
-struct FloatVec<float4> {
-  using Type = float4;
-};
-
-// Vector addition.
-inline __device__ float add(float a, float b) {
-  return a + b;
-}
-
-inline __device__ float2 add(float2 a, float2 b) {
-  float2 c;
-  c.x = add(a.x, b.x);
-  c.y = add(a.y, b.y);
-  return c;
-}
-
-inline __device__ float4 add(float4 a, float4 b) {
-  float4 c;
-  c.x = add(a.x, b.x);
-  c.y = add(a.y, b.y);
-  c.z = add(a.z, b.z);
-  c.w = add(a.w, b.w);
-  return c;
-}
-
-inline __device__ Float4_ add(Float4_ a, Float4_ b) {
-  Float4_ c;
-  c.x = add(a.x, b.x);
-  c.y = add(a.y, b.y);
-  return c;
-}
-
-// Vector multiplication.
-template<>
-inline __device__ float mul<float, float>(float a, float b) {
-  return a * b;
-}
-
-template<>
-inline __device__ float2 mul(float2 a, float2 b) {
-  float2 c;
-  c.x = a.x * b.x;
-  c.y = a.y * b.y;
-  return c;
-}
-
-template<>
-inline __device__ float2 mul(float a, float2 b) {
-  float2 c;
-  c.x = a * b.x;
-  c.y = a * b.y;
-  return c;
-}
-
-template<>
-inline __device__ float4 mul(float4 a, float4 b) {
-  float4 c;
-  c.x = a.x * b.x;
-  c.y = a.y * b.y;
-  c.z = a.z * b.z;
-  c.w = a.w * b.w;
-  return c;
-}
-
-template<>
-inline __device__ float4 mul(float a, float4 b) {
-  float4 c;
-  c.x = a * b.x;
-  c.y = a * b.y;
-  c.z = a * b.z;
-  c.w = a * b.w;
-  return c;
-}
-
-// Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) {
-  return a * b + c;
-}
-
-inline __device__ float2 fma(float2 a, float2 b, float2 c) {
-  float2 d;
-  d.x = fma(a.x, b.x, c.x);
-  d.y = fma(a.y, b.y, c.y);
-  return d;
-}
-
-inline __device__ float2 fma(float a, float2 b, float2 c) {
-  float2 d;
-  d.x = fma(a, b.x, c.x);
-  d.y = fma(a, b.y, c.y);
-  return d;
-}
-
-inline __device__ float4 fma(float4 a, float4 b, float4 c) {
-  float4 d;
-  d.x = fma(a.x, b.x, c.x);
-  d.y = fma(a.y, b.y, c.y);
-  d.z = fma(a.z, b.z, c.z);
-  d.w = fma(a.w, b.w, c.w);
-  return d;
-}
-
-inline __device__ float4 fma(float a, float4 b, float4 c) {
-  float4 d;
-  d.x = fma(a, b.x, c.x);
-  d.y = fma(a, b.y, c.y);
-  d.z = fma(a, b.z, c.z);
-  d.w = fma(a, b.w, c.w);
-  return d;
-}
-
-inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) {
-  Float4_ d;
-  d.x = fma(a, b.x, c.x);
-  d.y = fma(a, b.y, c.y);
-  return d;
-}
-
-inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
-  Float8_ d;
-  d.x = fma(a, b.x, c.x);
-  d.y = fma(a, b.y, c.y);
-  d.z = fma(a, b.z, c.z);
-  d.w = fma(a, b.w, c.w);
-  return d;
-}
-
-// Vector sum.
-template<>
-inline __device__ float sum(float v) {
-  return v;
-}
-
-template<>
-inline __device__ float sum(float2 v) {
-  return v.x + v.y;
-}
-
-template<>
-inline __device__ float sum(float4 v) {
-  return v.x + v.y + v.z + v.w;
-}
-
-template<>
-inline __device__ float sum(Float4_ v) {
-  return v.x.x + v.x.y + v.y.x + v.y.y;
-}
-
-template<>
-inline __device__ float sum(Float8_ v) {
-  return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
-}
-
-// Vector dot product.
-inline __device__ float dot(float a, float b) {
-  return a * b;
-}
-
-inline __device__ float dot(float2 a, float2 b) {
-  float2 c = mul<float2, float2, float2>(a, b);
-  return c.x + c.y;
-}
-
-inline __device__ float dot(Float4_ a, Float4_ b) {
-  float2 acc = mul<float2, float2, float2>(a.x, b.x);
-  acc = fma(a.y, b.y, acc);
-  return acc.x + acc.y;
-}
-
-inline __device__ float dot(Float8_ a, Float8_ b) {
-  float2 acc = mul<float2, float2, float2>(a.x, b.x);
-  acc = fma(a.y, b.y, acc);
-  acc = fma(a.z, b.z, acc);
-  acc = fma(a.w, b.w, acc);
-  return acc.x + acc.y;
-}
-
-// From float to float.
-inline __device__ void from_float(float& dst, float src) {
-  dst = src;
-}
-
-inline __device__ void from_float(float2& dst, float2 src) {
-  dst = src;
-}
-
-inline __device__ void from_float(float4& dst, float4 src) {
-  dst = src;
-}
-
-// From float to float.
-inline __device__ float to_float(float u) {
-  return u;
-}
-
-inline __device__ float2 to_float(float2 u) {
-  return u;
-}
-
-inline __device__ float4 to_float(float4 u) {
-  return u;
-}
-
-inline __device__ Float4_ to_float(Float4_ u) {
-  return u;
-}
-
-inline __device__ Float8_ to_float(Float8_ u) {
-  return u;
-}
-
-// Zero-out a variable.
-inline __device__ void zero(float& dst) {
-  dst = 0.f;
-}
-
-} // namespace aphrodite
+ #pragma once
+
+ #include "attention_generic.cuh"
+ 
+ #include <stdint.h>
+ 
+ namespace aphrodite {
+ 
+ // Define custom FP32 vector data types.
+ struct Float4_ {
+   float2 x;
+   float2 y;
+ };
+ 
+ struct Float8_ {
+   float2 x;
+   float2 y;
+   float2 z;
+   float2 w;
+ };
+ 
+ // FP32 vector types for Q, K, V.
+ template<>
+ struct Vec<float, 1> {
+   using Type = float;
+ };
+ template<>
+ struct Vec<float, 2> {
+   using Type = float2;
+ };
+ template<>
+ struct Vec<float, 4> {
+   using Type = float4;
+ };
+ 
+ // FP32 accumulator vector types corresponding to Vec.
+ template<>
+ struct FloatVec<float> {
+   using Type = float;
+ };
+ template<>
+ struct FloatVec<float2> {
+   using Type = float2;
+ };
+ template<>
+ struct FloatVec<float4> {
+   using Type = float4;
+ };
+ 
+ // Vector addition.
+ inline __device__ float add(float a, float b) {
+   return a + b;
+ }
+ 
+ inline __device__ float2 add(float2 a, float2 b) {
+   float2 c;
+   c.x = add(a.x, b.x);
+   c.y = add(a.y, b.y);
+   return c;
+ }
+ 
+ inline __device__ float4 add(float4 a, float4 b) {
+   float4 c;
+   c.x = add(a.x, b.x);
+   c.y = add(a.y, b.y);
+   c.z = add(a.z, b.z);
+   c.w = add(a.w, b.w);
+   return c;
+ }
+ 
+ // Vector multiplication.
+ template<>
+ inline __device__ float mul<float, float>(float a, float b) {
+   return a * b;
+ }
+ 
+ template<>
+ inline __device__ float2 mul(float2 a, float2 b) {
+   float2 c;
+   c.x = a.x * b.x;
+   c.y = a.y * b.y;
+   return c;
+ }
+ 
+ template<>
+ inline __device__ float2 mul(float a, float2 b) {
+   float2 c;
+   c.x = a * b.x;
+   c.y = a * b.y;
+   return c;
+ }
+ 
+ template<>
+ inline __device__ float4 mul(float4 a, float4 b) {
+   float4 c;
+   c.x = a.x * b.x;
+   c.y = a.y * b.y;
+   c.z = a.z * b.z;
+   c.w = a.w * b.w;
+   return c;
+ }
+ 
+ template<>
+ inline __device__ float4 mul(float a, float4 b) {
+   float4 c;
+   c.x = a * b.x;
+   c.y = a * b.y;
+   c.z = a * b.z;
+   c.w = a * b.w;
+   return c;
+ }
+ 
+ // Vector fused multiply-add.
+ inline __device__ float fma(float a, float b, float c) {
+   return a * b + c;
+ }
+ 
+ inline __device__ float2 fma(float2 a, float2 b, float2 c) {
+   float2 d;
+   d.x = fma(a.x, b.x, c.x);
+   d.y = fma(a.y, b.y, c.y);
+   return d;
+ }
+ 
+ inline __device__ float2 fma(float a, float2 b, float2 c) {
+   float2 d;
+   d.x = fma(a, b.x, c.x);
+   d.y = fma(a, b.y, c.y);
+   return d;
+ }
+ 
+ inline __device__ float4 fma(float4 a, float4 b, float4 c) {
+   float4 d;
+   d.x = fma(a.x, b.x, c.x);
+   d.y = fma(a.y, b.y, c.y);
+   d.z = fma(a.z, b.z, c.z);
+   d.w = fma(a.w, b.w, c.w);
+   return d;
+ }
+ 
+ inline __device__ float4 fma(float a, float4 b, float4 c) {
+   float4 d;
+   d.x = fma(a, b.x, c.x);
+   d.y = fma(a, b.y, c.y);
+   d.z = fma(a, b.z, c.z);
+   d.w = fma(a, b.w, c.w);
+   return d;
+ }
+ 
+ inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) {
+   Float4_ d;
+   d.x = fma(a, b.x, c.x);
+   d.y = fma(a, b.y, c.y);
+   return d;
+ }
+ 
+ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
+   Float8_ d;
+   d.x = fma(a, b.x, c.x);
+   d.y = fma(a, b.y, c.y);
+   d.z = fma(a, b.z, c.z);
+   d.w = fma(a, b.w, c.w);
+   return d;
+ }
+ 
+ // Vector sum.
+ template<>
+ inline __device__ float sum(float v) {
+   return v;
+ }
+ 
+ template<>
+ inline __device__ float sum(float2 v) {
+   return v.x + v.y;
+ }
+ 
+ template<>
+ inline __device__ float sum(float4 v) {
+   return v.x + v.y + v.z + v.w;
+ }
+ 
+ template<>
+ inline __device__ float sum(Float4_ v) {
+   return v.x.x + v.x.y + v.y.x + v.y.y;
+ }
+ 
+ template<>
+ inline __device__ float sum(Float8_ v) {
+   return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
+ }
+ 
+ // Vector dot product.
+ inline __device__ float dot(float a, float b) {
+   return a * b;
+ }
+ 
+ inline __device__ float dot(float2 a, float2 b) {
+   float2 c = mul<float2, float2, float2>(a, b);
+   return c.x + c.y;
+ }
+ 
+ inline __device__ float dot(Float4_ a, Float4_ b) {
+   float2 acc = mul<float2, float2, float2>(a.x, b.x);
+   acc = fma(a.y, b.y, acc);
+   return acc.x + acc.y;
+ }
+ 
+ inline __device__ float dot(Float8_ a, Float8_ b) {
+   float2 acc = mul<float2, float2, float2>(a.x, b.x);
+   acc = fma(a.y, b.y, acc);
+   acc = fma(a.z, b.z, acc);
+   acc = fma(a.w, b.w, acc);
+   return acc.x + acc.y;
+ }
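+ 
+ // Illustrative note: Float8_ packs eight floats as four float2 halves, so the
+ // dot() overload above reduces eight lane-wise products; e.g. if every lane of
+ // a is 1.f and every lane of b is 2.f, the result is 16.f.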
+ 
+ // From float to float.
+ inline __device__ void from_float(float& dst, float src) {
+   dst = src;
+ }
+ 
+ inline __device__ void from_float(float2& dst, float2 src) {
+   dst = src;
+ }
+ 
+ inline __device__ void from_float(float4& dst, float4 src) {
+   dst = src;
+ }
+ 
+ // From float to float.
+ inline __device__ float to_float(float u) {
+   return u;
+ }
+ 
+ inline __device__ float2 to_float(float2 u) {
+   return u;
+ }
+ 
+ inline __device__ float4 to_float(float4 u) {
+   return u;
+ }
+ 
+ inline __device__ Float4_ to_float(Float4_ u) {
+   return u;
+ }
+ 
+ inline __device__ Float8_ to_float(Float8_ u) {
+   return u;
+ }
+ 
+ // Zero-out a variable.
+ inline __device__ void zero(float& dst) {
+   dst = 0.f;
+ }
+ 
+ } // namespace aphrodite

+ 1 - 0
kernels/backup/README

@@ -0,0 +1 @@
+Backup of the attention and cache kernels with INT8 KV cache support; these will be restored soon.

+ 8 - 0
kernels/backup/attention_dtypes.h

@@ -0,0 +1,8 @@
+#pragma once
+
+#include "attention_generic.cuh"
+#include "dtype_float16.cuh"
+#include "dtype_float32.cuh"
+#include "dtype_bfloat16.cuh"
+#include "dtype_fp8_e5m2.cuh"
+#include "dtype_int8.cuh"

+ 1032 - 0
kernels/backup/attention_kernels.cu

@@ -0,0 +1,1032 @@
+/*
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Copyright (c) 2023, The PygmalionAI team.
+ * Copyright (c) 2023, The vLLM team.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "attention_dtypes.h"
+#include "attention_utils.cuh"
+#include "../quantization/int8_kvcache/quant_utils.cuh"
+#ifdef ENABLE_FP8_E5M2
+#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+#endif
+
+#include <algorithm>
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#else
+#define WARP_SIZE warpSize
+#endif
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+
+enum kv_cache_dtype {
+  AUTO,
+#ifdef ENABLE_FP8_E5M2
+  FP8_E5M2,
+#endif
+  INT8
+};
+
+namespace aphrodite {
+
+// Utility function for attention softmax.
+template<int NUM_WARPS>
+inline __device__ float block_sum(float* red_smem, float sum) {
+  // Decompose the thread index into warp / lane.
+  int warp = threadIdx.x / WARP_SIZE;
+  int lane = threadIdx.x % WARP_SIZE;
+
+  // Compute the sum per warp.
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+    sum += APHRODITE_SHFL_XOR_SYNC(sum, mask);
+  }
+
+  // Warp leaders store the data to shared memory.
+  if (lane == 0) {
+    red_smem[warp] = sum;
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+  // The warps compute the final sums.
+  if (lane < NUM_WARPS) {
+    sum = red_smem[lane];
+  }
+
+  // Parallel reduction inside the warp.
+#pragma unroll
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    sum += APHRODITE_SHFL_XOR_SYNC(sum, mask);
+  }
+
+  // Broadcast to other threads.
+  return APHRODITE_SHFL_SYNC(sum, 0);
+}
+
+// TODO: Merge the last two dimensions of the grid.
+// Grid: (num_heads, num_seqs, max_num_partitions).
+template<
+  typename scalar_t,
+  typename cache_t,
+  int HEAD_SIZE,
+  int BLOCK_SIZE,
+  int NUM_THREADS,
+  kv_cache_dtype KV_CACHE_DTYPE,
+  int PARTITION_SIZE = 0> // Zero means no partitioning.
+__device__ void paged_attention_kernel(
+  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
+  float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
+  scalar_t* __restrict__ out,             // [num_seqs, num_heads, max_num_partitions, head_size]
+  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+  const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+  const int num_kv_heads,                 // [num_heads]
+  const float scale,
+  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_blocks_per_seq,
+  const float* __restrict__ alibi_slopes, // [num_heads]
+  const int q_stride,
+  const int kv_block_stride,
+  const int kv_head_stride,
+  const float k_scale = 1.0f,
+  const float k_zp = 0.0f,
+  const float v_scale = 1.0f,
+  const float v_zp = 0.0f) {
+  const int seq_idx = blockIdx.y;
+  const int partition_idx = blockIdx.z;
+  const int max_num_partitions = gridDim.z;
+  constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
+  const int context_len = context_lens[seq_idx];
+  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+    // No work to do. Terminate the thread block.
+    return;
+  }
+
+  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
+  const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+
+  // [start_block_idx, end_block_idx) is the range of blocks to process.
+  const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+  const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+  const int num_blocks = end_block_idx - start_block_idx;
+
+  // [start_token_idx, end_token_idx) is the range of tokens to process.
+  const int start_token_idx = start_block_idx * BLOCK_SIZE;
+  const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+  const int num_tokens = end_token_idx - start_token_idx;
+
+  constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
+  assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
+  constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int thread_idx = threadIdx.x;
+  const int warp_idx = thread_idx / WARP_SIZE;
+  const int lane = thread_idx % WARP_SIZE;
+
+  const int head_idx = blockIdx.x;
+  const int num_heads = gridDim.x;
+  const int num_queries_per_kv = num_heads / num_kv_heads;
+  const int kv_head_idx = head_idx / num_queries_per_kv;
+  const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+
+  // A vector type to store a part of a key or a query.
+  // The vector size is configured in such a way that the threads in a thread group
+  // fetch or compute 16 bytes at a time.
+  // For example, if the size of a thread group is 4 and the data type is half,
+  // then the vector size is 16 / (4 * sizeof(half)) == 2.
+  constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
+  using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
+  using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
+  using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type;
+
+  constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
+  constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
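+  // Worked example (illustrative): for half precision with BLOCK_SIZE = 8,
+  // THREAD_GROUP_SIZE = 32 / 8 = 4 and VEC_SIZE = 16 / (4 * 2) = 2; with
+  // HEAD_SIZE = 128 each thread then handles NUM_ELEMS_PER_THREAD = 32 elements,
+  // i.e. NUM_VECS_PER_THREAD = 16 vectors.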
+
+  const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE;
+  const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE;
+
+  // Load the query to registers.
+  // Each thread in a thread group has a different part of the query.
+  // For example, if the thread group size is 4, then the first thread in the group
+  // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
+  // th vectors of the query, and so on.
+  // NOTE: Because q is split from a qkv tensor, it may not be contiguous.
+  const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
+  __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
+#pragma unroll
+  for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
+    const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
+    q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
+  }
+  __syncthreads(); // TODO: possible speedup if this is replaced with a memory wall right before we use q_vecs
+
+  // Memory planning.
+  extern __shared__ char shared_mem[];
+  // NOTE: We use FP32 for the softmax logits for better accuracy.
+  float* logits = reinterpret_cast<float*>(shared_mem);
+  // Workspace for reduction.
+  __shared__ float red_smem[2 * NUM_WARPS];
+
+  // x == THREAD_GROUP_SIZE * VEC_SIZE
+  // Each thread group fetches x elements from the key at a time.
+  constexpr int x = 16 / sizeof(cache_t);
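+  // Illustrative values: x == 8 for 16-bit caches (uint16_t, __nv_bfloat16),
+  // x == 16 for 8-bit caches (int8 / fp8_e5m2) and x == 4 for float.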
+  float qk_max = -FLT_MAX;
+
+  // Iterate over the key blocks.
+  // Each warp fetches a block of keys for each iteration.
+  // Each thread group in a warp fetches a key from the block, and computes
+  // dot product with the query.
+  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+    // NOTE: The block number is stored in int32. However, we cast it to int64
+    // because int32 can lead to overflow when this variable is multiplied by large numbers
+    // (e.g., kv_block_stride).
+    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
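+    // Illustrative overflow case: with 8 KV heads, head_size 128 and block_size 16,
+    // kv_block_stride is 8 * 128 * 16 = 16384 elements, so a 32-bit product would
+    // already overflow for block numbers >= 131072.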
+
+    // Load a key to registers.
+    // Each thread in a thread group has a different part of the key.
+    // For example, if the thread group size is 4, then the first thread in the group
+    // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
+    // vectors of the key, and so on.
+    for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+      const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+      const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+      K_vec k_vecs[NUM_VECS_PER_THREAD];
+
+#pragma unroll
+      for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
+        const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+                                       + kv_head_idx * kv_head_stride
+                                       + physical_block_offset * x;
+        const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
+        const int offset1 = (vec_idx * VEC_SIZE) / x;
+        const int offset2 = (vec_idx * VEC_SIZE) % x;
+        if constexpr (KV_CACHE_DTYPE == INT8) {
+          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+          using Dequant_vec = typename FloatVec<Quant_vec>::Type;
+          Dequant_vec k_vec_dequant = int8::dequant(k_vec_quant, k_scale, k_zp);
+          k_vecs[j] = int8::vec_conversion<K_vec, Dequant_vec>(k_vec_dequant);
+#ifdef ENABLE_FP8_E5M2
+        } else if constexpr (KV_CACHE_DTYPE == FP8_E5M2) {
+          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+          // Vector conversion from Quant_vec to K_vec.
+          k_vecs[j] = fp8_e5m2_unscaled::vec_conversion<K_vec, Quant_vec>(k_vec_quant);
+#endif
+        } else {
+          k_vecs[j] = *reinterpret_cast<const K_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+        }
+      }
+
+      // Compute dot product.
+      // This includes a reduction across the threads in the same thread group.
+      float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
+      // Add the ALiBi bias if slopes are given.
+      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
+
+      if (thread_group_offset == 0) {
+        // Store the partial reductions to shared memory.
+        // NOTE: It is required to zero out the masked logits.
+        const bool mask = token_idx >= context_len;
+        logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+        // Update the max value.
+        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+      }
+    }
+  }
+
+  // Perform reduction across the threads in the same warp to get the
+  // max qk value for each "warp" (not across the thread block yet).
+  // The 0-th thread of each thread group already has its max qk value.
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+    qk_max = fmaxf(qk_max, APHRODITE_SHFL_XOR_SYNC(qk_max, mask));
+  }
+  if (lane == 0) {
+    red_smem[warp_idx] = qk_max;
+  }
+  __syncthreads();
+
+  // TODO: Refactor this part.
+  // Get the max qk value for the sequence.
+  qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
+#pragma unroll
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    qk_max = fmaxf(qk_max, APHRODITE_SHFL_XOR_SYNC(qk_max, mask));
+  }
+  // Broadcast the max qk value to all threads.
+  qk_max = APHRODITE_SHFL_SYNC(qk_max, 0);
+
+  // Get the sum of the exp values.
+  float exp_sum = 0.f;
+  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+    float val = __expf(logits[i] - qk_max);
+    logits[i] = val;
+    exp_sum += val;
+  }
+  exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
+
+  // Compute softmax.
+  const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+    logits[i] *= inv_sum;
+  }
+  __syncthreads();
+
+  // If partitioning is enabled, store the max logit and exp_sum.
+  if (USE_PARTITIONING && thread_idx == 0) {
+    float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+                                       + head_idx * max_num_partitions
+                                       + partition_idx;
+    *max_logits_ptr = qk_max;
+    float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+                                   + head_idx * max_num_partitions
+                                   + partition_idx;
+    *exp_sums_ptr = exp_sum;
+  }
+
+  // Each thread will fetch 16 bytes from the value cache at a time.
+  constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
+  using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
+  using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
+  using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
+  using Float_L_vec = typename FloatVec<L_vec>::Type;
+
+  constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
+  constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
+  constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
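+  // Worked example (illustrative): for half precision with BLOCK_SIZE = 16,
+  // V_VEC_SIZE = MIN(16 / 2, 16) = 8, NUM_V_VECS_PER_ROW = 2,
+  // NUM_ROWS_PER_ITER = 32 / 2 = 16 and, for HEAD_SIZE = 128,
+  // NUM_ROWS_PER_THREAD = 8.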
+
+  // NOTE: We use FP32 for the accumulator for better accuracy.
+  float accs[NUM_ROWS_PER_THREAD];
+#pragma unroll
+  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    accs[i] = 0.f;
+  }
+
+  scalar_t zero_value;
+  zero(zero_value);
+  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+    // NOTE: The block number is stored in int32. However, we cast it to int64
+    // because int32 can lead to overflow when this variable is multiplied by large numbers
+    // (e.g., kv_block_stride).
+    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
+    const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
+    const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+    L_vec logits_vec;
+    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx));
+
+    const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
+                                   + kv_head_idx * kv_head_stride;
+#pragma unroll
+    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+      if (row_idx < HEAD_SIZE) {
+        const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
+        V_vec v_vec;
+        if constexpr (KV_CACHE_DTYPE == INT8) {
+          // dequant and conversion
+          V_quant_vec v_vec_quant = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
+          using V_dequant_vec = typename FloatVec<V_quant_vec>::Type;
+          V_dequant_vec v_vec_dequant = int8::dequant(v_vec_quant, v_scale, v_zp);
+          v_vec = int8::vec_conversion<V_vec, V_dequant_vec>(v_vec_dequant);
+#ifdef ENABLE_FP8_E5M2
+        } else if constexpr (KV_CACHE_DTYPE == FP8_E5M2) {
+          V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
+          // Vector conversion from V_quant_vec to V_vec.
+          v_vec = fp8_e5m2_unscaled::vec_conversion<V_vec, V_quant_vec>(v_quant_vec);
+#endif
+        } else {
+          v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
+        }
+        if (block_idx == num_context_blocks - 1) {
+          // NOTE: When v_vec contains the tokens that are out of the context,
+          // we should explicitly zero out the values since they may contain NaNs.
+          scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
+#pragma unroll
+          for (int j = 0; j < V_VEC_SIZE; j++) {
+            v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
+          }
+        }
+        accs[i] += dot(logits_vec, v_vec);
+      }
+    }
+  }
+
+  // Perform reduction within each warp.
+#pragma unroll
+  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    float acc = accs[i];
+#pragma unroll
+    for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+      acc += APHRODITE_SHFL_XOR_SYNC(acc, mask);
+    }
+    accs[i] = acc;
+  }
+
+  // NOTE: A barrier is required because the shared memory space for logits
+  // is reused for the output.
+  __syncthreads();
+
+  // Perform reduction across warps.
+  float* out_smem = reinterpret_cast<float*>(shared_mem);
+#pragma unroll
+  for (int i = NUM_WARPS; i > 1; i /= 2) {
+    int mid = i / 2;
+    // Upper warps write to shared memory.
+    if (warp_idx >= mid && warp_idx < i) {
+      float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+#pragma unroll
+      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+          dst[row_idx] = accs[i];
+        }
+      }
+    }
+    __syncthreads();
+
+    // Lower warps update the output.
+    if (warp_idx < mid) {
+      const float* src = &out_smem[warp_idx * HEAD_SIZE];
+#pragma unroll
+      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+          accs[i] += src[row_idx];
+        }
+      }
+    }
+    __syncthreads();
+  }
+
+  // Write the final output.
+  if (warp_idx == 0) {
+    scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                            + head_idx * max_num_partitions * HEAD_SIZE
+                            + partition_idx * HEAD_SIZE;
+#pragma unroll
+    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+      if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+        from_float(*(out_ptr + row_idx), accs[i]);
+      }
+    }
+  }
+}
+
+// Grid: (num_heads, num_seqs, 1).
+template<
+  typename scalar_t,
+  typename cache_t,
+  int HEAD_SIZE,
+  int BLOCK_SIZE,
+  int NUM_THREADS,
+  kv_cache_dtype KV_CACHE_DTYPE>
+__global__ void paged_attention_v1_kernel(
+  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+  const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+  const int num_kv_heads,                 // [num_heads]
+  const float scale,
+  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_blocks_per_seq,
+  const float* __restrict__ alibi_slopes, // [num_heads]
+  const int q_stride,
+  const int kv_block_stride,
+  const int kv_head_stride,
+  const float k_scale,
+  const float k_zp,
+  const float v_scale,
+  const float v_zp) {
+  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_CACHE_DTYPE>(
+    /* exp_sums */ nullptr, /* max_logits */ nullptr,
+    out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
+    max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, k_scale, k_zp, v_scale, v_zp);
+}
+
+// Grid: (num_heads, num_seqs, max_num_partitions).
+template<
+  typename scalar_t,
+  typename cache_t,
+  int HEAD_SIZE,
+  int BLOCK_SIZE,
+  int NUM_THREADS,
+  kv_cache_dtype KV_CACHE_DTYPE,
+  int PARTITION_SIZE>
+__global__ void paged_attention_v2_kernel(
+  float* __restrict__ exp_sums,           // [num_seqs, num_heads, max_num_partitions]
+  float* __restrict__ max_logits,         // [num_seqs, num_heads, max_num_partitions]
+  scalar_t* __restrict__ tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
+  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+  const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+  const int num_kv_heads,                 // [num_heads]
+  const float scale,
+  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_blocks_per_seq,
+  const float* __restrict__ alibi_slopes, // [num_heads]
+  const int q_stride,
+  const int kv_block_stride,
+  const int kv_head_stride,
+  const float k_scale,
+  const float k_zp,
+  const float v_scale,
+  const float v_zp) {
+  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_CACHE_DTYPE, PARTITION_SIZE>(
+    exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
+    block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
+    q_stride, kv_block_stride, kv_head_stride, k_scale, k_zp, v_scale, v_zp);
+}
+
+// Grid: (num_heads, num_seqs).
+template<
+  typename scalar_t,
+  int HEAD_SIZE,
+  int NUM_THREADS,
+  int PARTITION_SIZE>
+__global__ void paged_attention_v2_reduce_kernel(
+  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
+  const float* __restrict__ exp_sums,     // [num_seqs, num_heads, max_num_partitions]
+  const float* __restrict__ max_logits,   // [num_seqs, num_heads, max_num_partitions]
+  const scalar_t* __restrict__ tmp_out,   // [num_seqs, num_heads, max_num_partitions, head_size]
+  const int* __restrict__ context_lens,   // [num_seqs]
+  const int max_num_partitions) {
+  const int num_heads = gridDim.x;
+  const int head_idx = blockIdx.x;
+  const int seq_idx = blockIdx.y;
+  const int context_len = context_lens[seq_idx];
+  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
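+  // Illustrative: with PARTITION_SIZE = 512, a context of 1500 tokens is split
+  // into 3 partitions; the fast path below only applies when the whole context
+  // fits into a single partition (context_len <= 512).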
+  if (num_partitions == 1) {
+    // No need to reduce. Only copy tmp_out to out.
+    scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+    const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                                          + head_idx * max_num_partitions * HEAD_SIZE;
+    for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
+      out_ptr[i] = tmp_out_ptr[i];
+    }
+    // Terminate the thread block.
+    return;
+  }
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int warp_idx = threadIdx.x / WARP_SIZE;
+  const int lane = threadIdx.x % WARP_SIZE;
+
+  // Size: 2 * num_partitions.
+  extern __shared__ char shared_mem[];
+  // Workspace for reduction.
+  __shared__ float red_smem[2 * NUM_WARPS];
+
+  // Load max logits to shared memory.
+  float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
+  const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
+                                           + head_idx * max_num_partitions;
+  float max_logit = -FLT_MAX;
+  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+    const float l = max_logits_ptr[i];
+    shared_max_logits[i] = l;
+    max_logit = fmaxf(max_logit, l);
+  }
+  __syncthreads();
+
+  // Get the global max logit.
+  // Reduce within the warp.
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+    max_logit = fmaxf(max_logit, APHRODITE_SHFL_XOR_SYNC(max_logit, mask));
+  }
+  if (lane == 0) {
+    red_smem[warp_idx] = max_logit;
+  }
+  __syncthreads();
+  // Reduce across warps.
+  max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
+#pragma unroll
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    max_logit = fmaxf(max_logit, APHRODITE_SHFL_XOR_SYNC(max_logit, mask));
+  }
+  // Broadcast the max value to all threads.
+  max_logit = APHRODITE_SHFL_SYNC(max_logit, 0);
+
+  // Load rescaled exp sums to shared memory.
+  float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
+  const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
+                                       + head_idx * max_num_partitions;
+  float global_exp_sum = 0.0f;
+  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+    float l = shared_max_logits[i];
+    float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit);
+    global_exp_sum += rescaled_exp_sum;
+    shared_exp_sums[i] = rescaled_exp_sum;
+  }
+  __syncthreads();
+  global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum);
+  const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
+
+  // Aggregate tmp_out to out.
+  const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                                        + head_idx * max_num_partitions * HEAD_SIZE;
+  scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+#pragma unroll
+  for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
+    float acc = 0.0f;
+    for (int j = 0; j < num_partitions; ++j) {
+      acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
+    }
+    from_float(out_ptr[i], acc);
+  }
+}
+
+} // namespace aphrodite
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                                        \
+  APHRODITE_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                                        \
+    ((void*)aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,    \
+      KV_CACHE_DTYPE>), shared_mem_size);                                                           \
+  aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,              \
+  KV_CACHE_DTYPE><<<grid, block, shared_mem_size, stream>>>(                                        \
+    out_ptr,                                                                                        \
+    query_ptr,                                                                                      \
+    key_cache_ptr,                                                                                  \
+    value_cache_ptr,                                                                                \
+    num_kv_heads,                                                                                   \
+    scale,                                                                                          \
+    block_tables_ptr,                                                                               \
+    context_lens_ptr,                                                                               \
+    max_num_blocks_per_seq,                                                                         \
+    alibi_slopes_ptr,                                                                               \
+    q_stride,                                                                                       \
+    kv_block_stride,                                                                                \
+    kv_head_stride,                                                                                 \
+    k_scale,                                                                                        \
+    k_zp,                                                                                           \
+    v_scale,                                                                                        \
+    v_zp);
+
+// TODO: Tune NUM_THREADS.
+template<
+  typename T,
+  typename CACHE_T,
+  int BLOCK_SIZE,
+  kv_cache_dtype KV_CACHE_DTYPE,
+  int NUM_THREADS = 128>
+void paged_attention_v1_launcher(
+  torch::Tensor& out,
+  torch::Tensor& query,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  int num_kv_heads,
+  float scale,
+  torch::Tensor& block_tables,
+  torch::Tensor& context_lens,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes,
+  const float k_scale,
+  const float k_zp,
+  const float v_scale,
+  const float v_zp) {
+  int num_seqs = query.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+
+  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  assert(head_size % thread_group_size == 0);
+
+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr = alibi_slopes ?
+    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+    : nullptr;
+
+  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* context_lens_ptr = context_lens.data_ptr<int>();
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
+  int logits_size = padded_max_context_len * sizeof(float);
+  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+  // Python-side check in aphrodite.task_handler.worker._check_if_can_support_max_seq_len
+  // Keep that in sync with the logic here!
+  int shared_mem_size = std::max(logits_size, outputs_size);
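+  // Worked example (illustrative): max_context_len = 2048 and BLOCK_SIZE = 16
+  // give padded_max_context_len = 2048 and logits_size = 8192 bytes; with
+  // NUM_THREADS = 128 and head_size = 128, outputs_size = 2 * 128 * 4 = 1024 bytes,
+  // so shared_mem_size = 8192 bytes.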
+
+  dim3 grid(num_heads, num_seqs, 1);
+  dim3 block(NUM_THREADS);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  switch (head_size) {
+    // NOTE: To reduce the compilation time, we only compile for the
+    // head sizes that we use in the model. However, we can easily extend this
+    // to support any head size which is a multiple of 16.
+    case 64:
+      LAUNCH_PAGED_ATTENTION_V1(64);
+      break;
+    case 80:
+      LAUNCH_PAGED_ATTENTION_V1(80);
+      break;
+    case 96:
+      LAUNCH_PAGED_ATTENTION_V1(96);
+      break;
+    case 112:
+      LAUNCH_PAGED_ATTENTION_V1(112);
+      break;
+    case 128:
+      LAUNCH_PAGED_ATTENTION_V1(128);
+      break;
+    case 256:
+      LAUNCH_PAGED_ATTENTION_V1(256);
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported head size: ", head_size);
+      break;
+  }
+}
+
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE)             \
+  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE>(       \
+    out,                                                                     \
+    query,                                                                   \
+    key_cache,                                                               \
+    value_cache,                                                             \
+    num_kv_heads,                                                            \
+    scale,                                                                   \
+    block_tables,                                                            \
+    context_lens,                                                            \
+    max_context_len,                                                         \
+    alibi_slopes,                                                            \
+    k_scale,                                                                 \
+    k_zp,                                                                    \
+    v_scale,                                                                 \
+    v_zp);
+
+// NOTE: To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_CACHE_DTYPE)       \
+  switch (block_size) {                                               \
+    case 8:                                                           \
+      CALL_V1_LAUNCHER(T, CACHE_T, 8, KV_CACHE_DTYPE);                \
+      break;                                                          \
+    case 16:                                                          \
+      CALL_V1_LAUNCHER(T, CACHE_T, 16, KV_CACHE_DTYPE);               \
+      break;                                                          \
+    case 32:                                                          \
+      CALL_V1_LAUNCHER(T, CACHE_T, 32, KV_CACHE_DTYPE);               \
+      break;                                                          \
+    default:                                                          \
+      TORCH_CHECK(false, "Unsupported block size: ", block_size);     \
+      break;                                                          \
+  }
+
+void paged_attention_v1(
+  torch::Tensor& out,             // [num_seqs, num_heads, head_size]
+  torch::Tensor& query,           // [num_seqs, num_heads, head_size]
+  torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
+  int num_kv_heads,               // [num_heads]
+  float scale,
+  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
+  torch::Tensor& context_lens,    // [num_seqs]
+  int block_size,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes,
+  const std::string& kv_cache_dtype,
+  const float k_scale = 1.0f,
+  const float k_zp = 0.0f,
+  const float v_scale = 1.0f,
+  const float v_zp = 0.0f) {
+  if (kv_cache_dtype == "auto") {
+    if (query.dtype() == at::ScalarType::Float) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, AUTO);
+    } else if (query.dtype() == at::ScalarType::Half) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, AUTO);
+    } else if (query.dtype() == at::ScalarType::BFloat16) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, AUTO);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+    }
+#ifdef ENABLE_FP8_E5M2
+  } else if (kv_cache_dtype == "fp8_e5m2") {
+    if (query.dtype() == at::ScalarType::Float) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, FP8_E5M2);
+    } else if (query.dtype() == at::ScalarType::Half) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, FP8_E5M2);
+    } else if (query.dtype() == at::ScalarType::BFloat16) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, FP8_E5M2);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+    }
+#endif
+  } else if (kv_cache_dtype == "int8") {
+    if (query.dtype() == at::ScalarType::Float) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(float, int8_t, INT8);
+    } else if (query.dtype() == at::ScalarType::Half) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, int8_t, INT8);
+    } else if (query.dtype() == at::ScalarType::BFloat16) {
+      CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, int8_t, INT8);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+    }
+  } else {
+    TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+  }
+}
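+
+// Hypothetical call sketch (allocation of the tensors is omitted and all values
+// are only examples, not defaults from this change):
+//   paged_attention_v1(out, query, key_cache, value_cache,
+//                      /*num_kv_heads=*/8, /*scale=*/1.0f / std::sqrt(128.0f),
+//                      block_tables, context_lens,
+//                      /*block_size=*/16, /*max_context_len=*/2048,
+//                      alibi_slopes, /*kv_cache_dtype=*/"int8",
+//                      /*k_scale=*/1.0f, /*k_zp=*/0.0f,
+//                      /*v_scale=*/1.0f, /*v_zp=*/0.0f);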
+
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                                  \
+  aphrodite::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,        \
+  KV_CACHE_DTYPE, PARTITION_SIZE>                                                             \
+  <<<grid, block, shared_mem_size, stream>>>(                                                 \
+    exp_sums_ptr,                                                                             \
+    max_logits_ptr,                                                                           \
+    tmp_out_ptr,                                                                              \
+    query_ptr,                                                                                \
+    key_cache_ptr,                                                                            \
+    value_cache_ptr,                                                                          \
+    num_kv_heads,                                                                             \
+    scale,                                                                                    \
+    block_tables_ptr,                                                                         \
+    context_lens_ptr,                                                                         \
+    max_num_blocks_per_seq,                                                                   \
+    alibi_slopes_ptr,                                                                         \
+    q_stride,                                                                                 \
+    kv_block_stride,                                                                          \
+    kv_head_stride,                                                                           \
+    k_scale,                                                                                  \
+    k_zp,                                                                                     \
+    v_scale,                                                                                  \
+    v_zp);                                                                                    \
+  aphrodite::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE>           \
+  <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(                                   \
+    out_ptr,                                                                                  \
+    exp_sums_ptr,                                                                             \
+    max_logits_ptr,                                                                           \
+    tmp_out_ptr,                                                                              \
+    context_lens_ptr,                                                                         \
+    max_num_partitions);
+
+template<
+  typename T,
+  typename CACHE_T,
+  int BLOCK_SIZE,
+  kv_cache_dtype KV_CACHE_DTYPE,
+  int NUM_THREADS = 128,
+  int PARTITION_SIZE = 512>
+void paged_attention_v2_launcher(
+  torch::Tensor& out,
+  torch::Tensor& exp_sums,
+  torch::Tensor& max_logits,
+  torch::Tensor& tmp_out,
+  torch::Tensor& query,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  int num_kv_heads,
+  float scale,
+  torch::Tensor& block_tables,
+  torch::Tensor& context_lens,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes,
+  const float k_scale,
+  const float k_zp,
+  const float v_scale,
+  const float v_zp) {
+  int num_seqs = query.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+
+  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  assert(head_size % thread_group_size == 0);
+
+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr = alibi_slopes ?
+    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+    : nullptr;
+
+  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
+  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
+  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* context_lens_ptr = context_lens.data_ptr<int>();
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+  int logits_size = PARTITION_SIZE * sizeof(float);
+  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+
+  // For paged attention v2 kernel.
+  dim3 grid(num_heads, num_seqs, max_num_partitions);
+  int shared_mem_size = std::max(logits_size, outputs_size);
+  // For paged attention v2 reduce kernel.
+  dim3 reduce_grid(num_heads, num_seqs);
+  int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
+
+  dim3 block(NUM_THREADS);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  switch (head_size) {
+    // NOTE: To reduce the compilation time, we only compile for the
+    // head sizes that we use in the model. However, we can easily extend this
+    // to support any head size which is a multiple of 16.
+    case 64:
+      LAUNCH_PAGED_ATTENTION_V2(64);
+      break;
+    case 80:
+      LAUNCH_PAGED_ATTENTION_V2(80);
+      break;
+    case 96:
+      LAUNCH_PAGED_ATTENTION_V2(96);
+      break;
+    case 112:
+      LAUNCH_PAGED_ATTENTION_V2(112);
+      break;
+    case 128:
+      LAUNCH_PAGED_ATTENTION_V2(128);
+      break;
+    case 256:
+      LAUNCH_PAGED_ATTENTION_V2(256);
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported head size: ", head_size);
+      break;
+  }
+}
+
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE)                 \
+  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_CACHE_DTYPE>(           \
+    out,                                                                         \
+    exp_sums,                                                                    \
+    max_logits,                                                                  \
+    tmp_out,                                                                     \
+    query,                                                                       \
+    key_cache,                                                                   \
+    value_cache,                                                                 \
+    num_kv_heads,                                                                \
+    scale,                                                                       \
+    block_tables,                                                                \
+    context_lens,                                                                \
+    max_context_len,                                                             \
+    alibi_slopes,                                                                \
+    k_scale,                                                                     \
+    k_zp,                                                                        \
+    v_scale,                                                                     \
+    v_zp);
+
+// NOTE: To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_CACHE_DTYPE)             \
+  switch (block_size) {                                                     \
+    case 8:                                                                 \
+      CALL_V2_LAUNCHER(T, CACHE_T, 8, KV_CACHE_DTYPE);                      \
+      break;                                                                \
+    case 16:                                                                \
+      CALL_V2_LAUNCHER(T, CACHE_T, 16, KV_CACHE_DTYPE);                     \
+      break;                                                                \
+    case 32:                                                                \
+      CALL_V2_LAUNCHER(T, CACHE_T, 32, KV_CACHE_DTYPE);                     \
+      break;                                                                \
+    default:                                                                \
+      TORCH_CHECK(false, "Unsupported block size: ", block_size);           \
+      break;                                                                \
+  }
+
+void paged_attention_v2(
+  torch::Tensor& out,             // [num_seqs, num_heads, head_size]
+  torch::Tensor& exp_sums,        // [num_seqs, num_heads, max_num_partitions]
+  torch::Tensor& max_logits,      // [num_seqs, num_heads, max_num_partitions]
+  torch::Tensor& tmp_out,         // [num_seqs, num_heads, max_num_partitions, head_size]
+  torch::Tensor& query,           // [num_seqs, num_heads, head_size]
+  torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
+  int num_kv_heads,               // number of KV heads (<= num_heads under GQA/MQA)
+  float scale,
+  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
+  torch::Tensor& context_lens,    // [num_seqs]
+  int block_size,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes,
+  const std::string& kv_cache_dtype,
+  const float k_scale = 1.0f,
+  const float k_zp = 0.0f,
+  const float v_scale = 1.0f,
+  const float v_zp = 0.0f) {
+  if (kv_cache_dtype == "auto") {
+    if (query.dtype() == at::ScalarType::Float) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, AUTO);
+    } else if (query.dtype() == at::ScalarType::Half) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, AUTO);
+    } else if (query.dtype() == at::ScalarType::BFloat16) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, AUTO);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+    }
+#ifdef ENABLE_FP8_E5M2
+  } else if (kv_cache_dtype == "fp8_e5m2") {
+    if (query.dtype() == at::ScalarType::Float) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, FP8_E5M2);
+    } else if (query.dtype() == at::ScalarType::Half) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, FP8_E5M2);
+    } else if (query.dtype() == at::ScalarType::BFloat16) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, FP8_E5M2);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+    }
+#endif
+  } else if (kv_cache_dtype == "int8") {
+    if (query.dtype() == at::ScalarType::Float) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(float, int8_t, INT8);
+    } else if (query.dtype() == at::ScalarType::Half) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, int8_t, INT8);
+    } else if (query.dtype() == at::ScalarType::BFloat16) {
+      CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, int8_t, INT8);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
+    }
+  } else {
+    TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+  }
+}
+
+#undef WARP_SIZE
+#undef MAX
+#undef MIN
+#undef DIVIDE_ROUND_UP
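
For orientation, here is a minimal host-side sketch of how a caller would size the v2 scratch tensors before invoking paged_attention_v2 as defined above, assuming the default PARTITION_SIZE of 512; the wrapper name and the "auto" cache dtype are illustrative and not part of this diff.

#include <torch/extension.h>

// Hypothetical helper: allocates the scratch buffers whose shapes the v2
// interface comments describe, then dispatches the partitioned kernel.
torch::Tensor paged_attention_v2_sketch(
    torch::Tensor query,         // [num_seqs, num_heads, head_size]
    torch::Tensor key_cache,     // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor value_cache,   // [num_blocks, num_heads, head_size, block_size]
    torch::Tensor block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor context_lens,  // [num_seqs]
    int num_kv_heads, float scale, int block_size, int max_context_len) {
  const int num_seqs = query.size(0);
  const int num_heads = query.size(1);
  const int head_size = query.size(2);
  constexpr int PARTITION_SIZE = 512;  // must match the launcher's template default
  const int max_num_partitions =
      (max_context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;

  auto opts = query.options();
  torch::Tensor out = torch::empty({num_seqs, num_heads, head_size}, opts);
  // Per-partition softmax statistics, combined by the v2 reduce kernel.
  torch::Tensor exp_sums = torch::empty(
      {num_seqs, num_heads, max_num_partitions}, opts.dtype(torch::kFloat32));
  torch::Tensor max_logits = torch::empty_like(exp_sums);
  // Per-partition outputs, reduced into `out`.
  torch::Tensor tmp_out = torch::empty(
      {num_seqs, num_heads, max_num_partitions, head_size}, opts);

  paged_attention_v2(out, exp_sums, max_logits, tmp_out, query, key_cache,
                     value_cache, num_kv_heads, scale, block_tables,
                     context_lens, block_size, max_context_len,
                     /*alibi_slopes=*/c10::nullopt, /*kv_cache_dtype=*/"auto");
  return out;
}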

+ 39 - 0
kernels/backup/cache.h

@@ -0,0 +1,39 @@
+#pragma once
+
+#include <torch/extension.h>
+
+#include <map>
+#include <vector>
+
+void swap_blocks(
+  torch::Tensor& src,
+  torch::Tensor& dst,
+  const std::map<int64_t, int64_t>& block_mapping);
+
+void copy_blocks(
+  std::vector<torch::Tensor>& key_caches,
+  std::vector<torch::Tensor>& value_caches,
+  const std::map<int64_t, std::vector<int64_t>>& block_mapping);
+
+void reshape_and_cache(
+  torch::Tensor& key,           // [num_tokens, num_heads, head_size]
+  torch::Tensor& value,         // [num_tokens, num_heads, head_size]
+  torch::Tensor& key_cache,     // [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,   // [num_blocks, num_heads, head_size, block_size]
+  torch::Tensor& slot_mapping,  // [num_tokens]
+  const std::string& kv_cache_dtype,
+  const float k_scale = 1.0f,
+  const float k_zp = 0.0f,
+  const float v_scale = 1.0f,
+  const float v_zp = 0.0f);
+
+void gather_cached_kv(
+  torch::Tensor& key,
+  torch::Tensor& value,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  torch::Tensor& slot_mapping);
+
+void convert_fp8_e5m2(
+  torch::Tensor& src_cache,
+  torch::Tensor& dst_cache);

+ 512 - 0
kernels/backup/cache_kernels.cu

@@ -0,0 +1,512 @@
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+#include "quantization/int8_kvcache/quant_utils.cuh"
+#ifdef ENABLE_FP8_E5M2
+#include "quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <vector>
+
+enum kv_cache_dtype {
+  AUTO,
+#ifdef ENABLE_FP8_E5M2
+  FP8_E5M2,
+#endif
+  INT8};
+
+#ifdef USE_ROCM
+  #include <hip/hip_bf16.h>
+  typedef __hip_bfloat16 __nv_bfloat16;
+#endif
+
+void swap_blocks(
+  torch::Tensor& src,
+  torch::Tensor& dst,
+  const std::map<int64_t, int64_t>& block_mapping) {
+  torch::Device src_device = src.device();
+  torch::Device dst_device = dst.device();
+  cudaMemcpyKind memcpy_type;
+  if (src_device.is_cuda() && dst_device.is_cuda()) {
+    TORCH_CHECK(
+      src_device.index() == dst_device.index(),
+      "src and dst must be on the same GPU");
+    memcpy_type = cudaMemcpyDeviceToDevice;
+  } else if (src_device.is_cuda() && dst_device.is_cpu()) {
+    memcpy_type = cudaMemcpyDeviceToHost;
+  } else if (src_device.is_cpu() && dst_device.is_cuda()) {
+    memcpy_type = cudaMemcpyHostToDevice;
+  } else {
+    TORCH_CHECK(false, "Invalid device combination");
+  }
+
+  char *src_ptr = static_cast<char*>(src.data_ptr());
+  char *dst_ptr = static_cast<char*>(dst.data_ptr());
+
+  const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
+  const at::cuda::OptionalCUDAGuard device_guard(src_device.is_cuda() ? src_device : dst_device);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  // NOTE: This can be slow if the number of blocks is large.
+  for (const auto& pair : block_mapping) {
+    int64_t src_block_number = pair.first;
+    int64_t dst_block_number = pair.second;
+    int64_t src_offset = src_block_number * block_size_in_bytes;
+    int64_t dst_offset = dst_block_number * block_size_in_bytes;
+    cudaMemcpyAsync(
+      dst_ptr + dst_offset,
+      src_ptr + src_offset,
+      block_size_in_bytes,
+      memcpy_type,
+      stream);
+  }
+}
+
+namespace aphrodite {
+
+// Grid: (num_layers, num_pairs)
+template<typename scalar_t>
+__global__ void copy_blocks_kernel(
+  int64_t* key_cache_ptrs,
+  int64_t* value_cache_ptrs,
+  const int64_t* __restrict__ block_mapping,
+  const int numel_per_block) {
+  const int layer_idx = blockIdx.x;
+  const int pair_idx = blockIdx.y;
+
+  scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]);
+  scalar_t* value_cache = reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
+  int64_t src_block_number = block_mapping[2 * pair_idx];
+  int64_t dst_block_number = block_mapping[2 * pair_idx + 1];
+
+  const int64_t src_block_offset = src_block_number * numel_per_block;
+  const int64_t dst_block_offset = dst_block_number * numel_per_block;
+  for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
+    int64_t src_offset = src_block_offset + i;
+    int64_t dst_offset = dst_block_offset + i;
+    key_cache[dst_offset] = key_cache[src_offset];
+  }
+  for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) {
+    int64_t src_offset = src_block_offset + i;
+    int64_t dst_offset = dst_block_offset + i;
+    value_cache[dst_offset] = value_cache[src_offset];
+  }
+}
+
+} // namespace aphrodite
+
+void copy_blocks(
+  std::vector<torch::Tensor>& key_caches,
+  std::vector<torch::Tensor>& value_caches,
+  const std::map<int64_t, std::vector<int64_t>>& block_mapping) {
+  int num_layers = key_caches.size();
+  TORCH_CHECK(num_layers == value_caches.size());
+  if (num_layers == 0) {
+    return;
+  }
+  torch::Device cache_device = key_caches[0].device();
+  TORCH_CHECK(cache_device.is_cuda());
+
+  // Create data structures for the kernel.
+  // Create an array of pointers to the key and value caches.
+  int64_t key_cache_ptrs[num_layers];
+  int64_t value_cache_ptrs[num_layers];
+  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
+    key_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
+    value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
+  }
+  // Create block mapping array.
+  std::vector<int64_t> block_mapping_vec;
+  for (const auto& pair : block_mapping) {
+    int64_t src_block_number = pair.first;
+    for (int64_t dst_block_number : pair.second) {
+      block_mapping_vec.push_back(src_block_number);
+      block_mapping_vec.push_back(dst_block_number);
+    }
+  }
+  int64_t* block_mapping_array = block_mapping_vec.data();
+  int num_pairs = block_mapping_vec.size() / 2;
+
+  // Move the data structures to the GPU.
+  // NOTE: This synchronizes the CPU and GPU.
+  torch::Tensor key_cache_ptrs_tensor = torch::from_blob(
+    key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
+  torch::Tensor value_cache_ptrs_tensor = torch::from_blob(
+    value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
+  torch::Tensor block_mapping_tensor = torch::from_blob(
+    block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device);
+
+  // Launch the kernel.
+  const int numel_per_block = key_caches[0][0].numel();
+  dim3 grid(num_layers, num_pairs);
+  dim3 block(std::min(1024, numel_per_block));
+  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  APHRODITE_DISPATCH_FLOATING_AND_BYTE_TYPES(
+    key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
+      aphrodite::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        key_cache_ptrs_tensor.data_ptr<int64_t>(),
+        value_cache_ptrs_tensor.data_ptr<int64_t>(),
+        block_mapping_tensor.data_ptr<int64_t>(),
+        numel_per_block);
+    }));
+}
+
+namespace aphrodite {
+
+template<typename scalar_t, typename cache_t, kv_cache_dtype KV_CACHE_DTYPE>
+__global__ void reshape_and_cache_kernel(
+  const scalar_t* __restrict__ key,           // [num_tokens, num_heads, head_size]
+  const scalar_t* __restrict__ value,         // [num_tokens, num_heads, head_size]
+  cache_t* __restrict__ key_cache,            // [num_blocks, num_heads, head_size/x, block_size, x]
+  cache_t* __restrict__ value_cache,          // [num_blocks, num_heads, head_size, block_size]
+  const int64_t* __restrict__ slot_mapping,   // [num_tokens]
+  const int key_stride,
+  const int value_stride,
+  const int num_heads,
+  const int head_size,
+  const int block_size,
+  const int x,
+  const float k_scale,
+  const float k_zp,
+  const float v_scale,
+  const float v_zp) {
+  const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  if (slot_idx < 0) {
+    // Padding token that should be ignored.
+    return;
+  }
+
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;
+
+  const int n = num_heads * head_size;
+  for (int i = threadIdx.x; i < n; i += blockDim.x) {
+    const int64_t src_key_idx = token_idx * key_stride + i;
+    const int64_t src_value_idx = token_idx * value_stride + i;
+
+    const int head_idx = i / head_size;
+    const int head_offset = i % head_size;
+    const int x_idx = head_offset / x;
+    const int x_offset = head_offset % x;
+
+    const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
+                                + head_idx * (head_size / x) * block_size * x
+                                + x_idx * block_size * x
+                                + block_offset * x
+                                + x_offset;
+    const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size
+                                  + head_idx * head_size * block_size
+                                  + head_offset * block_size
+                                  + block_offset;
+    scalar_t tgt_key = key[src_key_idx];
+    scalar_t tgt_value = value[src_value_idx];
+    if constexpr (KV_CACHE_DTYPE == INT8) {
+      key_cache[tgt_key_idx] = int8::quant(tgt_key, k_scale, k_zp);
+      value_cache[tgt_value_idx] = int8::quant(tgt_value, v_scale, v_zp);
+#ifdef ENABLE_FP8_E5M2
+    } else if constexpr (KV_CACHE_DTYPE == FP8_E5M2) {
+      key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_key);
+      value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_value);
+#endif
+    } else {
+      key_cache[tgt_key_idx] = tgt_key;
+      value_cache[tgt_value_idx] = tgt_value;
+    }
+  }
+}
+
+} // namespace aphrodite
+
+#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_CACHE_DTYPE)                                      \
+  aphrodite::reshape_and_cache_kernel<KV_T, CACHE_T, KV_CACHE_DTYPE><<<grid, block, 0, stream>>>(  \
+    reinterpret_cast<KV_T*>(key.data_ptr()),                                                       \
+    reinterpret_cast<KV_T*>(value.data_ptr()),                                                     \
+    reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),                                              \
+    reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),                                            \
+    slot_mapping.data_ptr<int64_t>(),                                                              \
+    key_stride,                                                                                    \
+    value_stride,                                                                                  \
+    num_heads,                                                                                     \
+    head_size,                                                                                     \
+    block_size,                                                                                    \
+    x,                                                                                             \
+    k_scale,                                                                                       \
+    k_zp,                                                                                          \
+    v_scale,                                                                                       \
+    v_zp);
+
+void reshape_and_cache(
+  torch::Tensor& key,           // [num_tokens, num_heads, head_size]
+  torch::Tensor& value,         // [num_tokens, num_heads, head_size]
+  torch::Tensor& key_cache,     // [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,   // [num_blocks, num_heads, head_size, block_size]
+  torch::Tensor& slot_mapping,  // [num_tokens]
+  const std::string& kv_cache_dtype,
+  const float k_scale = 1.0f,
+  const float k_zp = 0.0f,
+  const float v_scale = 1.0f,
+  const float v_zp = 0.0f)
+{
+  int num_tokens = key.size(0);
+  int num_heads = key.size(1);
+  int head_size = key.size(2);
+  int block_size = key_cache.size(3);
+  int x = key_cache.size(4);
+
+  int key_stride = key.stride(0);
+  int value_stride = value.stride(0);
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(num_heads * head_size, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (kv_cache_dtype == "auto") {
+    if (key.dtype() == at::ScalarType::Float) {
+      CALL_RESHAPE_AND_CACHE(float, float, AUTO);
+    } else if (key.dtype() == at::ScalarType::Half) {
+      CALL_RESHAPE_AND_CACHE(uint16_t, uint16_t, AUTO);
+    } else if (key.dtype() == at::ScalarType::BFloat16) {
+      CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, AUTO);
+    }
+#ifdef ENABLE_FP8_E5M2
+  } else if (kv_cache_dtype == "fp8_e5m2") {
+    if (key.dtype() == at::ScalarType::Float) {
+      CALL_RESHAPE_AND_CACHE(float, uint8_t, FP8_E5M2);
+    } else if (key.dtype() == at::ScalarType::Half) {
+      CALL_RESHAPE_AND_CACHE(uint16_t, uint8_t, FP8_E5M2);
+    } else if (key.dtype() == at::ScalarType::BFloat16) {
+      CALL_RESHAPE_AND_CACHE(__nv_bfloat16, uint8_t, FP8_E5M2);
+    }
+#endif
+  } else if (kv_cache_dtype == "int8") {
+    if (key.dtype() == at::ScalarType::Float) {
+      CALL_RESHAPE_AND_CACHE(float, int8_t, INT8);
+    } else if (key.dtype() == at::ScalarType::Half) {
+      CALL_RESHAPE_AND_CACHE(uint16_t, int8_t, INT8);
+    } else if (key.dtype() == at::ScalarType::BFloat16) {
+      CALL_RESHAPE_AND_CACHE(__nv_bfloat16, int8_t, INT8);
+    }
+  } else {
+    TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+  }
+}
+
+namespace aphrodite {
+
+// Grid: (num_tokens). Each thread block gathers one token's K/V from the cache.
+template<typename scalar_t>
+__global__ void gather_cached_kv_kernel(
+  scalar_t* __restrict__ key,             // [num_tokens, [stride], num_heads, head_size]
+  scalar_t* __restrict__ value,           // [num_tokens, [stride], num_heads, head_size]
+  const scalar_t* __restrict__ key_cache,   // [num_blocks, num_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ value_cache,   // [num_blocks, num_heads, head_size, block_size]
+  const int* __restrict__ slot_mapping,   // [num_tokens]
+  const int key_stride,
+  const int value_stride,
+  const int num_heads,
+  const int head_size,
+  const int block_size,
+  const int x) {
+    const int token_idx = blockIdx.x;
+    const int slot_idx = slot_mapping[token_idx];
+    const int block_idx = slot_idx / block_size;
+    const int block_offset = slot_idx % block_size;
+
+    const int dim = num_heads * head_size;  // elements per token, not a token count
+    for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+      const int tgt_key_idx = token_idx * key_stride + i;
+      const int tgt_value_idx = token_idx * value_stride + i;
+
+      const int head_idx = i / head_size;
+      const int head_offset = i % head_size;
+      const int x_idx = head_offset / x;  // the offset of the [head_size/x] dimension
+      const int x_offset = head_offset % x;
+
+      const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
+                              + head_idx * (head_size / x) * block_size * x
+                              + x_idx * block_size * x
+                              + block_offset * x
+                              + x_offset;
+      const int src_value_idx = block_idx * num_heads * head_size * block_size
+                                + head_idx * head_size * block_size
+                                + head_offset * block_size
+                                + block_offset;
+
+      key[tgt_key_idx] = APHRODITE_LDG(&key_cache[src_key_idx]);
+      value[tgt_value_idx] = APHRODITE_LDG(&value_cache[src_value_idx]);
+    }
+}
+
+template <typename scalar_t>
+__global__ void gather_cached_kv_kernel_optimized(
+    scalar_t *__restrict__ key,             // [num_tokens, [stride], num_heads, head_size]
+    scalar_t *__restrict__ value,           // [num_tokens, [stride], num_heads, head_size]
+    const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+    const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size]
+    const int *__restrict__ slot_mapping,   // [num_tokens]
+    const int key_stride,
+    const int value_stride,
+    const int num_heads,
+    const int head_size,
+    const int block_size,
+    const int x)
+{
+    const int token_idx = blockIdx.x;
+    const int slot_idx = slot_mapping[token_idx];
+    const int block_idx = slot_idx / block_size;
+    const int block_offset = slot_idx % block_size;
+
+    const int dim = num_heads * head_size;
+    assert(dim % 4 == 0);  // this is true for known use cases
+    const int unroll_factor = 4;
+    const int unrolled_dim = dim / unroll_factor;
+
+    for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x)
+    {
+        int tgt_key_indices[unroll_factor];
+        int tgt_value_indices[unroll_factor];
+        int src_key_indices[unroll_factor];
+        int src_value_indices[unroll_factor];
+        scalar_t keys_to_store[unroll_factor];
+        scalar_t values_to_store[unroll_factor];
+
+        #pragma unroll
+        for (int j = 0; j < unroll_factor; ++j)
+        {
+            int index = i + j * unrolled_dim;
+
+            const int tgt_key_idx = token_idx * key_stride + index;
+            const int tgt_value_idx = token_idx * value_stride + index;
+
+            const int head_idx = index / head_size;
+            const int head_offset = index % head_size;
+            const int x_idx = head_offset / x;
+            const int x_offset = head_offset % x;
+
+            const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
+                                    + head_idx * (head_size / x) * block_size * x
+                                    + x_idx * block_size * x
+                                    + block_offset * x
+                                    + x_offset;
+            const int src_value_idx = block_idx * num_heads * head_size * block_size
+                                      + head_idx * head_size * block_size
+                                      + head_offset * block_size
+                                      + block_offset;
+
+            tgt_key_indices[j] = tgt_key_idx;
+            tgt_value_indices[j] = tgt_value_idx;
+            src_key_indices[j] = src_key_idx;
+            src_value_indices[j] = src_value_idx;
+
+            keys_to_store[j] = APHRODITE_LDG(&key_cache[src_key_idx]);
+            values_to_store[j] = APHRODITE_LDG(&value_cache[src_value_idx]);
+        }
+
+        #pragma unroll
+        for (int j = 0; j < unroll_factor; ++j)
+        {
+            key[tgt_key_indices[j]] = keys_to_store[j];
+            value[tgt_value_indices[j]] = values_to_store[j];
+        }
+    }
+}
+
+} // namespace aphrodite
+
+void gather_cached_kv(
+  torch::Tensor& key,           // [out] [num_tokens, num_heads, head_size]
+  torch::Tensor& value,         // [out] [num_tokens, num_heads, head_size]
+  torch::Tensor& key_cache,     // [in]  [num_blocks, num_heads, head_size/x, block_size, x]
+  torch::Tensor& value_cache,   // [in]  [num_blocks, num_heads, head_size, block_size]
+  torch::Tensor& slot_mapping)  // [in]  [num_tokens]
+{
+  int num_tokens = key.size(0);
+  int num_heads = key.size(1);
+  int head_size = key.size(2);
+  int block_size = key_cache.size(3);
+  int x = key_cache.size(4);
+
+  int key_stride = key.stride(0);
+  int value_stride = value.stride(0);
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(num_heads * head_size, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  APHRODITE_DISPATCH_FLOATING_AND_BYTE_TYPES(
+    key.scalar_type(),
+    "gather_cached_kv_kernel_optimized",
+    [&] {
+      aphrodite::gather_cached_kv_kernel_optimized<scalar_t><<<grid, block, 0, stream>>>(
+        key.data_ptr<scalar_t>(),
+        value.data_ptr<scalar_t>(),
+        key_cache.data_ptr<scalar_t>(),
+        value_cache.data_ptr<scalar_t>(),
+        slot_mapping.data_ptr<int>(),
+        key_stride,
+        value_stride,
+        num_heads,
+        head_size,
+        block_size,
+        x);
+    });
+}
+
+namespace aphrodite {
+
+template<typename Tout, typename Tin>
+__global__ void convert_fp8_e5m2_kernel(
+  const Tin* __restrict__ src_cache,
+  Tout* __restrict__ dst_cache,
+  const int64_t block_stride) {
+  const int64_t block_idx = blockIdx.x;
+  for (int i = threadIdx.x; i < block_stride; i += blockDim.x) {
+    int64_t idx = block_idx * block_stride + i;
+#ifdef ENABLE_FP8_E5M2
+    dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion<Tout, Tin>(src_cache[idx]);
+#else
+    assert(false);
+#endif
+  }
+}
+
+} // namespace aphrodite
+
+#define CALL_CONVERT_FP8_E5M2(Tout, Tin)                                 \
+  aphrodite::convert_fp8_e5m2_kernel<Tout, Tin><<<grid, block, 0, stream>>>(  \
+    reinterpret_cast<Tin*>(src_cache.data_ptr()),                        \
+    reinterpret_cast<Tout*>(dst_cache.data_ptr()),                       \
+    block_stride);
+
+void convert_fp8_e5m2(
+  torch::Tensor& src_cache,
+  torch::Tensor& dst_cache)
+{
+  int64_t num_blocks = src_cache.size(0);
+  int64_t block_stride = src_cache.stride(0);
+
+  dim3 grid(num_blocks);
+  dim3 block(std::min(block_stride, int64_t(512)));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (src_cache.dtype() == at::ScalarType::Float) {
+    CALL_CONVERT_FP8_E5M2(uint8_t, float);
+  } else if (src_cache.dtype() == at::ScalarType::Half) {
+    CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t);
+  } else if (src_cache.dtype() == at::ScalarType::BFloat16) {
+    CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16);
+  } else if (dst_cache.dtype() == at::ScalarType::Float) {
+    CALL_CONVERT_FP8_E5M2(float, uint8_t);
+  } else if (dst_cache.dtype() == at::ScalarType::Half) {
+    CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t);
+  } else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
+    CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t);
+  }
+}
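
The cache kernels above all rely on the same flat-index arithmetic over the paged layouts; pulled out of the kernels for readability, the addressing works as in the standalone sketch below (the helper names are not part of this diff).

#include <cstdint>

// key_cache layout:   [num_blocks, num_heads, head_size/x, block_size, x]
// value_cache layout: [num_blocks, num_heads, head_size,   block_size]
// A token's slot is split as block_idx = slot / block_size and
// block_offset = slot % block_size, exactly as in reshape_and_cache_kernel.
int64_t key_cache_index(int64_t block_idx, int block_offset, int head_idx,
                        int head_offset, int num_heads, int head_size,
                        int block_size, int x) {
  const int x_idx = head_offset / x;     // which x-wide chunk of the head dim
  const int x_offset = head_offset % x;  // position inside that chunk
  return block_idx * num_heads * (head_size / x) * block_size * x
         + head_idx * (head_size / x) * block_size * x
         + x_idx * block_size * x
         + block_offset * x
         + x_offset;
}

int64_t value_cache_index(int64_t block_idx, int block_offset, int head_idx,
                          int head_offset, int num_heads, int head_size,
                          int block_size) {
  return block_idx * num_heads * head_size * block_size
         + head_idx * head_size * block_size
         + head_offset * block_size
         + block_offset;
}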

+ 39 - 0
kernels/backup/dispatch_utils.h

@@ -0,0 +1,39 @@
+/*
+ * Adapted from
+ * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
+ */
+#pragma once
+
+#include <torch/extension.h>
+
+#define APHRODITE_DISPATCH_CASE_FLOATING_TYPES(...)              \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+#define APHRODITE_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)             \
+  AT_DISPATCH_SWITCH(                                             \
+    TYPE, NAME, APHRODITE_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+
+#define APHRODITE_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...)     \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
+
+#define APHRODITE_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...)           \
+  AT_DISPATCH_SWITCH(                                                    \
+    TYPE, NAME, APHRODITE_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
+
+#define APHRODITE_DISPATCH_CASE_INTEGRAL_TYPES(...)             \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
+
+#define APHRODITE_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...)             \
+  AT_DISPATCH_SWITCH(                                             \
+    TYPE, NAME, APHRODITE_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
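
The dispatch macros above follow the stock AT_DISPATCH pattern; a minimal usage sketch follows, where the kernel name and tensor are hypothetical and dispatch_utils.h plus torch/extension.h are assumed to be included.

void launch_example(torch::Tensor& input) {
  APHRODITE_DISPATCH_FLOATING_AND_BYTE_TYPES(
    input.scalar_type(), "example_kernel", ([&] {
      // Inside the lambda, scalar_t is bound to the C++ type matching
      // input.dtype(): float, at::Half, at::BFloat16, uint8_t, or int8_t.
      scalar_t* ptr = input.data_ptr<scalar_t>();
      (void)ptr;  // a real caller would launch example_kernel<scalar_t> here
    }));
}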

+ 280 - 0
kernels/backup/dtype_float32.cuh

@@ -0,0 +1,280 @@
+/*
+ * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Copyright (c) 2023, The vLLM team.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "attention_generic.cuh"
+
+#include <stdint.h>
+
+namespace aphrodite {
+
+// Define custom FP32 vector data types.
+struct Float4_ {
+  float2 x;
+  float2 y;
+};
+
+struct Float8_ {
+  float2 x;
+  float2 y;
+  float2 z;
+  float2 w;
+};
+
+// FP32 vector types for Q, K, V.
+template<>
+struct Vec<float, 1> {
+  using Type = float;
+};
+template<>
+struct Vec<float, 2> {
+  using Type = float2;
+};
+template<>
+struct Vec<float, 4> {
+  using Type = float4;
+};
+
+// FP32 accumulator vector types corresponding to Vec.
+template<>
+struct FloatVec<float> {
+  using Type = float;
+};
+template<>
+struct FloatVec<float2> {
+  using Type = float2;
+};
+template<>
+struct FloatVec<float4> {
+  using Type = float4;
+};
+
+// Vector addition.
+inline __device__ float add(float a, float b) {
+  return a + b;
+}
+
+inline __device__ float2 add(float2 a, float2 b) {
+  float2 c;
+  c.x = add(a.x, b.x);
+  c.y = add(a.y, b.y);
+  return c;
+}
+
+inline __device__ float4 add(float4 a, float4 b) {
+  float4 c;
+  c.x = add(a.x, b.x);
+  c.y = add(a.y, b.y);
+  c.z = add(a.z, b.z);
+  c.w = add(a.w, b.w);
+  return c;
+}
+
+inline __device__ Float4_ add(Float4_ a, Float4_ b) {
+  Float4_ c;
+  c.x = add(a.x, b.x);
+  c.y = add(a.y, b.y);
+  return c;
+}
+
+// Vector multiplication.
+template<>
+inline __device__ float mul<float, float>(float a, float b) {
+  return a * b;
+}
+
+template<>
+inline __device__ float2 mul(float2 a, float2 b) {
+  float2 c;
+  c.x = a.x * b.x;
+  c.y = a.y * b.y;
+  return c;
+}
+
+template<>
+inline __device__ float2 mul(float a, float2 b) {
+  float2 c;
+  c.x = a * b.x;
+  c.y = a * b.y;
+  return c;
+}
+
+template<>
+inline __device__ float4 mul(float4 a, float4 b) {
+  float4 c;
+  c.x = a.x * b.x;
+  c.y = a.y * b.y;
+  c.z = a.z * b.z;
+  c.w = a.w * b.w;
+  return c;
+}
+
+template<>
+inline __device__ float4 mul(float a, float4 b) {
+  float4 c;
+  c.x = a * b.x;
+  c.y = a * b.y;
+  c.z = a * b.z;
+  c.w = a * b.w;
+  return c;
+}
+
+// Vector fused multiply-add.
+inline __device__ float fma(float a, float b, float c) {
+  return a * b + c;
+}
+
+inline __device__ float2 fma(float2 a, float2 b, float2 c) {
+  float2 d;
+  d.x = fma(a.x, b.x, c.x);
+  d.y = fma(a.y, b.y, c.y);
+  return d;
+}
+
+inline __device__ float2 fma(float a, float2 b, float2 c) {
+  float2 d;
+  d.x = fma(a, b.x, c.x);
+  d.y = fma(a, b.y, c.y);
+  return d;
+}
+
+inline __device__ float4 fma(float4 a, float4 b, float4 c) {
+  float4 d;
+  d.x = fma(a.x, b.x, c.x);
+  d.y = fma(a.y, b.y, c.y);
+  d.z = fma(a.z, b.z, c.z);
+  d.w = fma(a.w, b.w, c.w);
+  return d;
+}
+
+inline __device__ float4 fma(float a, float4 b, float4 c) {
+  float4 d;
+  d.x = fma(a, b.x, c.x);
+  d.y = fma(a, b.y, c.y);
+  d.z = fma(a, b.z, c.z);
+  d.w = fma(a, b.w, c.w);
+  return d;
+}
+
+inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) {
+  Float4_ d;
+  d.x = fma(a, b.x, c.x);
+  d.y = fma(a, b.y, c.y);
+  return d;
+}
+
+inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
+  Float8_ d;
+  d.x = fma(a, b.x, c.x);
+  d.y = fma(a, b.y, c.y);
+  d.z = fma(a, b.z, c.z);
+  d.w = fma(a, b.w, c.w);
+  return d;
+}
+
+// Vector sum.
+template<>
+inline __device__ float sum(float v) {
+  return v;
+}
+
+template<>
+inline __device__ float sum(float2 v) {
+  return v.x + v.y;
+}
+
+template<>
+inline __device__ float sum(float4 v) {
+  return v.x + v.y + v.z + v.w;
+}
+
+template<>
+inline __device__ float sum(Float4_ v) {
+  return v.x.x + v.x.y + v.y.x + v.y.y;
+}
+
+template<>
+inline __device__ float sum(Float8_ v) {
+  return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
+}
+
+// Vector dot product.
+inline __device__ float dot(float a, float b) {
+  return a * b;
+}
+
+inline __device__ float dot(float2 a, float2 b) {
+  float2 c = mul<float2, float2, float2>(a, b);
+  return c.x + c.y;
+}
+
+inline __device__ float dot(Float4_ a, Float4_ b) {
+  float2 acc = mul<float2, float2, float2>(a.x, b.x);
+  acc = fma(a.y, b.y, acc);
+  return acc.x + acc.y;
+}
+
+inline __device__ float dot(Float8_ a, Float8_ b) {
+  float2 acc = mul<float2, float2, float2>(a.x, b.x);
+  acc = fma(a.y, b.y, acc);
+  acc = fma(a.z, b.z, acc);
+  acc = fma(a.w, b.w, acc);
+  return acc.x + acc.y;
+}
+
+// From float to float.
+inline __device__ void from_float(float& dst, float src) {
+  dst = src;
+}
+
+inline __device__ void from_float(float2& dst, float2 src) {
+  dst = src;
+}
+
+inline __device__ void from_float(float4& dst, float4 src) {
+  dst = src;
+}
+
+// To float (identity conversions for FP32 types).
+inline __device__ float to_float(float u) {
+  return u;
+}
+
+inline __device__ float2 to_float(float2 u) {
+  return u;
+}
+
+inline __device__ float4 to_float(float4 u) {
+  return u;
+}
+
+inline __device__ Float4_ to_float(Float4_ u) {
+  return u;
+}
+
+inline __device__ Float8_ to_float(Float8_ u) {
+  return u;
+}
+
+// Zero-out a variable.
+inline __device__ void zero(float& dst) {
+  dst = 0.f;
+}
+
+} // namespace aphrodite
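
To show how these helpers compose, a minimal device-side sketch of a two-element partial Q.K dot product follows; it assumes this header (and the attention_generic.cuh it includes) is visible and that the pointers are 8-byte aligned, and it is not part of this diff.

// Each thread loads two query and two key floats as a float2, multiplies them
// element-wise via mul<>, and reduces to a scalar with dot(); warp shuffles
// (not shown) would then combine the partial sums across a thread group.
__device__ float partial_qk_dot_fp32(const float* q, const float* k) {
  using Q_vec = typename aphrodite::Vec<float, 2>::Type;  // resolves to float2
  Q_vec q_vec = *reinterpret_cast<const Q_vec*>(q);
  Q_vec k_vec = *reinterpret_cast<const Q_vec*>(k);
  return aphrodite::dot(q_vec, k_vec);                    // c.x + c.y
}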

+ 0 - 0
kernels/attention/dtype_int8.cuh → kernels/backup/dtype_int8.cuh


Some files were not shown because too many files changed in this diff