
add phi3_small support with blocksparse attention

AlpinDale 7 months ago
parent
commit
696f2cd59c

+ 1 - 0
aphrodite/attention/backends/abstract.py

@@ -111,6 +111,7 @@ class AttentionImpl(ABC, Generic[T]):
         alibi_slopes: Optional[List[float]] = None,
         sliding_window: Optional[int] = None,
         kv_cache_dtype: str = "auto",
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
         raise NotImplementedError
 

+ 405 - 0
aphrodite/attention/backends/blocksparse_attn.py

@@ -0,0 +1,405 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import torch
+
+from aphrodite.attention.backends.abstract import (AttentionBackend,
+                                                   AttentionImpl,
+                                                   AttentionMetadata)
+from aphrodite.attention.ops.blocksparse_attention.interface import (
+    LocalStridedBlockSparseAttn, get_head_sliding_step)
+from aphrodite.attention.ops.paged_attn import PagedAttention
+from aphrodite.distributed import (get_tensor_model_parallel_rank,
+                                   get_tensor_model_parallel_world_size)
+
+
+@dataclass
+class BlocksparseParams:
+    max_seqlen: int
+
+    # Num q heads per tensor-parallel rank/partition
+    num_heads: int  # per TP partition
+    # Num kv heads per tensor-parallel rank/partition
+    num_kv_heads: int
+
+    # block size used for blocksparse attention.
+    # This is the block_size used in `local_blocks`, `vert_stride`.
+    block_size: int
+
+    # Number of blocks for local attention, i.e., the number of
+    # locally attended tokens / `sparse_block_size`.
+    local_blocks: int
+
+    # Attend to one block per every `vert_stride` blocks.
+    # This controls the sparsity.
+    vert_stride: int
+    """
+    Whether to use the same vertical stride offset for all heads,
+    i.e., attend to the same block of tokens on all heads.
+    By default, it is False, i.e., attention on the non-local
+    blocks depends on the `head_idx`, that is, on
+    blocks satisfying
+    `(block_idx + head_idx * head_sliding_step + 1) % vert_stride == 0`,
+    where `head_sliding_step=max(1, int(vert_stride / num_total_heads))`
+    and `block_idx = position_id // sparse_block_size`.
+    See `..ops.blocksparse_attention.utils:get_sparse_attn_mask`
+    for more detail.
+    """
+    homo_head: bool = False
+
+    # Whether, within a group, the kv offsets attended by each q head are the same.
+    homo_head_group: bool = False
+
+    # Determined by homo_head and homo_head_group.
+    head_sliding_step: int = field(init=False)
+
+    # Range of q heads for this TP rank.
+    active_head_range: Tuple = field(init=False)
+
+    def __post_init__(self):
+        assert self.block_size > 0
+        assert self.local_blocks >= 0
+        assert self.vert_stride >= 1
+        assert self.num_heads % self.num_kv_heads == 0
+
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        total_heads = tp_size * self.num_heads
+        total_kv_heads = tp_size * self.num_kv_heads
+
+        if self.homo_head:
+            self.head_sliding_step = 0
+        elif self.homo_head_group:
+            head_sliding_step = get_head_sliding_step(total_kv_heads,
+                                                      self.vert_stride)
+            # negative indicates sliding along kv heads, i.e., homo q group
+            self.head_sliding_step = -head_sliding_step
+        else:
+            self.head_sliding_step = get_head_sliding_step(
+                total_heads, self.vert_stride)
+
+        self.active_head_range = (
+            tp_rank * self.num_heads,
+            (tp_rank + 1) * self.num_heads,
+        )
+
+
+class BlocksparseFlashAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]:
+        return BlocksparseFlashAttentionImpl
+
+    @staticmethod
+    def make_metadata(*args, **kwargs) -> "BlocksparseFlashAttentionMetadata":
+        return BlocksparseFlashAttentionMetadata(*args, **kwargs)
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
+                                                 num_kv_heads, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+
+
+@dataclass
+class BlocksparseFlashAttentionMetadata(AttentionMetadata):
+    """A copy of Metadata for FlashAttentionBackend,
+    to avoid having to install flash_attn.
+    NOTE: Any python object stored here is not updated when it is
+    cuda-graph replayed. If you have values that need to be changed
+    dynamically, they should be stored in a tensor. The tensor has to be
+    updated from the `CUDAGraphRunner.forward` API.
+    """
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens. None if it is a decoding batch.
+    seq_lens: Optional[List[int]]
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor]
+
+    # NOTE(sang): Definition of context_len, query_len, and seq_len.
+    # |---------- N-1 iteration --------|
+    # |---------------- N iteration ---------------------|
+    # |- tokenA -|......................|-- newTokens ---|
+    # |---------- context_len ----------|
+    # |-------------------- seq_len ----------------------|
+    #                                   |-- query_len ---|
+
+    # Maximum query length in the batch. None for decoding.
+    max_query_len: Optional[int]
+    # Maximum sequence length among prefill batch. 0 if there are decoding
+    # requests only.
+    max_prefill_seq_len: int
+    # Maximum sequence length among decode batch. 0 if there are prefill
+    # requests only.
+    max_decode_seq_len: int
+    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+    # the batch, used to index into subquery. E.g., if the subquery length
+    # is [4, 6], it is [0, 4, 10].
+    query_start_loc: Optional[torch.Tensor]
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    seq_start_loc: Optional[torch.Tensor]
+    # (batch_size,) A tensor of context lengths (tokens that are computed
+    # so far).
+    context_lens_tensor: Optional[torch.Tensor]
+
+    # (batch_size, max_blocks_per_seq).
+    # Block addresses per sequence. (Seq id -> list of physical block)
+    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
+    # in the kv cache. Each block can contain up to block_size tokens.
+    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
+    # captured.
+    block_tables: Optional[torch.Tensor]
+
+    # Whether or not cuda graph is enabled.
+    # Cuda-graph is currently enabled for decoding only.
+    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+    use_cuda_graph: bool
+
+    _cached_prefill_metadata: Optional[
+        "BlocksparseFlashAttentionMetadata"] = None
+    _cached_decode_metadata: Optional[
+        "BlocksparseFlashAttentionMetadata"] = None
+
+    @property
+    def prefill_metadata(
+            self) -> Optional["BlocksparseFlashAttentionMetadata"]:
+        if self.num_prefills == 0:
+            return None
+
+        if self._cached_prefill_metadata is not None:
+            return self._cached_prefill_metadata
+
+        assert self.seq_lens is not None
+        assert self.seq_lens_tensor is not None
+        assert self.query_start_loc is not None
+        assert self.context_lens_tensor is not None
+        assert self.block_tables is not None
+        assert self.seq_start_loc is not None
+
+        self._cached_prefill_metadata = BlocksparseFlashAttentionMetadata(
+            num_prefills=self.num_prefills,
+            num_prefill_tokens=self.num_prefill_tokens,
+            num_decode_tokens=0,
+            slot_mapping=self.slot_mapping[:self.num_prefill_tokens],
+            seq_lens=self.seq_lens[:self.num_prefills],
+            seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills],
+            max_query_len=self.max_query_len,
+            max_prefill_seq_len=self.max_prefill_seq_len,
+            max_decode_seq_len=0,
+            query_start_loc=self.query_start_loc[:self.num_prefills + 1],
+            seq_start_loc=self.seq_start_loc[:self.num_prefills + 1],
+            context_lens_tensor=self.context_lens_tensor[:self.num_prefills],
+            block_tables=self.block_tables[:self.num_prefills],
+            use_cuda_graph=False,
+        )
+        return self._cached_prefill_metadata
+
+    @property
+    def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]:
+        if self.num_decode_tokens == 0:
+            return None
+
+        if self._cached_decode_metadata is not None:
+            return self._cached_decode_metadata
+        assert self.block_tables is not None
+        assert self.seq_lens_tensor is not None
+
+        self._cached_decode_metadata = BlocksparseFlashAttentionMetadata(
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=self.num_decode_tokens,
+            slot_mapping=self.slot_mapping[self.num_prefill_tokens:],
+            seq_lens=None,
+            seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:],
+            max_query_len=None,
+            max_prefill_seq_len=0,
+            max_decode_seq_len=self.max_decode_seq_len,
+            query_start_loc=None,
+            seq_start_loc=None,
+            context_lens_tensor=None,
+            block_tables=self.block_tables[self.num_prefills:],
+            use_cuda_graph=self.use_cuda_graph,
+        )
+        return self._cached_decode_metadata
+
+
+class BlocksparseFlashAttentionImpl(AttentionImpl):
+    """
+    If the input tensors contain prompt tokens, the layout is as follows:
+    |<--------------- num_prompt_tokens -------------->|
+    |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|
+    Otherwise, the layout is as follows:
+    |<------------------ num_generation_tokens (M) ----------------->|
+    |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->|
+    Generation tokens can contain padding when cuda-graph is used.
+    Currently, prompt tokens don't contain any padding.
+    The prompts might have different lengths, while the generation tokens
+    always have length 1.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        assert blocksparse_params is not None
+        assert alibi_slopes is None, ValueError(
+            "Alibi is not supported for blocksparse flash attention.")
+        assert sliding_window is None, ValueError(
+            "sliding_window is invalid for blocksparse attention.")
+
+        if "num_heads" not in blocksparse_params:
+            blocksparse_params["num_heads"] = num_heads
+        if "num_kv_heads" not in blocksparse_params:
+            blocksparse_params["num_kv_heads"] = num_kv_heads or num_heads
+        self.blocksparse_params = BlocksparseParams(**blocksparse_params)
+        self.kv_cache_dtype = kv_cache_dtype
+
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.alibi_slopes = alibi_slopes
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        self.local_blocks = self.blocksparse_params.local_blocks
+        self.vert_stride = self.blocksparse_params.vert_stride
+        self.sparse_block_size = self.blocksparse_params.block_size
+        self.head_sliding_step = self.blocksparse_params.head_sliding_step
+
+        supported_head_sizes = PagedAttention.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by PagedAttention. "
+                f"Supported head sizes are: {supported_head_sizes}.")
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        total_num_heads = num_heads * self.tp_size
+        self.bs_attn = LocalStridedBlockSparseAttn(
+            total_num_heads,
+            self.blocksparse_params.max_seqlen,
+            self.blocksparse_params.local_blocks,
+            self.blocksparse_params.vert_stride,
+            self.blocksparse_params.block_size,
+            homo_head=self.blocksparse_params.homo_head,
+            active_head_range=self.blocksparse_params.active_head_range,
+        )
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: BlocksparseFlashAttentionMetadata,
+        kv_scale: float = 1.0,
+    ) -> torch.Tensor:
+        """Forward pass with FlashAttention and PagedAttention.
+        Args:
+            query: shape = [num_tokens, num_heads * head_size]
+            key: shape = [num_tokens, num_kv_heads * head_size]
+            value: shape = [num_tokens, num_kv_heads * head_size]
+            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+        num_tokens, hidden_size = query.shape
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+
+        if kv_cache is not None:
+            key_cache, value_cache = PagedAttention.split_kv_cache(
+                kv_cache, self.num_kv_heads, self.head_size)
+
+            # Reshape the input keys and values and store them in the cache.
+            # If kv_cache is not provided, the new key and value tensors are
+            # not cached. This happens during the initial memory profiling run.
+
+            PagedAttention.write_to_paged_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                self.kv_cache_dtype,
+                kv_scale,
+            )
+
+        if prefill_meta := attn_metadata.prefill_metadata:
+
+            # Prompt run.
+            # normal attention
+            # When block_tables are not filled, it means q and k are the
+            # prompt, and they have the same length.
+
+            assert kv_cache is None \
+                    or prefill_meta.block_tables is None \
+                    or prefill_meta.block_tables.numel() == 0, \
+                "Does not support prefix-enabled attention."
+
+            output = self.bs_attn(
+                q=query,
+                k=key,
+                v=value,
+                cu_seqlens_q=prefill_meta.seq_start_loc,
+                cu_seqlens_k=prefill_meta.seq_start_loc,
+                sm_scale=self.scale,
+            )
+
+        if decode_meta := attn_metadata.decode_metadata:
+            # Decoding run.
+            output = PagedAttention.forward_decode(
+                query,
+                key_cache,
+                value_cache,
+                decode_meta.block_tables,
+                decode_meta.seq_lens_tensor,
+                self.blocksparse_params.max_seqlen,
+                self.kv_cache_dtype,
+                self.num_kv_heads,
+                self.scale,
+                self.alibi_slopes,
+                kv_scale,
+                tp_rank=self.tp_rank,
+                blocksparse_local_blocks=self.local_blocks,
+                blocksparse_vert_stride=self.vert_stride,
+                blocksparse_block_size=self.sparse_block_size,
+                blocksparse_head_sliding_step=self.head_sliding_step,
+            )
+
+        # Reshape the output tensor.
+        return output.view(num_tokens, hidden_size)
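
The block-selection rule documented in `BlocksparseParams` (dense local blocks plus a per-head vertical stride) boils down to the predicate below. This is a standalone sketch; the `local_blocks`/`vert_stride` values are chosen for illustration and are not taken from any model config.

# Standalone sketch of the mask predicate described in BlocksparseParams /
# get_sparse_attn_mask (illustrative values only).
def attended_k_blocks(q_block_idx, head_idx, n_heads, local_blocks,
                      vert_stride, homo_head=False):
    head_sliding_step = 0 if homo_head else max(1, int(vert_stride / n_heads))
    attended = []
    for k_block_idx in range(q_block_idx + 1):  # causal: k block <= q block
        is_local = (q_block_idx - k_block_idx) < local_blocks
        is_strided = ((k_block_idx + head_idx * head_sliding_step + 1)
                      % vert_stride == 0)
        if is_local or is_strided:
            attended.append(k_block_idx)
    return attended

# e.g. head 0 of 32 attends to every 8th block plus the last 16 local blocks:
print(attended_k_blocks(q_block_idx=40, head_idx=0, n_heads=32,
                        local_blocks=16, vert_stride=8))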

+ 5 - 1
aphrodite/attention/backends/flash_attn.py

@@ -1,6 +1,6 @@
 """Attention layer with FlashAttention."""
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
@@ -220,7 +220,10 @@ class FlashAttentionImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
+        assert blocksparse_params is None, ValueError(
+            "FlashAttention does not support block-sparse attention.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -240,6 +243,7 @@ class FlashAttentionImpl(AttentionImpl):
             # paged KV cache.
             raise ValueError(
                 "Sliding window is not supported in FlashAttention.")
+
         support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
         if head_size not in support_head_sizes:
             raise ValueError(

+ 3 - 0
aphrodite/attention/backends/flashinfer.py

@@ -169,7 +169,10 @@ class FlashInferImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
+        assert blocksparse_params is None, ValueError(
+            "FlashInfer does not support block-sparse attention.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

+ 5 - 2
aphrodite/attention/backends/rocm_flash_attn.py

@@ -1,7 +1,7 @@
 """Attention layer ROCm GPUs."""
-from dataclasses import dataclass
 import os
-from typing import List, Optional, Tuple, Type
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 from loguru import logger
@@ -200,7 +200,10 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
+        assert blocksparse_params is None, ValueError(
+            "ROCm FlashAttention does not support block-sparse attention.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

+ 4 - 1
aphrodite/attention/backends/torch_sdpa.py

@@ -1,7 +1,7 @@
 """ Attention layer with torch scaled_dot_product_attention
     and PagedAttention."""
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 from torch.nn.functional import scaled_dot_product_attention
@@ -101,7 +101,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
+        assert blocksparse_params is None, ValueError(
+            "Torch SDPA does not support block-sparse attention.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

+ 4 - 1
aphrodite/attention/backends/xformers.py

@@ -1,6 +1,6 @@
 """Attention layer with xFormers and PagedAttention."""
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 from xformers import ops as xops
@@ -210,7 +210,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
+        assert blocksparse_params is None, ValueError(
+            "XFormers does not support block-sparse attention.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

+ 12 - 4
aphrodite/attention/layer.py

@@ -1,5 +1,5 @@
 """Attention layer."""
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 import torch
 import torch.nn as nn
@@ -12,9 +12,11 @@ from aphrodite.quantization.base_config import QuantizationConfig
 
 class Attention(nn.Module):
     """Attention layer.
+
     This class takes query, key, and value tensors as input. The input tensors
     can either contain prompt tokens or generation tokens.
     The class does the following:
+
     1. Store the input key and value tensors in the KV cache.
     2. Perform (multi-head/multi-query/grouped-query) attention.
     3. Return the output tensor.
@@ -30,6 +32,7 @@ class Attention(nn.Module):
         sliding_window: Optional[int] = None,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
         if cache_config is not None:
@@ -60,15 +63,18 @@ class Attention(nn.Module):
             # to self._kv_scale in a native float32 value after weight loading.
             self.quant_method = quant_method
             self.quant_method.create_weights(self)
+
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
         attn_backend = get_attn_backend(num_heads, head_size, num_kv_heads,
                                         sliding_window, dtype, kv_cache_dtype,
-                                        block_size)
+                                        block_size, blocksparse_params
+                                        is not None)
         impl_cls = attn_backend.get_impl_cls()
         self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
-                             alibi_slopes, sliding_window, kv_cache_dtype)
+                             alibi_slopes, sliding_window, kv_cache_dtype,
+                             blocksparse_params)
 
     def forward(
         self,
@@ -78,11 +84,13 @@ class Attention(nn.Module):
         kv_cache: Optional[torch.Tensor],
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        return self.impl.forward(query, key, value, kv_cache, attn_metadata)
+        return self.impl.forward(query, key, value, kv_cache, attn_metadata,
+                                 self._kv_scale)
 
     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore
         s += f", num_heads={self.impl.num_heads}"  # type: ignore
         s += f", num_kv_heads={self.impl.num_kv_heads}"  # type: ignore
         s += f", scale={self.impl.scale}"  # type: ignore
+        s += f", backend={self.impl.__class__.__name__}"
         return s
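
With this change a model passes its sparse-attention settings straight through `Attention`. The sketch below shows the rough shape of such a call; the dict keys follow the `BlocksparseParams` fields above, the numeric values are illustrative, and actually constructing the layer requires an initialized Aphrodite tensor-parallel environment.

# Hypothetical wiring sketch (illustrative values, not the phi3-small config).
from aphrodite.attention import Attention

blocksparse_params = dict(
    max_seqlen=8192,     # longest sequence the sparse layout is built for
    local_blocks=16,     # number of dense local blocks per query block
    vert_stride=8,       # attend to one block every `vert_stride` blocks
    block_size=64,       # sparse block size (multiple of the KV-cache block)
    homo_head=False,     # per-head vertical offsets
)
attn = Attention(num_heads=32, head_size=128, scale=128**-0.5,
                 num_kv_heads=8, blocksparse_params=blocksparse_params)
# Since blocksparse_params is not None, get_attn_backend() returns
# BlocksparseFlashAttentionBackend instead of the default backend.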

+ 0 - 0
aphrodite/attention/ops/blocksparse_attention/__init__.py


+ 422 - 0
aphrodite/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py

@@ -0,0 +1,422 @@
+import torch
+import triton
+import triton.language as tl
+
+
+def blocksparse_flash_attn_varlen_fwd(
+        q,
+        k,
+        v,  # (#tokens, n_heads, head_size)
+        cu_seqlens_k,
+        cu_seqlens_q,
+        sm_scale,
+        sparse_layout,
+        *,
+        block_size=64,
+        q_block_size=None,
+        max_seqlen=None):
+    # split q to blocks
+
+    assert isinstance(sparse_layout, (list, tuple))
+
+    _, n_heads, head_size = q.shape
+    batch_size = cu_seqlens_k.size(0) - 1
+    q_block_size = q_block_size or block_size
+
+    assert q.dim() == k.dim() == v.dim() == 3
+    assert q.size(1) % k.size(1) == 0
+    assert q.size(2) == k.size(2)
+    # TODO: allow k, v to have different head_size
+    assert k.shape == v.shape
+    assert cu_seqlens_k.dim() == 1
+
+    q_k_ratio = q.size(1) // k.size(1)
+
+    if cu_seqlens_q is None:
+        if q.size(0) == batch_size:  # decoding only
+            cu_seqlens_q = torch.arange(
+                0,
+                batch_size + 1,
+                dtype=cu_seqlens_k.dtype,
+                device=cu_seqlens_k.device,
+            )
+        elif q.size(0) == k.size(0):
+            cu_seqlens_q = cu_seqlens_k
+        else:
+            raise ValueError("cu_seqlens_q must be specified "
+                             "if q is a mix of prefilling and decoding.")
+    else:
+        assert cu_seqlens_k.size(0) == cu_seqlens_q.size(0)
+
+    # switch to use cpu to avoid too many kernel launches when iterated over
+    q_lens = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).cpu()
+    k_lens = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).cpu()
+
+    assert torch.logical_or(q_lens == 1, k_lens == q_lens).all(), (
+        "length of q should either be 1 (decoding) or same as k (prefilling).")
+
+    if max_seqlen:
+        assert k_lens.max() <= max_seqlen
+
+    n_blocks = (q_lens + q_block_size - 1) // q_block_size
+
+    q_batch_ids = torch.tensor(
+        [i for i, n in enumerate(n_blocks) for _ in range(n)],
+        dtype=cu_seqlens_q.dtype,
+        device=cu_seqlens_q.device,
+    )
+    q_start_sids = torch.tensor(
+        [i * q_block_size for n in n_blocks for i in range(n)],
+        dtype=cu_seqlens_q.dtype,
+        device=cu_seqlens_q.device,
+    )
+
+    out = q.new_empty(q.shape)
+    cu_seqlens_q = cu_seqlens_q.contiguous()
+    cu_seqlens_k = cu_seqlens_k.contiguous()
+
+    layout_crow_indices, layout_col_indices = sparse_layout
+    block_d = triton.next_power_of_2(head_size)
+
+    decoding_only = (q_lens == 1).all().item()
+    grid = (len(q_start_sids), n_heads, 1)
+
+    _fwd_kernel_batch_inference[grid](
+        q,
+        k,
+        v,
+        out,
+        sm_scale,
+        cu_seqlens_q[:-1],
+        cu_seqlens_q[1:],
+        cu_seqlens_k[:-1],
+        cu_seqlens_k[1:],
+        q_batch_ids,
+        q_start_sids,
+        0,
+        *q.stride(),
+        0,
+        *k.stride(),
+        0,
+        *v.stride(),
+        0,
+        *out.stride(),
+        layout_crow_indices,
+        layout_col_indices,
+        *layout_crow_indices.stride(),
+        *layout_col_indices.stride(),
+        q_k_ratio,
+        HAS_BATCH_DIM=False,
+        D_HEAD=head_size,
+        BLOCK_M=q_block_size,
+        BLOCK_N=block_size,
+        BLOCK_D=block_d,
+        BLOCK_M_LOADING=(16 if decoding_only else
+                         q_block_size),  # smaller for decoding
+        EVEN_D=block_d == head_size,
+        num_warps=1 if decoding_only else 4,
+        num_stages=3)
+
+    return out
+
+
+@triton.jit
+def _fwd_kernel_inner(
+    acc,
+    l_i,
+    m_i,
+    q,
+    Q,
+    k_block_col_idx,
+    layout_col_ptr,
+    layout_col_stride_h,
+    layout_col_stride_m,
+    k_ptrs,
+    v_ptrs,
+    off_h,
+    offs_m,
+    offs_n,
+    offs_d,
+    stride_kt,
+    stride_vt,
+    sm_scale,
+    k_seqlen,
+    past_len,
+    LAST_K_BLOCK: tl.constexpr,
+    BLOCK_M_LOADING: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    D_HEAD: tl.constexpr,
+    EVEN_D: tl.constexpr,
+    M_LT_N: tl.constexpr,
+):
+    k_block_id = tl.load(layout_col_ptr + off_h * layout_col_stride_h +
+                         k_block_col_idx * layout_col_stride_m).to(tl.int32)
+    start_n = k_block_id * BLOCK_N
+    if LAST_K_BLOCK:
+        if EVEN_D:
+            k = tl.load(
+                k_ptrs + start_n * stride_kt,
+                mask=offs_n[None, :] + start_n < k_seqlen,
+            )
+        else:
+            k = tl.load(
+                k_ptrs + start_n * stride_kt,
+                mask=(offs_n[None, :] + start_n < k_seqlen) &
+                (offs_d[:, None] < D_HEAD),
+            )
+    else:
+        if EVEN_D:
+            k = tl.load(k_ptrs + start_n * stride_kt)
+        else:
+            k = tl.load(k_ptrs + start_n * stride_kt,
+                        mask=offs_d[:, None] < D_HEAD)
+
+    qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32)
+    qk += tl.dot(q, k)
+    qk *= sm_scale
+
+    # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N
+    if LAST_K_BLOCK | M_LT_N:
+        qk += tl.where(
+            offs_m[:, None] + past_len >= (start_n + offs_n[None, :]),
+            0,
+            float("-inf"),
+        )
+
+    # flash-attn2
+    m_ij = tl.maximum(m_i, tl.max(qk, 1))
+    p = tl.math.exp2(qk - m_ij[:, None])
+    l_ij = tl.sum(p, 1)
+    alpha = tl.math.exp2(m_i - m_ij)
+    acc = acc * alpha[:, None]
+    # update m_i
+    m_i = m_ij
+    l_i = l_i * alpha + l_ij
+
+    p = p.to(Q.dtype.element_ty)
+    # update acc
+    if LAST_K_BLOCK:
+        if EVEN_D:
+            v = tl.load(
+                v_ptrs + start_n * stride_vt,
+                mask=offs_n[:, None] + start_n < k_seqlen,
+            )
+        else:
+            v = tl.load(
+                v_ptrs + start_n * stride_vt,
+                mask=(offs_n[:, None] + start_n < k_seqlen) &
+                (offs_d[None, :] < D_HEAD),
+            )
+    else:
+        if EVEN_D:
+            v = tl.load(v_ptrs + start_n * stride_vt)
+        else:
+            v = tl.load(v_ptrs + start_n * stride_vt,
+                        mask=offs_d[None, :] < D_HEAD)
+
+    acc += tl.dot(p, v)
+
+    return acc, l_i, m_i
+
+
+@triton.heuristics({
+    "M_LT_N":
+    lambda kwargs: kwargs["BLOCK_M"] < kwargs["BLOCK_N"],
+})
+@triton.jit
+def _fwd_kernel_batch_inference(
+    Q,
+    K,
+    V,
+    Out,
+    sm_scale,
+    q_batch_starts,
+    q_batch_ends,
+    k_batch_starts,
+    k_batch_ends,
+    q_batch_ids,
+    q_start_sids,
+    stride_qb,
+    stride_qt,
+    stride_qh,
+    stride_qd,
+    stride_kb,
+    stride_kt,
+    stride_kh,
+    stride_kd,
+    stride_vb,
+    stride_vt,
+    stride_vh,
+    stride_vd,
+    stride_ob,
+    stride_ot,
+    stride_oh,
+    stride_od,
+    layout_crow_ptr,
+    layout_col_ptr,
+    layout_crow_stride_h,
+    layout_crow_stride_m,
+    layout_col_stride_h,
+    layout_col_stride_m,
+    q_k_ratio,
+    HAS_BATCH_DIM: tl.constexpr,
+    D_HEAD: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_M_LOADING: tl.constexpr,
+    EVEN_D: tl.constexpr,
+    M_LT_N: tl.constexpr,
+):
+    """
+    NOTATION:
+    pid: position id
+    sid: storage id
+    sbid: storage block id
+    pbid: position block id
+    offs_m, offs_n: storage offsets of m-dim(q, row) and n-dim(k, col)
+    TODO:
+    Optimize grouped-attn
+    """
+    off_zm = tl.program_id(0)
+    off_h = tl.program_id(1)
+
+    off_h_for_kv = off_h // q_k_ratio
+
+    if HAS_BATCH_DIM:
+        off_z = tl.program_id(2)
+        Q += off_z * stride_qb
+        K += off_z * stride_kb
+        V += off_z * stride_vb
+        Out += off_z * stride_ob
+        start_m = off_zm
+        q_start_sid = start_m * BLOCK_M  # always 0 for decoding
+    else:
+        off_z = tl.load(q_batch_ids + off_zm).to(tl.int32)  # [0, 0, 0, 1]
+        q_start_sid = tl.load(q_start_sids + off_zm)
+        start_m = q_start_sid // BLOCK_M  # q_sbid
+
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M_LOADING)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+
+    q_cu_start = tl.load(q_batch_starts + off_z).to(tl.int32)
+    q_seqlen = tl.load(q_batch_ends + off_z).to(tl.int32) - q_cu_start
+    k_cu_start = tl.load(k_batch_starts + off_z).to(tl.int32)
+    k_seqlen = tl.load(k_batch_ends + off_z).to(tl.int32) - k_cu_start
+    past_len = k_seqlen - q_seqlen
+
+    Q += q_cu_start * stride_qt + off_h * stride_qh
+    K += k_cu_start * stride_kt + off_h_for_kv * stride_kh
+    V += k_cu_start * stride_vt + off_h_for_kv * stride_vh
+    Out += q_cu_start * stride_ot + off_h * stride_oh
+
+    q_pbid = (past_len + q_start_sid) // BLOCK_M
+
+    if EVEN_D:
+        q = tl.load(
+            Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
+            mask=offs_m[:, None] < q_seqlen,
+        )
+    else:
+        q = tl.load(
+            Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
+            mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD),
+            other=0,
+        )
+
+    sparse_crow_ptr = (layout_crow_ptr + off_h * layout_crow_stride_h +
+                       q_pbid * layout_crow_stride_m)
+
+    # TODO: load at once, with any Triton version
+    # that supports `tl.split`, e.g., Triton 3.0
+    k_block_start = tl.load(sparse_crow_ptr).to(tl.int32)
+    k_block_end = tl.load(sparse_crow_ptr + 1).to(tl.int32)
+
+    m_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M_LOADING, BLOCK_D], dtype=tl.float32)
+
+    k_ptrs = K + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd
+    v_ptrs = V + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd
+
+    sm_scale *= (
+        1.44269504  # 1/log2 as we use base2 for exponential and logarithm
+    )
+
+    for k_block_col_idx in range(k_block_start, k_block_end - 1):
+        acc, l_i, m_i = _fwd_kernel_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            Q,
+            k_block_col_idx,
+            layout_col_ptr,
+            layout_col_stride_h,
+            layout_col_stride_m,
+            k_ptrs,
+            v_ptrs,
+            off_h,
+            offs_m,
+            offs_n,
+            offs_d,
+            stride_kt,
+            stride_vt,
+            sm_scale,
+            k_seqlen,
+            past_len,
+            False,
+            BLOCK_M_LOADING,
+            BLOCK_N,
+            D_HEAD,
+            EVEN_D,
+            M_LT_N,
+        )
+
+    acc, l_i, m_i = _fwd_kernel_inner(
+        acc,
+        l_i,
+        m_i,
+        q,
+        Q,
+        k_block_end - 1,
+        layout_col_ptr,
+        layout_col_stride_h,
+        layout_col_stride_m,
+        k_ptrs,
+        v_ptrs,
+        off_h,
+        offs_m,
+        offs_n,
+        offs_d,
+        stride_kt,
+        stride_vt,
+        sm_scale,
+        k_seqlen,
+        past_len,
+        True,
+        BLOCK_M_LOADING,
+        BLOCK_N,
+        D_HEAD,
+        EVEN_D,
+        M_LT_N,
+    )
+
+    # flash-attn 2
+    m_i += tl.math.log2(l_i)
+    acc = acc / l_i[:, None]
+
+    # write output
+    if EVEN_D:
+        tl.store(
+            Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od,
+            acc,
+            mask=offs_m[:, None] < q_seqlen,
+        )
+    else:
+        tl.store(
+            Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od,
+            acc,
+            mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD),
+        )
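
The inner loop above is the flash-attention-2 streaming softmax carried out in base-2 arithmetic. A plain-torch reference of the same per-block update (a sketch only; the causal masking applied on the diagonal block is omitted):

# Reference sketch of the streaming-softmax update performed per k-block.
import torch


def stream_block(acc, l_i, m_i, q, k_blk, v_blk, sm_scale):
    # The kernel folds 1/log(2) into sm_scale (`sm_scale *= 1.44269504`)
    # so it can use exp2/log2 throughout.
    qk = (q @ k_blk.T) * (sm_scale * 1.44269504)
    m_ij = torch.maximum(m_i, qk.max(dim=-1).values)   # new running max
    p = torch.exp2(qk - m_ij[:, None])
    alpha = torch.exp2(m_i - m_ij)                     # rescale old state
    acc = acc * alpha[:, None] + p @ v_blk
    l_i = l_i * alpha + p.sum(dim=-1)
    return acc, l_i, m_ij

# Epilogue after the last block, as in the kernel: out = acc / l_i[:, None]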

+ 235 - 0
aphrodite/attention/ops/blocksparse_attention/interface.py

@@ -0,0 +1,235 @@
+import math
+
+import torch
+
+from aphrodite.attention.ops.blocksparse_attention.utils import (
+    dense_to_crow_col, get_head_sliding_step, get_sparse_attn_mask)
+from aphrodite.common.utils import is_cpu, is_hip
+
+IS_COMPUTE_8_OR_ABOVE = (torch.cuda.is_available()
+                         and torch.cuda.get_device_capability()[0] >= 8)
+
+if IS_COMPUTE_8_OR_ABOVE:
+    from aphrodite.attention.ops.blocksparse_attention.blocksparse_attention_kernel import \
+        blocksparse_flash_attn_varlen_fwd  # noqa: E501
+
+
+class LocalStridedBlockSparseAttn(torch.nn.Module):
+
+    def __init__(
+        self,
+        n_heads,
+        max_seqlen,
+        local_blocks,
+        vert_stride,
+        block_size,
+        device=None,
+        dtype=None,
+        homo_head=False,
+        active_head_range=None,
+        q_block_size=None,
+        use_spda=None,
+    ):
+        super().__init__()
+        if use_spda is None:
+            use_spda = is_hip() or is_cpu() or not \
+                       IS_COMPUTE_8_OR_ABOVE
+        device = device or (torch.cuda.current_device()
+                            if torch.cuda.is_available() else "cpu")
+        device = torch.device(device)
+        # NOTE: the aphrodite CPU backend supports BF16 instead of FP16.
+        dtype = dtype or (torch.bfloat16 if IS_COMPUTE_8_OR_ABOVE
+                          or device.type == "cpu" else torch.half)
+
+        self.n_heads = n_heads
+        self.max_seqlen = max_seqlen
+        self.local_blocks = local_blocks
+        self.vert_stride = vert_stride
+        self.use_spda = use_spda
+        self.dtype = dtype
+        self.device = device
+        self.block_size = block_size
+        self.q_block_size = q_block_size
+        self.homo_head = homo_head
+        self.active_head_range = active_head_range
+        self.head_sliding_step = get_head_sliding_step(n_heads, vert_stride,
+                                                       homo_head)
+
+        sparse_layout, sparse_pattern, self.dense_attn_mask = (
+            self.get_attn_pattern(dtype, device))
+
+        if q_block_size is not None and q_block_size != block_size:
+            if q_block_size > block_size:
+                assert q_block_size % block_size == 0
+                blocks_to_merge = q_block_size // block_size
+                shape = sparse_pattern.shape
+                sparse_pattern = sparse_pattern.view(shape[0], -1,
+                                                     blocks_to_merge,
+                                                     shape[-1])
+                sparse_pattern = sparse_pattern.sum(2)
+                sparse_layout = dense_to_crow_col(sparse_pattern)
+            else:
+                raise ValueError(
+                    "Does not support smaller q_block_size. It will be slower."
+                )
+
+        self.sparse_layout = sparse_layout
+
+    def get_attn_pattern(self, dtype, device):
+        sparse_layout, sparse_pattern, dense_attn_mask = get_sparse_attn_mask(
+            self.n_heads,
+            self.max_seqlen,
+            self.max_seqlen,
+            dtype,
+            device,
+            block_size=self.block_size,
+            local_blocks=self.local_blocks,
+            vert_stride=self.vert_stride,
+            homo_head=self.homo_head,
+            return_dense=self.use_spda,
+            dense_mask_type="bias",
+        )
+        if (not self.homo_head) and (self.active_head_range is not None):
+            assert isinstance(self.active_head_range, tuple)
+            assert (len(self.active_head_range) == 2)
+            h_start, h_end = self.active_head_range
+            sparse_layout = tuple(x[h_start:h_end] for x in sparse_layout)
+            if self.use_spda:
+                dense_attn_mask = dense_attn_mask[h_start:h_end]
+        return sparse_layout, sparse_pattern, dense_attn_mask
+
+    def varlen_attn(self,
+                    q,
+                    k,
+                    v,
+                    cu_seqlens_k,
+                    cu_seqlens_q=None,
+                    sm_scale=None):
+        """
+        q, k, v: shape = (num_tokens, num_heads_q/kv, head_size).
+        Supports grouped attention, where `q[:, i*r:(i*r + r)]`
+        corresponds to `k[:, i]`, and `r` is the q/k head ratio.
+        cu_seqlens_k: shape=(batch_size + 1,),
+        indicating the segments of the samples,
+        e.g., `k[cu_seqlens_k[i]:cu_seqlens_k[i+1]]` is the keys of sample i.
+        cu_seqlens_q: shape=(batch_size + 1, ).
+        Default None: same as cu_seqlens_k for prefilling or
+        [0, 1, .., batch_size] for decoding.
+        The only case you need to specify it is when q is a mix of
+        prefilling and decoding.
+        sm_scale: softmax scale, defaults to 1/sqrt(head_size).
+        return: tensor of the same shape as q.
+        """
+        assert IS_COMPUTE_8_OR_ABOVE, (
+            "Requires compute capability of 8 or above (Ampere or newer) "
+            "to use the Triton kernel."
+        )
+
+        sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1))
+
+        return blocksparse_flash_attn_varlen_fwd(
+            q,
+            k,
+            v,
+            cu_seqlens_k,
+            cu_seqlens_q,
+            sm_scale,
+            self.sparse_layout,
+            block_size=self.block_size,
+            q_block_size=self.q_block_size,
+            max_seqlen=self.max_seqlen,
+        )
+
+    @staticmethod
+    def transpose_and_pad(x, cu_seqlens, maxlen, head_repeats=1):
+        """
+        :param x: (total_tokens, n_heads, head_size)
+        :return: (batch, n_heads, length, head_size)
+        """
+        x_padded = x.new_empty(
+            len(cu_seqlens) - 1, x.size(1), head_repeats, maxlen, x.size(2))
+        cu_seqlens = cu_seqlens.cpu()
+        for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])):
+            x_padded[i, :, :, :e - s].copy_(x[s:e].transpose(0,
+                                                             1).unsqueeze(1))
+        return x_padded.flatten(1, 2)
+
+    @staticmethod
+    def transpose_and_unpad(x_padded, cu_seqlens):
+        """
+        :param x_padded: (batch, n_heads, length, head_size)
+        :return: (total_tokens, n_heads, head_size)
+        """
+        cu_seqlens = cu_seqlens.cpu()
+        total_n_tokens = cu_seqlens[-1]
+        x = x_padded.new_empty(total_n_tokens, x_padded.size(1),
+                               x_padded.size(3))
+        for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])):
+            x[s:e].copy_(x_padded[i, :, :e - s].transpose(0, 1))
+        return x
+
+    def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
+        """For CPU, V100 or other older GPUs.
+        NOTE: torch SDPA supports nested tensors,
+        but it seems extremely slow. We choose to pad instead.
+        """
+        assert (cu_seqlens_q is None or
+                (cu_seqlens_q
+                 == cu_seqlens_k).all()), "Can only handle prompt with SPDA."
+        assert q.size(0) == k.size(0), "can only handle prompt with SPDA."
+
+        assert q.size(1) % k.size(1) == 0
+        q_k_ratio = q.size(1) // k.size(1)
+        sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1))
+        cu_seqlens = cu_seqlens_k.cpu()
+        maxlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+
+        if (self.dense_attn_mask.dtype != q.dtype
+                or self.dense_attn_mask.device != q.device):
+            _, _, self.dense_attn_mask = self.get_attn_pattern(
+                q.dtype, q.device)
+        attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]
+
+        q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
+        k2, v2 = [
+            self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
+            for x in [k, v]
+        ]
+        spda_output = torch.nn.functional.scaled_dot_product_attention(
+            q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
+        return self.transpose_and_unpad(spda_output, cu_seqlens)
+
+    def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None):
+        """Dispatch to `varlen_attn` (Ampere or newer) or 
+        `self.spda` (CPU, Volta, Turing, or older) based on
+        the type of device used and its cuda compute capability.
+        q, k, v: shape = (num_tokens, num_heads_q/kv, head_size).
+                Supports grouped attention, where `q[:, i*r:(i*r + r)]`
+                corresponds to `k[:, i]`, and `r` is the q/k head ratio.
+        cu_seqlens_k: shape=(batch_size + 1,), indicating the segments of
+                    the samples, e.g., `k[cu_seqlens_k[i]:cu_seqlens_k[i+1]]`
+                    is the keys of sample i.
+        cu_seqlens_q: shape=(batch_size + 1, ).
+                    Default None: same as cu_seqlens_k for prefilling or
+                    [0, 1, .., batch_size] for decoding.
+                    The only case you need to specify it
+                    is when q is a mix of prefilling
+                    and decoding.
+        sm_scale: softmax scale, defaults to 1/sqrt(head_size).
+        return: tensor of the same shape as q.
+        """
+        assert k.dim() == 3
+        if self.use_spda:
+            return self.spda(
+                q,
+                k,
+                v,
+                cu_seqlens_k,
+                cu_seqlens_q=cu_seqlens_q,
+                sm_scale=sm_scale,
+            )
+        return self.varlen_attn(q,
+                                k,
+                                v,
+                                cu_seqlens_k,
+                                cu_seqlens_q=cu_seqlens_q,
+                                sm_scale=sm_scale)
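
A usage sketch of `LocalStridedBlockSparseAttn` for a two-prompt prefill batch, assuming an Ampere-or-newer GPU so the Triton path is taken; head counts, sequence lengths, and sparsity settings below are illustrative.

import torch

from aphrodite.attention.ops.blocksparse_attention.interface import (
    LocalStridedBlockSparseAttn)

n_heads, n_kv_heads, head_size = 32, 8, 128
attn = LocalStridedBlockSparseAttn(n_heads, max_seqlen=8192, local_blocks=16,
                                   vert_stride=8, block_size=64)

seq_lens = [5, 7]  # two prompts packed into one varlen batch
cu_seqlens_k = torch.tensor([0, 5, 12], dtype=torch.int32, device="cuda")
q = torch.randn(sum(seq_lens), n_heads, head_size,
                dtype=torch.bfloat16, device="cuda")
k = torch.randn(sum(seq_lens), n_kv_heads, head_size,
                dtype=torch.bfloat16, device="cuda")
v = torch.randn_like(k)

out = attn(q, k, v, cu_seqlens_k)  # same shape as q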

+ 216 - 0
aphrodite/attention/ops/blocksparse_attention/utils.py

@@ -0,0 +1,216 @@
+# Helper functions for 3D sparse pattern
+# These functions are not optimized and are very inefficient.
+# Avoid calling them too frequently, or use a caching mechanism.
+
+from functools import lru_cache
+
+import torch
+import triton
+from scipy import sparse
+
+
+def dense_to_crow_col(x: torch.Tensor):
+    """Turning a 2D/3D torch tensor (x) to CSR rows/cols indexing.
+    NOTE: col_indices is padded with -1.
+    """
+    device = x.device
+    pad = -1
+    dim = x.dim()
+    assert x.dim() in (2, 3)
+    if x.dim() == 2:
+        x = x[None]
+    x = [sparse.csr_matrix(xi.bool().cpu().numpy()) for xi in x]
+    crows = torch.vstack([torch.from_numpy(xi.indptr) for xi in x])
+    cols = [torch.from_numpy(xi.indices) for xi in x]
+    max_cols = max(len(xi) for xi in cols)
+    cols = [
+        torch.cat([xi, pad + xi.new_zeros(max_cols - xi.shape[0])])
+        for xi in cols
+    ]
+    cols = torch.vstack(cols)
+    if dim == 2:
+        crows = crows[0]
+        cols = cols[0]
+    return crows.to(device), cols.to(device)
+
+
+def crow_col_to_dense(crows: torch.Tensor,
+                      cols: torch.Tensor,
+                      dtype: torch.dtype = torch.float16):
+    dim = crows.dim()
+    if dim == 1:
+        crows = crows[None]
+        cols = cols[None]
+    device = crows.device
+    crows, cols = crows.cpu(), cols.cpu()  # faster in cpu
+    shape = (crows.shape[0], crows.shape[1] - 1, cols.max() + 1)
+    x = torch.zeros(shape, dtype=dtype)
+    for i in range(shape[0]):
+        for j in range(shape[1]):
+            x[i, j, cols[i, crows[i, j]:crows[i, j + 1]]] = 1
+    if dim == 1:
+        x = x[0]
+    return x.to(device)
+
+
+def dense_to_ccol_row(x: torch.Tensor):
+    """Similar, but to CSC format"""
+    x = x.transpose(-2, -1)
+    return dense_to_crow_col(x)
+
+
+def ccol_row_to_dense(ccol: torch.Tensor,
+                      rows: torch.Tensor,
+                      dtype: torch.dtype = torch.float16):
+    return crow_col_to_dense(ccol, rows, dtype).permute(0, 2, 1).contiguous()
+
+
+def _get_sparse_attn_mask_homo_head(
+    q_len: int,
+    max_seqlen: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    block_size: int = 128,
+    local_blocks: int = 4,
+    vert_stride: int = 4,
+    return_dense: bool = False,
+):
+    """
+    :return: a tuple of 3:
+        - tuple of crow_indices and col_indices representing
+            the CSR format.
+        - block dense mask
+        - all-token dense mask (be aware that it can cause
+            OOM if it is too big) if `return_dense==True`,
+            otherwise None.
+    """
+    with torch.no_grad():
+        num_blocks = triton.cdiv(max_seqlen, block_size)
+        q_pos = torch.arange(num_blocks)[:, None]
+        k_pos = torch.arange(num_blocks)[None]
+        mask_vert_strided = (torch.arange(num_blocks) + 1) % vert_stride == 0
+        block_mask_dense = (((q_pos >= k_pos)
+                             & ((q_pos - k_pos < local_blocks)
+                                | mask_vert_strided)).to(device).to(dtype))
+        num_blocks_q = triton.cdiv(q_len, block_size)
+        block_mask_dense_output = (dense_to_crow_col(
+            block_mask_dense[-num_blocks_q:].contiguous()))
+    if return_dense:
+        mask_dense = torch.kron(
+            block_mask_dense,
+            block_mask_dense.new_ones((block_size, block_size)),
+        )
+        causal_mask = torch.tril(torch.ones(
+            max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:]
+        mask_dense = mask_dense[-q_len:, :max_seqlen] * causal_mask
+        return (
+            block_mask_dense_output,
+            block_mask_dense,
+            mask_dense,
+        )
+    else:
+        return (
+            block_mask_dense_output,
+            block_mask_dense,
+            None,
+        )
+
+
+def binary_mask_to_bias(mask_dense: torch.Tensor):
+    mask_dense = 1 - mask_dense
+    mask_dense.masked_fill_(mask_dense.bool(), -torch.inf)
+    return mask_dense
+
+
+def get_head_sliding_step(n_heads: int,
+                          vert_stride: int,
+                          homo_head: bool = False):
+    if homo_head:
+        return 0
+    return max(1, int(vert_stride / n_heads))
+
+
+@lru_cache
+def get_sparse_attn_mask(
+    n_heads: int,
+    q_len: int,
+    max_seqlen: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    block_size: int = 64,
+    local_blocks: int = 4,
+    vert_stride: int = 4,
+    homo_head: bool = True,
+    return_dense: bool = False,
+    dense_mask_type: str = "binary",
+):
+    """
+    :param dense_mask_type: "binary" (0 for skipped tokens, 1 for others)
+        or "bias" (-inf for skipped tokens, 0 for others)
+    :return: a tuple of 3:
+        - tuple of crow_indices and col_indices representing
+            the CSR format.
+        - block dense mask
+        - all-token dense mask (be aware that it can cause OOM if it
+            is too big) if `return_dense==True`, otherwise None.
+    """
+    assert dense_mask_type in ("binary", "bias")
+    if homo_head:
+        with torch.no_grad():
+            (crow, col), block_mask_dense, mask_dense = (
+                _get_sparse_attn_mask_homo_head(
+                    q_len,
+                    max_seqlen,
+                    dtype,
+                    device,
+                    block_size,
+                    local_blocks,
+                    vert_stride,
+                    return_dense,
+                ))
+            crow = crow[None].expand(n_heads, crow.shape[0])
+            col = col[None].expand(n_heads, col.shape[0])
+            if return_dense:
+                mask_dense = mask_dense[None].expand(n_heads,
+                                                     *mask_dense.shape)
+                if dense_mask_type == "bias":
+                    mask_dense = binary_mask_to_bias(mask_dense)
+            return (crow, col), block_mask_dense, mask_dense
+
+    with torch.no_grad():
+        num_blocks = triton.cdiv(max_seqlen, block_size)
+        q_pos = torch.arange(num_blocks)[None, :, None]
+        k_pos = torch.arange(num_blocks)[None, None]
+        head_sliding_step = get_head_sliding_step(n_heads, vert_stride)
+        mask_vert_strided = [
+            (torch.arange(num_blocks) + h * head_sliding_step + 1) %
+            vert_stride == 0 for h in range(n_heads)
+        ]
+        mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1)
+        block_mask_dense = (((q_pos >= k_pos)
+                             & ((q_pos - k_pos < local_blocks)
+                                | mask_vert_strided)).to(device).to(dtype))
+        num_blocks_q = triton.cdiv(q_len, block_size)
+        block_mask_dense_output = block_mask_dense[:, -num_blocks_q:]
+    if return_dense:
+        mask_dense = torch.kron(
+            block_mask_dense,
+            block_mask_dense.new_ones((block_size, block_size)),
+        )
+        causal_mask = torch.tril(torch.ones(
+            max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:]
+        mask_dense = mask_dense[..., -q_len:, :max_seqlen] * causal_mask[None]
+        if dense_mask_type == "bias":
+            mask_dense = binary_mask_to_bias(mask_dense)
+
+        return (
+            dense_to_crow_col(block_mask_dense_output),
+            block_mask_dense,
+            mask_dense,
+        )
+    else:
+        return (
+            dense_to_crow_col(block_mask_dense_output),
+            block_mask_dense,
+            None,
+        )
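
The CSR helpers can be sanity-checked by hand on a tiny block mask; a minimal round-trip sketch:

import torch

from aphrodite.attention.ops.blocksparse_attention.utils import (
    crow_col_to_dense, dense_to_crow_col)

block_mask = torch.tensor([[1, 0, 0],
                           [1, 1, 0],
                           [0, 1, 1]])
crows, cols = dense_to_crow_col(block_mask)
# crows == tensor([0, 1, 3, 5]); cols == tensor([0, 0, 1, 1, 2])
assert torch.equal(crow_col_to_dense(crows, cols).bool(), block_mask.bool())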

+ 23 - 0
aphrodite/attention/ops/paged_attn.py

@@ -92,7 +92,20 @@ class PagedAttention:
         scale: float,
         alibi_slopes: Optional[torch.Tensor],
         kv_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
     ) -> torch.Tensor:
+        if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1:
+            # use blocksparse paged attention
+            block_size = value_cache.size(-1)
+            assert (blocksparse_block_size > 0 and
+                    blocksparse_block_size % block_size == 0), \
+                (f"{blocksparse_block_size=} needs to be a multiple of "
+                 f"{block_size=} used in block_tables.")
+
         output = torch.empty_like(query)
 
         block_size = value_cache.shape[3]
@@ -124,6 +137,11 @@ class PagedAttention:
                 alibi_slopes,
                 kv_cache_dtype,
                 kv_scale,
+                tp_rank,
+                blocksparse_local_blocks,
+                blocksparse_vert_stride,
+                blocksparse_block_size,
+                blocksparse_head_sliding_step,
             )
         else:
             # Run PagedAttention V2.
@@ -156,6 +174,11 @@ class PagedAttention:
                 alibi_slopes,
                 kv_cache_dtype,
                 kv_scale,
+                tp_rank,
+                blocksparse_local_blocks,
+                blocksparse_vert_stride,
+                blocksparse_block_size,
+                blocksparse_head_sliding_step,
             )
         return output
 

+ 10 - 3
aphrodite/attention/selector.py

@@ -5,6 +5,7 @@ from typing import Optional, Type
 
 import torch
 from loguru import logger
+
 from aphrodite.attention.backends.abstract import AttentionBackend
 from aphrodite.common.utils import is_cpu, is_hip
 
@@ -28,7 +29,14 @@ def get_attn_backend(
     dtype: torch.dtype,
     kv_cache_dtype: Optional[str],
     block_size: int,
+    is_blocksparse: bool = False,
 ) -> Type[AttentionBackend]:
     """Determine which attention backend to use and only import
     the selected backend module.
     """
+
+    if is_blocksparse:
+        logger.info("Using BlocksparseFlashAttention backend.")
+        from aphrodite.attention.backends.blocksparse_attn import \
+            BlocksparseFlashAttentionBackend
+        return BlocksparseFlashAttentionBackend
@@ -38,7 +46,6 @@ def get_attn_backend(
     if backend == _Backend.FLASH_ATTN:
         from aphrodite.attention.backends.flash_attn import \
             FlashAttentionBackend  # noqa: F401
-        logger.info("Using FlashAttention backend.")
         return FlashAttentionBackend
     if backend == _Backend.XFORMERS:
         logger.info("Using XFormers backend.")
@@ -136,8 +143,8 @@ def which_attn_to_use(
         try:
             import vllm_flash_attn  # noqa: F401
 
-            from aphrodite.attention.backends.flash_attn import (  # noqa: F401
-                FlashAttentionBackend)
+            from aphrodite.attention.backends.flash_attn import \
+                FlashAttentionBackend  # noqa: F401
 
             supported_sizes = FlashAttentionBackend.get_supported_head_sizes()
             if head_size not in supported_sizes:

+ 1 - 0
aphrodite/endpoints/openai/serving_engine.py

@@ -122,6 +122,7 @@ class OpenAIServing:
                 token_logprob = step_top_logprobs[token_id].logprob
                 token = step_top_logprobs[token_id].decoded_token
                 logprobs.tokens.append(token)
+                # Clamp -inf logprobs (e.g. from tokens masked out in the
+                # model's compute_logits) so the value stays JSON-serializable.
+                token_logprob = max(token_logprob, -9999.0)
                 logprobs.token_logprobs.append(token_logprob)
 
                 if num_output_top_logprobs:
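
Note: the clamp added above protects against -inf logprobs (for example, from
the dummy-token masking in phi3_small.py below). Python's json module would
otherwise serialize them as the non-standard literal -Infinity, which strict
JSON parsers reject. A quick standalone illustration:

    import json

    logprobs = [float("-inf"), -0.42]
    print(json.dumps(logprobs))          # [-Infinity, -0.42]  (not valid JSON)
    print(json.dumps([max(lp, -9999.0) for lp in logprobs]))  # [-9999.0, -0.42]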

+ 1 - 0
aphrodite/modeling/models/__init__.py

@@ -54,6 +54,7 @@ _GENERATION_MODELS = {
     "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
     "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
     "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
+    "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
 }
 
 _EMBEDDING_MODELS = {
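
Note: with the registry entry above, any checkpoint whose config reports the
"Phi3SmallForCausalLM" architecture resolves to phi3_small.py. A minimal usage
sketch, assuming the usual top-level Aphrodite API and a public Phi-3-small
checkpoint (which needs trust_remote_code for its tiktoken tokenizer):

    from aphrodite import LLM, SamplingParams

    llm = LLM(model="microsoft/Phi-3-small-8k-instruct",
              trust_remote_code=True)
    out = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
    print(out[0].outputs[0].text)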

+ 446 - 0
aphrodite/modeling/models/phi3_small.py

@@ -0,0 +1,446 @@
+import math
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+
+from aphrodite.attention import Attention, AttentionMetadata
+from aphrodite.common.config import CacheConfig, LoRAConfig
+from aphrodite.common.sequence import SamplerOutput
+from aphrodite.distributed import (get_tensor_model_parallel_rank,
+                                   get_tensor_model_parallel_world_size)
+from aphrodite.modeling.layers.linear import (MergedColumnParallelLinear,
+                                              QKVParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.logits_processor import LogitsProcessor
+from aphrodite.modeling.layers.rotary_embedding import get_rope
+from aphrodite.modeling.layers.sampler import Sampler
+from aphrodite.modeling.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.quantization.base_config import QuantizationConfig
+
+
+def load_column_parallel_weight(param: torch.nn.Parameter,
+                                loaded_weight: torch.Tensor):
+    tp = get_tensor_model_parallel_world_size()
+    rk = get_tensor_model_parallel_rank()
+    assert param.size(0) * tp == loaded_weight.size(0)
+    s = rk * param.size(0)
+    e = (rk + 1) * param.size(0)
+    loaded_weight = loaded_weight[s:e]
+    assert param.shape == loaded_weight.shape
+    param.data.copy_(loaded_weight)
+
+
+class HeadMajorQKVParallelLinear(QKVParallelLinear):
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor):
+        return load_column_parallel_weight(param, loaded_weight)
+
+
+class HeadMajorColumnParallelLinear(MergedColumnParallelLinear):
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor):
+        return load_column_parallel_weight(param, loaded_weight)
+
+
+@torch.jit.script
+def quick_gelu(x):
+    return x * torch.sigmoid(1.702 * x)
+
+
+@torch.jit.script
+def gegelu(input, limit: Optional[float] = None):
+    a_gelu, a_linear = input[..., ::2], input[..., 1::2]
+    if limit is not None:
+        a_gelu = torch.where(torch.isinf(a_gelu), a_gelu,
+                             a_gelu.clamp(min=None, max=limit))
+        a_linear = torch.where(
+            torch.isinf(a_linear),
+            a_linear,
+            a_linear.clamp(min=-limit, max=limit),
+        )
+    out_gelu = quick_gelu(a_gelu)
+    return out_gelu * (a_linear + 1)
+
+
+class Phi3SmallMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert (self.config.hidden_act == "gegelu"
+                ), "Only `gegelu` is supported for the 4.7 series of models .."
+        self.hidden_size = config.hidden_size
+        self.gegelu_limit = config.gegelu_limit
+        self.intermediate_size = config.intermediate_size
+
+        self.up_proj = HeadMajorColumnParallelLinear(
+            self.hidden_size,
+            2 * [self.intermediate_size],
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+
+    def forward(self, x):
+        gate_up, _ = self.up_proj(x)
+        x = gegelu(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Phi3SmallSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.config = config
+        self.sparse_block_size = config.blocksparse_block_size
+        self.homo_heads = config.blocksparse_homo_head_pattern
+        self.local_blocks = config.blocksparse_num_local_blocks
+        self.vert_stride = config.blocksparse_vert_stride
+
+        assert (config.blocksparse_block_size ==
+                config.blocksparse_triton_kernel_block_size)
+
+        self.hidden_size = config.hidden_size
+        # Number of Query Heads
+        self.num_heads = config.num_attention_heads
+
+        self.head_dim = self.hidden_size // self.num_heads
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # Number of total Key Value Heads before tensor parallel
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
+        if self.tp_size > 1:
+            assert self.num_key_value_heads % self.tp_size == 0
+        self.num_kv_heads_per_partition = max(
+            1, self.num_key_value_heads // self.tp_size)
+        self.num_heads_per_partition = self.num_heads // self.tp_size
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_embedding_base = config.rope_embedding_base
+        self.rope_position_scale = config.rope_position_scale
+        self.is_causal = True
+
+        norm_factor = None
+        if config.mup_use_scaling:
+            norm_factor = self.head_dim / config.mup_attn_multiplier
+        else:
+            norm_factor = math.sqrt(self.head_dim)
+        self.scale = 1 / norm_factor
+
+        self.query_key_value = HeadMajorQKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_key_value_heads,
+            bias=True,
+            quant_config=quant_config,
+        )
+
+        self.dense = RowParallelLinear(self.hidden_size,
+                                       self.hidden_size,
+                                       bias=True,
+                                       quant_config=quant_config)
+
+        if getattr(self.config, "rope_scaling", None) is not None:
+            rope_scaling = self.config.rope_scaling
+            for key in rope_scaling:
+                if isinstance(rope_scaling[key], list):
+                    rope_scaling[key] = tuple(rope_scaling[key])
+
+            if "factor" not in rope_scaling:
+                rope_scaling["factor"] = self.rope_position_scale
+        else:
+            rope_scaling = {
+                "type": "linear",
+                "factor": self.rope_position_scale,
+            }
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_embedding_base,
+            rope_scaling=rope_scaling,
+        )
+
+        # blocksparse params
+        self.blocksparse_block_size = config.blocksparse_block_size
+        self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = config.blocksparse_vert_stride
+
+        use_dense_attn = (getattr(self.config,
+                                  "dense_attention_every_n_layers", None)
+                          and (self.layer_idx + 1) %
+                          self.config.dense_attention_every_n_layers == 0)
+
+        bs_params = None
+        if not use_dense_attn:
+            bs_params = {
+                'max_seqlen': self.max_position_embeddings,
+                'num_heads': self.num_heads_per_partition,
+                "num_kv_heads": self.num_kv_heads_per_partion,
+                "block_size": self.sparse_block_size,
+                "local_blocks": self.local_blocks,
+                "vert_stride": self.vert_stride,
+                "homo_head": self.homo_heads
+            }
+
+        self.attn = Attention(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads_per_partition,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            blocksparse_params=bs_params,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+               Optional[Tuple[torch.Tensor]]]:
+        qkv, _ = self.query_key_value(hidden_states)
+
+        qkv = qkv.view(qkv.shape[:-1] +
+                       (-1, (self.num_q_per_kv + 2), self.head_dim))
+        q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2)
+
+        # NOTE: the rotary embedding expects q/k flattened to 2D
+        # ([num_tokens, num_heads * head_dim]).
+        # TODO: allow 3D QK for rotary forward
+        q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
+        k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
+        v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata=attn_metadata)
+        output, _ = self.dense(attn_output)
+
+        return output
+
+
+class Phi3SmallDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Phi3SmallSelfAttention(config,
+                                                layer_idx,
+                                                cache_config=cache_config,
+                                                quant_config=quant_config)
+        self.mlp = Phi3SmallMLP(config, quant_config)
+
+        self.input_layernorm = nn.LayerNorm(config.hidden_size,
+                                            eps=config.layer_norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Phi3SmallModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.hidden_size)
+        self.mup_embedding_multiplier = config.mup_embedding_multiplier
+        self.layers = nn.ModuleList([
+            Phi3SmallDecoderLayer(config, layer_idx, cache_config,
+                                  quant_config)
+            for layer_idx in range(config.num_hidden_layers)
+        ])
+
+        self.final_layernorm = nn.LayerNorm(config.hidden_size,
+                                            eps=config.layer_norm_epsilon)
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata = None,
+    ):
+        hidden_states = self.embed_tokens(input_ids)
+        if (self.mup_embedding_multiplier is not None
+                and self.mup_embedding_multiplier > 0.0):
+            hidden_states = hidden_states * self.mup_embedding_multiplier
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                attn_metadata,
+            )
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class Phi3SmallForCausalLM(nn.Module):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(
+        self,
+        config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Phi3SmallModel(config, cache_config, quant_config)
+        self.vocab_size = config.vocab_size
+        self.mup_width_multiplier = config.mup_width_multiplier
+        self.lm_head = ParallelLMHead(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+
+        # Tokens in the tiktoken vocab that the model does not use;
+        # their logits are masked to -inf in compute_logits.
+        if hasattr(config, 'dummy_token_indices'):
+            device = self.lm_head.weight.device
+            self.register_buffer('dummy_token_indices',
+                                 torch.LongTensor(
+                                     config.dummy_token_indices).to(device),
+                                 persistent=False)
+        else:
+            self.dummy_token_indices = None
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, value):
+        self.lm_head = value
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+                                       sampling_metadata)
+        if self.dummy_token_indices is not None and logits is not None:
+            logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
+        return logits
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        output_hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+        )
+        return output_hidden_states
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+
+        next_tokens = self.sampler(logits / self.mup_width_multiplier,
+                                   sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+        self.lm_head.weight.data.copy_(self.model.embed_tokens.weight.data)
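
Note: gegelu above is a gated GELU whose gate and linear halves are interleaved
along the last dimension, with optional clamping controlled by gegelu_limit. A
small self-contained reference of the same computation (illustrative only; the
commit additionally uses torch.where to leave +/-inf values unclamped, omitted
here for brevity):

    from typing import Optional

    import torch

    def quick_gelu(x: torch.Tensor) -> torch.Tensor:
        # Sigmoid approximation of GELU.
        return x * torch.sigmoid(1.702 * x)

    def gegelu_reference(x: torch.Tensor,
                         limit: Optional[float] = None) -> torch.Tensor:
        # Even-indexed features form the GELU gate, odd-indexed features the
        # linear branch, so the output halves the last dimension.
        a_gelu, a_linear = x[..., ::2], x[..., 1::2]
        if limit is not None:
            a_gelu = a_gelu.clamp(max=limit)
            a_linear = a_linear.clamp(min=-limit, max=limit)
        return quick_gelu(a_gelu) * (a_linear + 1)

    x = torch.randn(2, 3, 8)
    print(gegelu_reference(x, limit=10.0).shape)  # torch.Size([2, 3, 4])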

+ 170 - 59
kernels/attention/attention_kernels.cu

@@ -86,6 +86,7 @@ inline __device__ float block_sum(float* red_smem, float sum) {
 // Grid: (num_heads, num_seqs, max_num_partitions).
 template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
           int NUM_THREADS, aphrodite::Fp8KVCacheDataType KV_DTYPE,
+          bool IS_BLOCK_SPARSE,
           int PARTITION_SIZE = 0>  // Zero means no partitioning.
 __device__ void paged_attention_kernel(
     float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
@@ -105,7 +106,9 @@ __device__ void paged_attention_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float kv_scale) {
+    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
   const int seq_idx = blockIdx.y;
   const int partition_idx = blockIdx.z;
   const int max_num_partitions = gridDim.z;
@@ -172,8 +175,8 @@ __device__ void paged_attention_kernel(
   // Each thread in a thread group has a different part of the query.
   // For example, if the the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the query, and the second thread
-  // has 1, 5, 9, ... th vectors of the query, and so on. NOTE: Because q is
-  // split from a qkv tensor, it may not be contiguous.
+  // has 1, 5, 9, ... th vectors of the query, and so on. NOTE: Because
+  // q is split from a qkv tensor, it may not be contiguous.
   const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
   __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
 #pragma unroll
@@ -183,8 +186,8 @@ __device__ void paged_attention_kernel(
     q_vecs[thread_group_offset][i] =
         *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
   }
-  __syncthreads();  // TODO: possible speedup if this is replaced with a memory
-                    // wall right before we use q_vecs
+  __syncthreads();  // TODO: possible speedup if this is replaced with a
+                    // memory fence right before we use q_vecs
 
   // Memory planning.
   extern __shared__ char shared_mem[];
@@ -203,11 +206,55 @@ __device__ void paged_attention_kernel(
   // Each thread group in a warp fetches a key from the block, and computes
   // dot product with the query.
   const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+
+  // blocksparse specific vars
+  int bs_block_offset;
+  int q_bs_block_id;
+  if constexpr (IS_BLOCK_SPARSE) {
+    // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
+    // blocksparse_block_size);
+    q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
+    if (blocksparse_head_sliding_step >= 0)
+      // sliding on q heads
+      bs_block_offset =
+          (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
+    else
+      // sliding on kv heads
+      bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
+                            (-blocksparse_head_sliding_step) +
+                        1;
+  }
+
   for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
        block_idx += NUM_WARPS) {
-    // NOTE: The block number is stored in int32. However, we cast it to int64
-    // because int32 can lead to overflow when this variable is multiplied by
-    // large numbers (e.g., kv_block_stride).
+    // NOTE: The block number is stored in int32. However, we cast it to
+    // int64 because int32 can lead to overflow when this variable is multiplied
+    // by large numbers (e.g., kv_block_stride).
+    // For blocksparse attention: skip computation on blocks that are not
+    // attended
+    if constexpr (IS_BLOCK_SPARSE) {
+      const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+      const bool is_remote =
+          ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
+      const bool is_local =
+          (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
+      if (!is_remote && !is_local) {
+        for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+          const int physical_block_offset =
+              (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+          const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+
+          if (thread_group_offset == 0) {
+            // NOTE: assign a very negative value (-FLT_MAX) to skipped
+            // tokens so they do not contribute to the softmax normalizer
+            // (sumexp). These logits are never used in sum(softmax * v)
+            // because the whole block is skipped.
+            logits[token_idx - start_token_idx] = -FLT_MAX;
+          }
+        }
+        continue;
+      }
+    }
     const int64_t physical_block_number =
         static_cast<int64_t>(block_table[block_idx]);
 
@@ -333,9 +380,18 @@ __device__ void paged_attention_kernel(
   zero(zero_value);
   for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
        block_idx += NUM_WARPS) {
-    // NOTE: The block number is stored in int32. However, we cast it to int64
-    // because int32 can lead to overflow when this variable is multiplied by
-    // large numbers (e.g., kv_block_stride).
+    // NOTE: The block number is stored in int32. However, we cast it to
+    // int64 because int32 can lead to overflow when this variable is multiplied
+    // by large numbers (e.g., kv_block_stride).
+    // For blocksparse attention: skip computation on blocks that are not
+    // attended
+    if constexpr (IS_BLOCK_SPARSE) {
+      int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+      if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) &&
+          !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) {
+        continue;
+      }
+    }
     const int64_t physical_block_number =
         static_cast<int64_t>(block_table[block_idx]);
     const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
@@ -363,9 +419,9 @@ __device__ void paged_attention_kernel(
                                                                     kv_scale);
         }
         if (block_idx == num_seq_blocks - 1) {
-          // NOTE: When v_vec contains the tokens that are out of the context,
-          // we should explicitly zero out the values since they may contain
-          // NaNs.
+          // NOTE: When v_vec contains the tokens that are out of the
+          // context, we should explicitly zero out the values since they may
+          // contain NaNs.
           scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
 #pragma unroll
           for (int j = 0; j < V_VEC_SIZE; j++) {
@@ -388,8 +444,8 @@ __device__ void paged_attention_kernel(
     accs[i] = acc;
   }
 
-  // NOTE: A barrier is required because the shared memory space for logits
-  // is reused for the output.
+  // NOTE: A barrier is required because the shared memory space for
+  // logits is reused for the output.
   __syncthreads();
 
   // Perform reduction across warps.
@@ -441,8 +497,8 @@ __device__ void paged_attention_kernel(
 
 // Grid: (num_heads, num_seqs, 1).
 template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
-          int NUM_THREADS,
-          aphrodite::Fp8KVCacheDataType KV_DTYPE>
+          int NUM_THREADS, aphrodite::Fp8KVCacheDataType KV_DTYPE,
+          bool IS_BLOCK_SPARSE>
 __global__ void paged_attention_v1_kernel(
     scalar_t* __restrict__ out,           // [num_seqs, num_heads, head_size]
     const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
@@ -457,18 +513,23 @@ __global__ void paged_attention_v1_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float kv_scale) {
+    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
-                         KV_DTYPE>(
+                         KV_DTYPE, IS_BLOCK_SPARSE>(
       /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
       v_cache, num_kv_heads, scale, block_tables, seq_lens,
       max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
-      kv_head_stride, kv_scale);
+      kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
+      blocksparse_vert_stride, blocksparse_block_size,
+      blocksparse_head_sliding_step);
 }
 
 // Grid: (num_heads, num_seqs, max_num_partitions).
 template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
           int NUM_THREADS, aphrodite::Fp8KVCacheDataType KV_DTYPE,
+          bool IS_BLOCK_SPARSE,
           int PARTITION_SIZE>
 __global__ void paged_attention_v2_kernel(
     float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
@@ -488,12 +549,16 @@ __global__ void paged_attention_v2_kernel(
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
     const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float kv_scale) {
+    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
   paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
-                         KV_DTYPE, PARTITION_SIZE>(
+                         KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE>(
       exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
       block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
-      kv_block_stride, kv_head_stride, kv_scale);
+      kv_block_stride, kv_head_stride, kv_scale, tp_rank,
+      blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
+      blocksparse_head_sliding_step);
 }
 
 // Grid: (num_heads, num_seqs).
@@ -605,27 +670,34 @@ __global__ void paged_attention_v2_reduce_kernel(
 
 }  // namespace aphrodite
 
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                \
-  APHRODITE_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                \
-      ((void*)aphrodite::paged_attention_v1_kernel<                         \
-          T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE>),       \
-      shared_mem_size);                                                     \
-  aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,   \
-                                       NUM_THREADS, KV_DTYPE>               \
-      <<<grid, block, shared_mem_size, stream>>>(                           \
-          out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
-          scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,    \
-          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,      \
-          kv_scale);
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                   \
+  APHRODITE_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                   \
+      ((void*)aphrodite::paged_attention_v1_kernel<                            \
+          T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE,            \
+          IS_BLOCK_SPARSE>),                                                   \
+      shared_mem_size);                                                        \
+  aphrodite::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,      \
+                                       NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE> \
+      <<<grid, block, shared_mem_size, stream>>>(                              \
+          out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads,    \
+          scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,       \
+          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,         \
+          kv_scale, tp_rank, blocksparse_local_blocks,                         \
+          blocksparse_vert_stride, blocksparse_block_size,                     \
+          blocksparse_head_sliding_step);
 
 // TODO: Tune NUM_THREADS.
 template <typename T, typename CACHE_T, int BLOCK_SIZE,
-          aphrodite::Fp8KVCacheDataType KV_DTYPE, int NUM_THREADS = 128>
+          aphrodite::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
+          int NUM_THREADS = 128>
 void paged_attention_v1_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale) {
+    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
+    const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -692,23 +764,36 @@ void paged_attention_v1_launcher(
   }
 }
 
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE)                   \
-  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE>(             \
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)  \
+  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,              \
+                              IS_BLOCK_SPARSE>(                              \
       out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
-      seq_lens, max_seq_len, alibi_slopes, kv_scale);
+      seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank,                \
+      blocksparse_local_blocks, blocksparse_vert_stride,                     \
+      blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, KV_DTYPE)       \
+  switch (is_block_sparse) {                                              \
+    case true:                                                            \
+      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, true);           \
+      break;                                                              \
+    case false:                                                           \
+      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, false);          \
+      break;                                                              \
+  }
 
 // NOTE: To reduce the compilation time, we omitted block sizes
 // 1, 2, 4, 64, 128, 256.
 #define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
   switch (block_size) {                                           \
     case 8:                                                       \
-      CALL_V1_LAUNCHER(T, CACHE_T, 8, KV_DTYPE);                  \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
       break;                                                      \
     case 16:                                                      \
-      CALL_V1_LAUNCHER(T, CACHE_T, 16, KV_DTYPE);                 \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
       break;                                                      \
     case 32:                                                      \
-      CALL_V1_LAUNCHER(T, CACHE_T, 32, KV_DTYPE);                 \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
       break;                                                      \
     default:                                                      \
       TORCH_CHECK(false, "Unsupported block size: ", block_size); \
@@ -728,18 +813,26 @@ void paged_attention_v1(
     torch::Tensor& seq_lens,      // [num_seqs]
     int block_size, int max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale){
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+  const bool is_block_sparse = (blocksparse_vert_stride > 1);
+
+  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+                             CALL_V1_LAUNCHER_BLOCK_SIZE)
+}
 
-    DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
-                               CALL_V1_LAUNCHER_BLOCK_SIZE)}
 #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                   \
   aphrodite::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,      \
-                                       NUM_THREADS, KV_DTYPE, PARTITION_SIZE>  \
+                                       NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE, \
+                                       PARTITION_SIZE>                         \
       <<<grid, block, shared_mem_size, stream>>>(                              \
           exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
           value_cache_ptr, num_kv_heads, scale, block_tables_ptr,              \
           seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
-          kv_block_stride, kv_head_stride, kv_scale);                          \
+          kv_block_stride, kv_head_stride, kv_scale, tp_rank,                  \
+          blocksparse_local_blocks, blocksparse_vert_stride,                   \
+          blocksparse_block_size, blocksparse_head_sliding_step);              \
   aphrodite::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS,       \
                                               PARTITION_SIZE>                  \
       <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(                \
@@ -747,14 +840,17 @@ void paged_attention_v1(
           max_num_partitions);
 
 template <typename T, typename CACHE_T, int BLOCK_SIZE,
-          aphrodite::Fp8KVCacheDataType KV_DTYPE, int NUM_THREADS = 128,
-          int PARTITION_SIZE = 512>
+          aphrodite::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
+          int NUM_THREADS = 128, int PARTITION_SIZE = 512>
 void paged_attention_v2_launcher(
     torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
     torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale) {
+    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
+    const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
   int num_seqs = query.size(0);
   int num_heads = query.size(1);
   int head_size = query.size(2);
@@ -825,24 +921,36 @@ void paged_attention_v2_launcher(
   }
 }
 
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE)                    \
-  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE>(              \
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)   \
+  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,               \
+                              IS_BLOCK_SPARSE>(                               \
       out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,      \
       num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
-      kv_scale);
+      kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride,   \
+      blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, KV_DTYPE)       \
+  switch (is_block_sparse) {                                              \
+    case true:                                                            \
+      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, true);           \
+      break;                                                              \
+    case false:                                                           \
+      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, false);          \
+      break;                                                              \
+  }
 
 // NOTE: To reduce the compilation time, we omitted block sizes
 // 1, 2, 4, 64, 128, 256.
 #define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
   switch (block_size) {                                           \
     case 8:                                                       \
-      CALL_V2_LAUNCHER(T, CACHE_T, 8, KV_DTYPE);                  \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
       break;                                                      \
     case 16:                                                      \
-      CALL_V2_LAUNCHER(T, CACHE_T, 16, KV_DTYPE);                 \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
       break;                                                      \
     case 32:                                                      \
-      CALL_V2_LAUNCHER(T, CACHE_T, 32, KV_DTYPE);                 \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
       break;                                                      \
     default:                                                      \
       TORCH_CHECK(false, "Unsupported block size: ", block_size); \
@@ -866,7 +974,10 @@ void paged_attention_v2(
     torch::Tensor& seq_lens,      // [num_seqs]
     int block_size, int max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale) {
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+  const bool is_block_sparse = (blocksparse_vert_stride > 1);
   DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
                              CALL_V2_LAUNCHER_BLOCK_SIZE)
 }
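
Note: the bs_block_offset computed in the IS_BLOCK_SPARSE branch above decides
which strided blocks each head keeps. A Python mirror of that computation for
reference (illustrative; the sign convention follows BlocksparseParams:
head_sliding_step >= 0 slides across query heads, while a negative value slides
across KV heads so that all query heads in a group share one offset):

    def bs_block_offset(tp_rank: int, head_idx: int, kv_head_idx: int,
                        num_heads: int, num_kv_heads: int,
                        head_sliding_step: int) -> int:
        if head_sliding_step >= 0:
            # Sliding over query heads: each global q head gets its own offset.
            return (tp_rank * num_heads + head_idx) * head_sliding_step + 1
        # Sliding over KV heads: the offset is shared within a q-head group.
        return (tp_rank * num_kv_heads + kv_head_idx) * (-head_sliding_step) + 1

    # With blocksparse_vert_stride=8 and head_sliding_step=1, head 0 keeps
    # blocks where (k_block + 1) % 8 == 0, head 1 keeps (k_block + 2) % 8 == 0.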

+ 21 - 16
kernels/cpu/attention.cpp

@@ -415,14 +415,17 @@ void paged_attention_v1_impl_launcher(
   }
 }  // namespace
 
-void paged_attention_v1(torch::Tensor& out, torch::Tensor& query,
-                        torch::Tensor& key_cache, torch::Tensor& value_cache,
-                        int num_kv_heads, float scale,
-                        torch::Tensor& block_tables, torch::Tensor& seq_lens,
-                        int block_size, int max_seq_len,
-                        const c10::optional<torch::Tensor>& alibi_slopes,
-                        const std::string& kv_cache_dtype, float kv_scale) {
+void paged_attention_v1(
+    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   TORCH_CHECK(kv_scale == 1.0f);
+  TORCH_CHECK(blocksparse_vert_stride <= 1,
+              "CPU backend does not support blocksparse attention yet.");
   APHRODITE_DISPATCH_FLOATING_TYPES(
       query.scalar_type(), "paged_attention_v1_impl", [&] {
         CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
@@ -726,16 +729,18 @@ void paged_attention_v2_impl_launcher(
   }
 }  // namespace
 
-void paged_attention_v2(torch::Tensor& out, torch::Tensor& exp_sums,
-                        torch::Tensor& max_logits, torch::Tensor& tmp_out,
-                        torch::Tensor& query, torch::Tensor& key_cache,
-                        torch::Tensor& value_cache, int num_kv_heads,
-                        float scale, torch::Tensor& block_tables,
-                        torch::Tensor& seq_lens, int block_size,
-                        int max_seq_len,
-                        const c10::optional<torch::Tensor>& alibi_slopes,
-                        const std::string& kv_cache_dtype, float kv_scale) {
+void paged_attention_v2(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
   TORCH_CHECK(kv_scale == 1.0f);
+  TORCH_CHECK(blocksparse_vert_stride <= 1,
+              "CPU backend does not support blocksparse attention yet.");
   APHRODITE_DISPATCH_FLOATING_TYPES(
       query.scalar_type(), "paged_attention_v2_impl", [&] {
         CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)

+ 17 - 16
kernels/ops.h

@@ -2,23 +2,24 @@
 
 #include <torch/extension.h>
 
-void paged_attention_v1(torch::Tensor& out, torch::Tensor& query,
-                        torch::Tensor& key_cache, torch::Tensor& value_cache,
-                        int num_kv_heads, float scale,
-                        torch::Tensor& block_tables, torch::Tensor& seq_lens,
-                        int block_size, int max_seq_len,
-                        const c10::optional<torch::Tensor>& alibi_slopes,
-                        const std::string& kv_cache_dtype, float kv_scale);
+void paged_attention_v1(
+    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step);
 
-void paged_attention_v2(torch::Tensor& out, torch::Tensor& exp_sums,
-                        torch::Tensor& max_logits, torch::Tensor& tmp_out,
-                        torch::Tensor& query, torch::Tensor& key_cache,
-                        torch::Tensor& value_cache, int num_kv_heads,
-                        float scale, torch::Tensor& block_tables,
-                        torch::Tensor& seq_lens, int block_size,
-                        int max_seq_len,
-                        const c10::optional<torch::Tensor>& alibi_slopes,
-                        const std::string& kv_cache_dtype, float kv_scale);
+void paged_attention_v2(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step);
 
 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
               float epsilon);