
build: build flash attention kernels inside aphrodite (#1085)

* build: build flash attention kernels inside aphrodite

* update

* update

* make this crap finally work

* fix codespell
AlpinDale 1 month ago
parent
commit
7fffa507ff
85 changed files with 6024 additions and 15 deletions
  1. CMakeLists.txt (+68 -3)
  2. aphrodite/_custom_ops.py (+122 -0)
  3. aphrodite/attention/backends/flash_attn.py (+6 -7)
  4. aphrodite/attention/ops/aphrodite_flash_attn.py (+287 -0)
  5. aphrodite/attention/selector.py (+3 -4)
  6. kernels/flash_attn/block_info.h (+65 -0)
  7. kernels/flash_attn/dropout.h (+121 -0)
  8. kernels/flash_attn/flash.h (+156 -0)
  9. kernels/flash_attn/flash_api.h (+1167 -0)
  10. kernels/flash_attn/flash_fwd_hdim128_bf16_causal_sm80.cu (+11 -0)
  11. kernels/flash_attn/flash_fwd_hdim128_bf16_sm80.cu (+11 -0)
  12. kernels/flash_attn/flash_fwd_hdim128_fp16_causal_sm80.cu (+11 -0)
  13. kernels/flash_attn/flash_fwd_hdim128_fp16_sm80.cu (+11 -0)
  14. kernels/flash_attn/flash_fwd_hdim160_bf16_causal_sm80.cu (+11 -0)
  15. kernels/flash_attn/flash_fwd_hdim160_bf16_sm80.cu (+11 -0)
  16. kernels/flash_attn/flash_fwd_hdim160_fp16_causal_sm80.cu (+11 -0)
  17. kernels/flash_attn/flash_fwd_hdim160_fp16_sm80.cu (+11 -0)
  18. kernels/flash_attn/flash_fwd_hdim192_bf16_causal_sm80.cu (+11 -0)
  19. kernels/flash_attn/flash_fwd_hdim192_bf16_sm80.cu (+11 -0)
  20. kernels/flash_attn/flash_fwd_hdim192_fp16_causal_sm80.cu (+11 -0)
  21. kernels/flash_attn/flash_fwd_hdim192_fp16_sm80.cu (+11 -0)
  22. kernels/flash_attn/flash_fwd_hdim224_bf16_causal_sm80.cu (+11 -0)
  23. kernels/flash_attn/flash_fwd_hdim224_bf16_sm80.cu (+11 -0)
  24. kernels/flash_attn/flash_fwd_hdim224_fp16_causal_sm80.cu (+11 -0)
  25. kernels/flash_attn/flash_fwd_hdim224_fp16_sm80.cu (+11 -0)
  26. kernels/flash_attn/flash_fwd_hdim256_bf16_causal_sm80.cu (+11 -0)
  27. kernels/flash_attn/flash_fwd_hdim256_bf16_sm80.cu (+11 -0)
  28. kernels/flash_attn/flash_fwd_hdim256_fp16_causal_sm80.cu (+11 -0)
  29. kernels/flash_attn/flash_fwd_hdim256_fp16_sm80.cu (+11 -0)
  30. kernels/flash_attn/flash_fwd_hdim32_bf16_causal_sm80.cu (+11 -0)
  31. kernels/flash_attn/flash_fwd_hdim32_bf16_sm80.cu (+11 -0)
  32. kernels/flash_attn/flash_fwd_hdim32_fp16_causal_sm80.cu (+11 -0)
  33. kernels/flash_attn/flash_fwd_hdim32_fp16_sm80.cu (+11 -0)
  34. kernels/flash_attn/flash_fwd_hdim64_bf16_causal_sm80.cu (+11 -0)
  35. kernels/flash_attn/flash_fwd_hdim64_bf16_sm80.cu (+11 -0)
  36. kernels/flash_attn/flash_fwd_hdim64_fp16_causal_sm80.cu (+11 -0)
  37. kernels/flash_attn/flash_fwd_hdim64_fp16_sm80.cu (+11 -0)
  38. kernels/flash_attn/flash_fwd_hdim96_bf16_causal_sm80.cu (+11 -0)
  39. kernels/flash_attn/flash_fwd_hdim96_bf16_sm80.cu (+11 -0)
  40. kernels/flash_attn/flash_fwd_hdim96_fp16_causal_sm80.cu (+11 -0)
  41. kernels/flash_attn/flash_fwd_hdim96_fp16_sm80.cu (+11 -0)
  42. kernels/flash_attn/flash_fwd_kernel.h (+1715 -0)
  43. kernels/flash_attn/flash_fwd_launch_template.h (+356 -0)
  44. kernels/flash_attn/flash_fwd_split_hdim128_bf16_causal_sm80.cu (+7 -0)
  45. kernels/flash_attn/flash_fwd_split_hdim128_bf16_sm80.cu (+7 -0)
  46. kernels/flash_attn/flash_fwd_split_hdim128_fp16_causal_sm80.cu (+7 -0)
  47. kernels/flash_attn/flash_fwd_split_hdim128_fp16_sm80.cu (+7 -0)
  48. kernels/flash_attn/flash_fwd_split_hdim160_bf16_causal_sm80.cu (+7 -0)
  49. kernels/flash_attn/flash_fwd_split_hdim160_bf16_sm80.cu (+7 -0)
  50. kernels/flash_attn/flash_fwd_split_hdim160_fp16_causal_sm80.cu (+7 -0)
  51. kernels/flash_attn/flash_fwd_split_hdim160_fp16_sm80.cu (+7 -0)
  52. kernels/flash_attn/flash_fwd_split_hdim192_bf16_causal_sm80.cu (+7 -0)
  53. kernels/flash_attn/flash_fwd_split_hdim192_bf16_sm80.cu (+7 -0)
  54. kernels/flash_attn/flash_fwd_split_hdim192_fp16_causal_sm80.cu (+7 -0)
  55. kernels/flash_attn/flash_fwd_split_hdim192_fp16_sm80.cu (+7 -0)
  56. kernels/flash_attn/flash_fwd_split_hdim224_bf16_causal_sm80.cu (+7 -0)
  57. kernels/flash_attn/flash_fwd_split_hdim224_bf16_sm80.cu (+7 -0)
  58. kernels/flash_attn/flash_fwd_split_hdim224_fp16_causal_sm80.cu (+7 -0)
  59. kernels/flash_attn/flash_fwd_split_hdim224_fp16_sm80.cu (+7 -0)
  60. kernels/flash_attn/flash_fwd_split_hdim256_bf16_causal_sm80.cu (+7 -0)
  61. kernels/flash_attn/flash_fwd_split_hdim256_bf16_sm80.cu (+7 -0)
  62. kernels/flash_attn/flash_fwd_split_hdim256_fp16_causal_sm80.cu (+7 -0)
  63. kernels/flash_attn/flash_fwd_split_hdim256_fp16_sm80.cu (+7 -0)
  64. kernels/flash_attn/flash_fwd_split_hdim32_bf16_causal_sm80.cu (+7 -0)
  65. kernels/flash_attn/flash_fwd_split_hdim32_bf16_sm80.cu (+7 -0)
  66. kernels/flash_attn/flash_fwd_split_hdim32_fp16_causal_sm80.cu (+7 -0)
  67. kernels/flash_attn/flash_fwd_split_hdim32_fp16_sm80.cu (+7 -0)
  68. kernels/flash_attn/flash_fwd_split_hdim64_bf16_causal_sm80.cu (+7 -0)
  69. kernels/flash_attn/flash_fwd_split_hdim64_bf16_sm80.cu (+7 -0)
  70. kernels/flash_attn/flash_fwd_split_hdim64_fp16_causal_sm80.cu (+7 -0)
  71. kernels/flash_attn/flash_fwd_split_hdim64_fp16_sm80.cu (+7 -0)
  72. kernels/flash_attn/flash_fwd_split_hdim96_bf16_causal_sm80.cu (+7 -0)
  73. kernels/flash_attn/flash_fwd_split_hdim96_bf16_sm80.cu (+7 -0)
  74. kernels/flash_attn/flash_fwd_split_hdim96_fp16_causal_sm80.cu (+7 -0)
  75. kernels/flash_attn/flash_fwd_split_hdim96_fp16_sm80.cu (+7 -0)
  76. kernels/flash_attn/kernel_traits.h (+180 -0)
  77. kernels/flash_attn/mask.h (+213 -0)
  78. kernels/flash_attn/philox.cuh (+51 -0)
  79. kernels/flash_attn/registration.h (+22 -0)
  80. kernels/flash_attn/rotary.h (+152 -0)
  81. kernels/flash_attn/softmax.h (+188 -0)
  82. kernels/flash_attn/static_switch.h (+117 -0)
  83. kernels/flash_attn/utils.h (+440 -0)
  84. kernels/torch_bindings.cpp (+19 -0)
  85. requirements-cuda.txt (+0 -1)

+ 68 - 3
CMakeLists.txt

@@ -222,7 +222,72 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
     "kernels/quantization/fp8/fp8_marlin.cu"
     "kernels/all_reduce/custom_all_reduce.cu"
     "kernels/permute_cols.cu"
-    "kernels/sampling/sampling.cu")
+    "kernels/sampling/sampling.cu"
+    "kernels/flash_attn/flash_fwd_hdim32_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim32_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim32_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim32_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim64_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim64_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim64_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim64_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim96_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim96_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim96_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim96_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim128_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim128_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim128_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim128_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim160_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim160_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim160_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim160_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim192_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim192_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim192_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim192_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim224_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim224_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim224_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim224_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim256_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim256_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim256_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_hdim256_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim32_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim32_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim32_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim32_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim64_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim64_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim64_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim64_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim96_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim96_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim96_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim96_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim128_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim128_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim128_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim128_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim160_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim160_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim160_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim160_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim192_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim192_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim192_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim192_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim224_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim224_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim224_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim224_fp16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim256_bf16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim256_bf16_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim256_fp16_causal_sm80.cu"
+    "kernels/flash_attn/flash_fwd_split_hdim256_fp16_sm80.cu"
+    )
 
   if(NOT MSVC)
     # Include CUTLASS only when needed
@@ -361,16 +426,16 @@ if(APHRODITE_GPU_LANG STREQUAL "HIP")
     WITH_SOABI)
 endif()
 
-
 if(APHRODITE_GPU_LANG STREQUAL "CUDA" OR APHRODITE_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
 
   message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)
+
 endif()
 
 if(APHRODITE_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling rocm extension.")
   add_dependencies(default _rocm_C)
-endif()
+endif()
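
The 64 new .cu entries above enumerate every combination of head dimension (32, 64, 96, 128, 160, 192, 224, 256), data type (bf16/fp16), causal vs. non-causal, and regular vs. split-KV forward kernel. As a rough illustration only (this script is not part of the commit), the list can be regenerated from that naming pattern:

# Illustration: rebuild the 64 kernel source names added to CMakeLists.txt
# above from their naming pattern. Not part of the commit.
head_dims = [32, 64, 96, 128, 160, 192, 224, 256]
dtypes = ["bf16", "fp16"]

sources = []
for split in ["", "split_"]:
    for hdim in head_dims:
        for dtype in dtypes:
            for causal in ["_causal", ""]:
                sources.append(
                    f"kernels/flash_attn/flash_fwd_{split}"
                    f"hdim{hdim}_{dtype}{causal}_sm80.cu")

assert len(sources) == 64  # 8 head dims x 2 dtypes x 2 (causal) x 2 (split)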

+ 122 - 0
aphrodite/_custom_ops.py

@@ -1057,6 +1057,128 @@ def top_k_top_p_sampling_from_probs(
         raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
 
 
+# Flash Attention kernels
+def fwd(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        alibi_slopes: torch.Tensor,
+        dropout_p: float,
+        softmax_scale: float,
+        causal: bool,
+        window_size_left: int,
+        window_size_right: int,
+        softcap: float,
+        return_softmax: bool,
+        out: torch.Tensor,
+        gen: Optional[torch.Generator] = None,
+):
+    return torch.ops._C.fwd(
+        q,
+        k,
+        v,
+        out,
+        alibi_slopes,
+        dropout_p,
+        softmax_scale,
+        causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+
+def varlen_fwd(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        out: Optional[torch.Tensor],
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_k: torch.Tensor,
+        seqused_k: Optional[torch.Tensor],
+        block_table: Optional[torch.Tensor],
+        alibi_slopes: Optional[torch.Tensor],
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        dropout_p: float,
+        softmax_scale: float,
+        zero_tensors: bool,
+        causal: bool,
+        window_size_left: int,
+        window_size_right: int,
+        softcap: float,
+        return_softmax: bool,
+        gen: Optional[torch.Generator] = None,
+):
+    return torch.ops._C.varlen_fwd(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_k,
+        block_table,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale,
+        zero_tensors,
+        causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        return_softmax,
+        gen,
+    )
+
+
+def fwd_kvcache(
+        q: torch.Tensor,
+        kcache: torch.Tensor,
+        vcache: torch.Tensor,
+        k: Optional[torch.Tensor],
+        v: Optional[torch.Tensor],
+        seqlens_k: Optional[torch.Tensor],
+        rotary_cos: Optional[torch.Tensor],
+        rotary_sin: Optional[torch.Tensor],
+        cache_batch_idx: Optional[torch.Tensor],
+        block_table: Optional[torch.Tensor],
+        alibi_slopes: Optional[torch.Tensor],
+        out: Optional[torch.Tensor],
+        softmax_scale: float,
+        causal: bool,
+        window_size_left: int,
+        window_size_right: int,
+        softcap: float,
+        rotary_interleaved: bool,
+        num_splits: int,
+):
+    return torch.ops._C.fwd_kvcache(
+        q,
+        kcache,
+        vcache,
+        k,
+        v,
+        seqlens_k,
+        rotary_cos,
+        rotary_sin,
+        cache_batch_idx,
+        block_table,
+        alibi_slopes,
+        out,
+        softmax_scale,
+        causal,
+        window_size_left,
+        window_size_right,
+        softcap,
+        rotary_interleaved,
+        num_splits,
+    )
+
+
 # TODO: remove this later
 names_and_values = globals()
 names_and_values_to_update = {}
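
The three wrappers above are thin pass-throughs to the flash attention ops registered under torch.ops._C (bound in kernels/torch_bindings.cpp from this commit). Below is a minimal sketch of calling the fwd wrapper directly, assuming a CUDA build of aphrodite with these kernels; shapes follow the (batch_size, seqlen, num_heads, head_size) layout documented for mha_fwd in flash_api.h, and the 8-tuple unpacking mirrors _flash_attn_forward further down.

import torch

import aphrodite._custom_ops as ops

# fp16 self-attention inputs: (batch, seqlen, num_heads, head_size)
q = torch.randn(1, 128, 8, 64, dtype=torch.float16, device="cuda")
k = torch.randn(1, 128, 8, 64, dtype=torch.float16, device="cuda")
v = torch.randn(1, 128, 8, 64, dtype=torch.float16, device="cuda")

# out=None lets the kernel allocate the output; the op returns
# (out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state).
out, *_ = ops.fwd(
    q=q, k=k, v=v, out=None, alibi_slopes=None,
    dropout_p=0.0, softmax_scale=64 ** -0.5, causal=True,
    window_size_left=-1, window_size_right=-1,
    softcap=0.0, return_softmax=False,
)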

+ 6 - 7
aphrodite/attention/backends/flash_attn.py

@@ -15,16 +15,15 @@ from aphrodite.attention.backends.utils import (PAD_SLOT_ID,
                                                 compute_slot_mapping,
                                                 compute_slot_mapping_start_idx,
                                                 is_block_tables_empty)
+from aphrodite.attention.ops.aphrodite_flash_attn import (
+    flash_attn_varlen_func as _flash_attn_varlen_func)
+from aphrodite.attention.ops.aphrodite_flash_attn import (
+    flash_attn_with_kvcache as _flash_attn_with_kvcache)
 from aphrodite.common.utils import async_tensor_h2d, make_tensor_with_pad
 
 if TYPE_CHECKING:
-    from aphrodite.worker.model_runner import (ModelInputForGPUBuilder,
-                                               ModelInputForGPUWithSamplingMetadata)
-
-from aphrodite_flash_attn import (
-    flash_attn_varlen_func as _flash_attn_varlen_func)
-from aphrodite_flash_attn import (
-    flash_attn_with_kvcache as _flash_attn_with_kvcache)
+    from aphrodite.worker.model_runner import (
+        ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata)
 
 
 @torch.library.custom_op("aphrodite::flash_attn_varlen_func", mutates_args=[])

+ 287 - 0
aphrodite/attention/ops/aphrodite_flash_attn.py

@@ -0,0 +1,287 @@
+from typing import Optional, Union
+
+import torch
+
+import aphrodite._custom_ops as ops
+
+
+def maybe_contiguous(x):
+    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
+
+
+def _flash_attn_forward(
+    q, k, v, dropout_p, softmax_scale, causal,
+    window_size, softcap, alibi_slopes,
+    return_softmax, *, out=None
+):
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    (out, q, k, v, out_padded, softmax_lse,
+     S_dmask, rng_state) = ops.fwd(
+        q=q,
+        k=k,
+        v=v,
+        out=out,
+        alibi_slopes=alibi_slopes,
+        dropout_p=dropout_p,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size_left=window_size[0],
+        window_size_right=window_size[1],
+        softcap=softcap,
+        return_softmax=return_softmax,
+        gen=None,
+    )  # type: ignore
+    return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state
+
+
+def _flash_attn_varlen_forward(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p,
+    softmax_scale,
+    causal,
+    window_size,
+    softcap,
+    alibi_slopes,
+    return_softmax,
+    block_table,
+    *,
+    out=None
+):
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    (out, q, k, v, out_padded, softmax_lse,
+     S_dmask, rng_state) = ops.varlen_fwd(
+        q=q,
+        k=k,
+        v=v,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        max_seqlen_q=max_seqlen_q,
+        max_seqlen_k=max_seqlen_k,
+        dropout_p=dropout_p,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size_left=window_size[0],
+        window_size_right=window_size[1],
+        softcap=softcap,
+        alibi_slopes=alibi_slopes,
+        block_table=block_table,
+        return_softmax=return_softmax,
+        gen=None,
+        out=out,
+        seqused_k=None,
+        zero_tensors=False,
+    )  # type: ignore
+    return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state
+
+
+class FlashAttnFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        dropout_p,
+        softmax_scale,
+        causal,
+        window_size,
+        softcap,
+        alibi_slopes,
+        deterministic,
+        return_softmax,
+        out=None,
+    ):
+        if softmax_scale is None:
+            softmax_scale = q.shape[-1] ** (-0.5)
+        (out, q, k, v, out_padded, softmax_lse,
+         S_dmask, rng_state) = _flash_attn_forward(
+            q,
+            k,
+            v,
+            dropout_p,
+            softmax_scale,
+            causal=causal,
+            window_size=window_size,
+            softcap=softcap,
+            alibi_slopes=alibi_slopes,
+            return_softmax=return_softmax and dropout_p > 0,
+            out=out,
+        )
+        ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
+        ctx.dropout_p = dropout_p
+        ctx.softmax_scale = softmax_scale
+        ctx.causal = causal
+        ctx.window_size = window_size
+        ctx.softcap = softcap
+        ctx.alibi_slopes = alibi_slopes
+        ctx.deterministic = deterministic
+        return out if not return_softmax else (out, softmax_lse, S_dmask)
+
+
+class FlashAttnVarlenFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale,
+        causal,
+        window_size,
+        softcap,
+        alibi_slopes,
+        deterministic,
+        return_softmax,
+        block_table,
+        out=None,
+    ):
+        if softmax_scale is None:
+            softmax_scale = q.shape[-1] ** (-0.5)
+        (out, q, k, v, out_padded, softmax_lse,
+         S_dmask, rng_state) = _flash_attn_varlen_forward(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            dropout_p,
+            softmax_scale,
+            causal=causal,
+            window_size=window_size,
+            softcap=softcap,
+            alibi_slopes=alibi_slopes,
+            return_softmax=return_softmax and dropout_p > 0,
+            block_table=block_table,
+            out=out,
+        )
+        ctx.save_for_backward(
+            q, k, v, out_padded, softmax_lse, cu_seqlens_q,
+            cu_seqlens_k, rng_state
+        )
+        ctx.dropout_p = dropout_p
+        ctx.max_seqlen_q = max_seqlen_q
+        ctx.max_seqlen_k = max_seqlen_k
+        ctx.softmax_scale = softmax_scale
+        ctx.causal = causal
+        ctx.window_size = window_size
+        ctx.softcap = softcap
+        ctx.alibi_slopes = alibi_slopes
+        ctx.deterministic = deterministic
+        return out if not return_softmax else (out, softmax_lse, S_dmask)
+    
+
+def flash_attn_varlen_func(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite context window
+    softcap=0.0, # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+    block_table=None,
+    *,
+    out=None,
+):
+    return FlashAttnVarlenFunc.apply(
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale,
+        causal,
+        window_size,
+        softcap,
+        alibi_slopes,
+        deterministic,
+        return_attn_probs,
+        block_table,
+        out,
+    )
+
+
+def flash_attn_with_kvcache(
+    q,
+    k_cache,
+    v_cache,
+    k=None,
+    v=None,
+    rotary_cos=None,
+    rotary_sin=None,
+    cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    block_table: Optional[torch.Tensor] = None,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite context window
+    softcap=0.0, # 0.0 means deactivated
+    rotary_interleaved=True,
+    alibi_slopes=None,
+    num_splits=0,
+    return_softmax_lse=False,
+    *,
+    out=None,
+):
+    assert k_cache.stride(-1) == 1, (
+        "k_cache must have contiguous last dimension"
+    )
+    assert v_cache.stride(-1) == 1, (
+        "v_cache must have contiguous last dimension"
+    )
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+    if cache_seqlens is not None and isinstance(cache_seqlens, int):
+        cache_seqlens = torch.full(
+            (k_cache.shape[0],), cache_seqlens,
+            dtype=torch.int32, device=k_cache.device
+        )
+        cache_seqlens = maybe_contiguous(cache_seqlens)
+    cache_batch_idx = maybe_contiguous(cache_batch_idx)
+    block_table = maybe_contiguous(block_table)
+    out, softmax_lse = ops.fwd_kvcache(
+        q=q,
+        kcache=k_cache,
+        vcache=v_cache,
+        k=k,
+        v=v,
+        seqlens_k=cache_seqlens,
+        rotary_cos=rotary_cos,
+        rotary_sin=rotary_sin,
+        cache_batch_idx=cache_batch_idx,
+        block_table=block_table,
+        alibi_slopes=alibi_slopes,
+        out=out,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size_left=window_size[0],
+        window_size_right=window_size[1],
+        softcap=softcap,
+        rotary_interleaved=rotary_interleaved,
+        num_splits=num_splits,
+    )  # type: ignore
+    return (out, softmax_lse) if return_softmax_lse else out
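
flash_attn_varlen_func above takes packed (total_tokens, num_heads, head_size) tensors indexed by cumulative sequence lengths of size batch + 1, matching the layout documented for mha_varlen_fwd in flash_api.h. A minimal usage sketch, assuming a CUDA build with these kernels and two sequences of lengths 5 and 7 packed together:

import torch

from aphrodite.attention.ops.aphrodite_flash_attn import flash_attn_varlen_func

num_heads, head_size = 8, 64
seq_lens = [5, 7]
total = sum(seq_lens)

q = torch.randn(total, num_heads, head_size, dtype=torch.float16, device="cuda")
k = torch.randn(total, num_heads, head_size, dtype=torch.float16, device="cuda")
v = torch.randn(total, num_heads, head_size, dtype=torch.float16, device="cuda")
# Prefix sums of the sequence lengths: [0, 5, 12]
cu_seqlens = torch.tensor([0, 5, 12], dtype=torch.int32, device="cuda")

# dropout_p defaults to 0.0 and softmax_scale to head_size ** -0.5.
out = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max(seq_lens), max_seqlen_k=max(seq_lens),
    causal=True,
)
# out keeps the packed shape of q: (total_tokens, num_heads, head_size)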

+ 3 - 4
aphrodite/attention/selector.py

@@ -253,8 +253,7 @@ def which_attn_to_use(
     # FlashAttn is valid for the model, checking if the package is installed.
     if selected_backend == _Backend.FLASH_ATTN:
         try:
-            import aphrodite_flash_attn  # noqa: F401
-
+            import aphrodite.attention.ops.aphrodite_flash_attn  # noqa: F401
             from aphrodite.attention.backends.flash_attn import (  # noqa: F401
                 FlashAttentionBackend)
 
@@ -267,8 +266,8 @@ def which_attn_to_use(
         except ImportError:
             logger.info(
                 "Cannot use FlashAttention-2 backend because the "
-                "aphrodite_flash_attn package is not found. "
-                "`pip install aphrodite-flash-attn` for better performance.")
+                "aphrodite._aphrodite_flash_attn_C object is not found. "
+                "This is built by default on supported hardware.")
             selected_backend = _Backend.XFORMERS
 
     return selected_backend

+ 65 - 0
kernels/flash_attn/block_info.h

@@ -0,0 +1,65 @@
+/******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+namespace flash {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool Varlen = true>
+struct BlockInfo {
+  template <typename Params>
+  __device__ BlockInfo(const Params& params, const int bidb)
+      : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr
+                    ? -1
+                    : params.cu_seqlens_q[bidb]),
+        sum_s_k(!Varlen || params.cu_seqlens_k == nullptr ||
+                        !params.is_seqlens_k_cumulative
+                    ? -1
+                    : params.cu_seqlens_k[bidb]),
+        actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr
+                            ? params.seqlen_q
+                            : params.cu_seqlens_q[bidb + 1] - sum_s_q)
+        // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] -
+        // cu_seqlens_k[bidb]. Otherwise it's cu_seqlens_k[bidb], i.e., we use
+        // cu_seqlens_k to store the sequence lengths of K.
+        ,
+        seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr
+                           ? params.seqlen_k
+                           : (params.is_seqlens_k_cumulative
+                                  ? params.cu_seqlens_k[bidb + 1] - sum_s_k
+                                  : params.cu_seqlens_k[bidb])),
+        actual_seqlen_k(params.seqused_k
+                            ? params.seqused_k[bidb]
+                            : seqlen_k_cache + (params.knew_ptr == nullptr
+                                                    ? 0
+                                                    : params.seqlen_knew)) {}
+
+  template <typename index_t>
+  __forceinline__ __device__ index_t q_offset(const index_t batch_stride,
+                                              const index_t row_stride,
+                                              const int bidb) const {
+    return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride;
+  }
+
+  template <typename index_t>
+  __forceinline__ __device__ index_t k_offset(const index_t batch_stride,
+                                              const index_t row_stride,
+                                              const int bidb) const {
+    return sum_s_k == -1 ? bidb * batch_stride : uint32_t(sum_s_k) * row_stride;
+  }
+
+  const int sum_s_q;
+  const int sum_s_k;
+  const int actual_seqlen_q;
+  // We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise
+  // actual_seqlen_k is set to 0.
+  const int seqlen_k_cache;
+  const int actual_seqlen_k;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace flash
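
BlockInfo resolves per-sequence offsets and lengths from the cu_seqlens arrays on the device. A small Python sketch of the same indexing convention, for illustration only:

# cu_seqlens_q is a prefix sum of length batch + 1 over the packed Q tensor.
cu_seqlens_q = [0, 5, 12, 20]   # three sequences of lengths 5, 7, 8
b = 1                           # batch index (bidb in the header)

sum_s_q = cu_seqlens_q[b]                                # token offset of sequence b
actual_seqlen_q = cu_seqlens_q[b + 1] - cu_seqlens_q[b]  # = 7

# When is_seqlens_k_cumulative is false, cu_seqlens_k[b] holds the length of
# sequence b directly instead of a prefix sum (see the comment in the header).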

+ 121 - 0
kernels/flash_attn/dropout.h

@@ -0,0 +1,121 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "philox.cuh"
+#include "utils.h"
+
+namespace flash {
+
+struct Dropout {
+  const unsigned long long seed, offset;
+  const uint8_t p_dropout_in_uint8_t;
+
+  __forceinline__ __device__ Dropout(const unsigned long long seed,
+                                     const unsigned long long offset,
+                                     const uint8_t p_dropout_in_uint8_t,
+                                     const int bid, const int hid,
+                                     const int tid, const int nheads)
+      : seed(seed),
+        offset(offset + (bid * nheads + hid) * 32 + tid % 32),
+        p_dropout_in_uint8_t(p_dropout_in_uint8_t) {}
+
+  template <bool encode_dropout_in_sign_bit = false, typename Engine,
+            typename Layout>
+  __forceinline__ __device__ void apply_dropout(Tensor<Engine, Layout>& tensor_,
+                                                int block_row_start,
+                                                int block_col_start,
+                                                int block_row_stride) {
+    // convert shape from (4, MMA_M, MMA_N) to (8, MMA_M, MMA_N / 2)
+    Tensor tensor = make_tensor(
+        tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout()));
+    using T = typename Engine::value_type;
+    auto encode_dropout = [](bool keep, T val) {
+      return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0));
+    };
+    static_assert(decltype(size<2>(tensor))::value % 2 == 0);
+    const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t);
+    const uint32_t p_dropout_8bit_in_uint32_t =
+        (uint32_t(p_dropout_8bit_in_uint16_t) << 16) |
+        uint32_t(p_dropout_8bit_in_uint16_t);
+// if (cute::thread0()) { printf("threshold2 = 0x%x\n",
+// p_dropout_8bit_in_uint32_t); }
+#pragma unroll
+    for (int m = 0; m < size<1>(tensor);
+         ++m, block_row_start += block_row_stride) {
+      uint2 rowcol = make_uint2(block_row_start, block_col_start);
+#pragma unroll
+      for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) {
+        // if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col =
+        // %d\n", m, n, int(rowcol.x), int(rowcol.y));}
+        uint4 random_uint4 = flash::philox(
+            seed, reinterpret_cast<unsigned long long&>(rowcol), offset);
+        // if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n",
+        // random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);}
+        uint8_t(&rnd_8)[16] = reinterpret_cast<uint8_t(&)[16]>(random_uint4);
+        // Special implementation for 16-bit types: we duplicate the threshold
+        // to the low and high 16 bits of a 32-bit value, then use the f16x2
+        // comparison instruction to get a mask. The low 16 bits of the mask
+        // will be either 0xffff or 0x0000, and the high 16 bits will be either
+        // 0xffff or 0x0000, depending on whether the random value is less than
+        // the threshold. We then do a bit-wise AND between the mask and the
+        // original value (in 32-bit). We're exploiting the fact that floating
+        // point comparison is equivalent to integer comparison, since we're
+        // comparing unsigned integers whose top 8-bits are zero.
+        if (!encode_dropout_in_sign_bit &&
+            (std::is_same<T, cutlass::half_t>::value ||
+             std::is_same<T, cutlass::bfloat16_t>::value)) {
+          uint16_t rnd_16[16];
+#pragma unroll
+          for (int i = 0; i < 16; i++) {
+            rnd_16[i] = uint16_t(rnd_8[i]);
+          }
+          uint32_t(&rnd_32)[8] = reinterpret_cast<uint32_t(&)[8]>(rnd_16);
+#pragma unroll
+          for (int j = 0; j < 2; j++) {
+            Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+// if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j *
+// 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); } if
+// (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n",
+// tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
+#pragma unroll
+            for (int i = 0; i < 4; i++) {
+              uint32_t mask;
+              asm volatile("set.le.u32.f16x2 %0, %1, %2;\n"
+                           : "=r"(mask)
+                           : "r"(rnd_32[j * 4 + i]),
+                             "r"(p_dropout_8bit_in_uint32_t));
+              tensor_uint32(i) &= mask;
+            }
+            // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x,
+            // 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2),
+            // tensor_uint32(3)); }
+          }
+        } else {
+#pragma unroll
+          for (int j = 0; j < 2; j++) {
+#pragma unroll
+            for (int i = 0; i < 8; i++) {
+              tensor(i, m, n * 2 + j) =
+                  encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t,
+                                 tensor(i, m, n * 2 + j));
+            }
+            Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+            // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x,
+            // 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2),
+            // tensor_uint32(3)); }
+          }
+        }
+        // // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))
+        // {
+        // //     printf("n = %d, ph  Philox: %u, %u, %u, %u\n", n, rnd_8.x,
+        // rnd_8.y, rnd_8.z, rnd_8.w);
+        // // }
+      }
+    }
+  }
+};
+
+}  // namespace flash
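
The dropout mask above is decided by comparing 8-bit Philox outputs against a pre-scaled keep-probability threshold (p_dropout_in_uint8_t, set in set_params_fprop in flash_api.h below). A sketch of that encoding, for illustration only:

import math

p_dropout = 0.1
p_keep = 1.0 - p_dropout
# Rounded down because the device compares with <= rather than <.
threshold_uint8 = int(math.floor(p_keep * 255.0))   # p_dropout_in_uint8_t

def keep(rnd_byte: int) -> bool:
    # rnd_byte is one of the 16 uint8 values unpacked from a Philox uint4.
    return rnd_byte <= threshold_uint8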

+ 156 - 0
kernels/flash_attn/flash.h

@@ -0,0 +1,156 @@
+/******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cuda.h>
+#include <vector>
+
+#ifdef OLD_GENERATOR_PATH
+  #include <ATen/CUDAGeneratorImpl.h>
+#else
+  #include <ATen/cuda/CUDAGeneratorImpl.h>
+#endif
+
+#include <ATen/cuda/CUDAGraphsUtils.cuh>  // For at::cuda::philox::unpack
+
+constexpr int TOTAL_DIM = 0;
+constexpr int H_DIM = 1;
+constexpr int D_DIM = 2;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Qkv_params {
+  using index_t = int64_t;
+  // The QKV matrices.
+  void* __restrict__ q_ptr;
+  void* __restrict__ k_ptr;
+  void* __restrict__ v_ptr;
+
+  // The stride between rows of the Q, K and V matrices.
+  index_t q_batch_stride;
+  index_t k_batch_stride;
+  index_t v_batch_stride;
+  index_t q_row_stride;
+  index_t k_row_stride;
+  index_t v_row_stride;
+  index_t q_head_stride;
+  index_t k_head_stride;
+  index_t v_head_stride;
+
+  // The number of heads.
+  int h, h_k;
+  // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k
+  // could be different from nheads (query).
+  int h_h_k_ratio;  // precompute h / h_k,
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Flash_fwd_params : public Qkv_params {
+  // The O matrix (output).
+  void* __restrict__ o_ptr;
+  void* __restrict__ oaccum_ptr;
+
+  // The stride between rows of O.
+  index_t o_batch_stride;
+  index_t o_row_stride;
+  index_t o_head_stride;
+
+  // The pointer to the P matrix.
+  void* __restrict__ p_ptr;
+
+  // The pointer to the softmax sum.
+  void* __restrict__ softmax_lse_ptr;
+  void* __restrict__ softmax_lseaccum_ptr;
+
+  // The dimensions.
+  int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded,
+      d_rounded, rotary_dim, total_q;
+
+  // The scaling factors for the kernel.
+  float scale_softmax;
+  float scale_softmax_log2;
+
+  // array of length b+1 holding starting offset of each sequence.
+  int* __restrict__ cu_seqlens_q;
+  int* __restrict__ cu_seqlens_k;
+
+  // If provided, the actual length of each k sequence.
+  int* __restrict__ seqused_k;
+
+  int* __restrict__ blockmask;
+
+  // The K_new and V_new matrices.
+  void* __restrict__ knew_ptr;
+  void* __restrict__ vnew_ptr;
+
+  // The stride between rows of the Q, K and V matrices.
+  index_t knew_batch_stride;
+  index_t vnew_batch_stride;
+  index_t knew_row_stride;
+  index_t vnew_row_stride;
+  index_t knew_head_stride;
+  index_t vnew_head_stride;
+
+  // The cos and sin matrices for rotary embedding.
+  void* __restrict__ rotary_cos_ptr;
+  void* __restrict__ rotary_sin_ptr;
+
+  // The indices to index into the KV cache.
+  int* __restrict__ cache_batch_idx;
+
+  // Paged KV cache
+  int* __restrict__ block_table;
+  index_t block_table_batch_stride;
+  int page_block_size;
+
+  // The dropout probability (probability of keeping an activation).
+  float p_dropout;
+  // uint32_t p_dropout_in_uint;
+  // uint16_t p_dropout_in_uint16_t;
+  uint8_t p_dropout_in_uint8_t;
+
+  // Scale factor of 1 / (1 - p_dropout).
+  float rp_dropout;
+  float scale_softmax_rp_dropout;
+
+  // Local window size
+  int window_size_left, window_size_right;
+  float softcap;
+
+  // Random state.
+  at::PhiloxCudaState philox_args;
+
+  // Pointer to the RNG seed (idx 0) and offset (idx 1).
+  uint64_t* rng_state;
+
+  bool is_bf16;
+  bool is_causal;
+
+  // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] -
+  // cu_seqlens_k[bidb]. Otherwise it's cu_seqlens_k[bidb], i.e., we use
+  // cu_seqlens_k to store the sequence lengths of K.
+  bool is_seqlens_k_cumulative;
+
+  bool is_rotary_interleaved;
+
+  int num_splits;  // For split-KV version
+
+  void* __restrict__ alibi_slopes_ptr;
+  index_t alibi_slopes_batch_stride;
+
+  bool unpadded_lse;  // For varlen paths: LSE is in [nheads, total_seqlen_q]
+                      // format instead of [b, nheads, seqlen_q].
+  bool seqlenq_ngroups_swapped;  // q has been transposed from (b, 1, (nheads_kv
+                                 // ngroups), d) to (b, ngroups, nheads_kv, d).
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, int Headdim, bool Is_causal>
+void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream);
+template <typename T, int Headdim, bool Is_causal>
+void run_mha_fwd_splitkv_dispatch(Flash_fwd_params& params,
+                                  cudaStream_t stream);
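
Several derived fields in Flash_fwd_params (the rounded sizes and the log2-scaled softmax factor) are filled in by mha_fwd and set_params_fprop in flash_api.h below. A small sketch of that arithmetic, for illustration only:

import math

def round_multiple(x: int, m: int) -> int:
    return (x + m - 1) // m * m

head_size_og, seqlen_q, seqlen_k = 80, 300, 1000
head_size = round_multiple(head_size_og, 8)         # 80  (inputs padded to a multiple of 8)
head_size_rounded = round_multiple(head_size, 32)   # 96   -> d_rounded
seqlen_q_rounded = round_multiple(seqlen_q, 128)    # 384  -> seqlen_q_rounded
seqlen_k_rounded = round_multiple(seqlen_k, 128)    # 1024 -> seqlen_k_rounded

softmax_scale = head_size_og ** -0.5
scale_softmax_log2 = softmax_scale * math.log2(math.e)  # M_LOG2E in set_params_fprop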

+ 1167 - 0
kernels/flash_attn/flash_api.h

@@ -0,0 +1,1167 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all
+// of the torch headers.
+#include "registration.h"
+#include <torch/library.h>
+#include <torch/nn/functional.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cutlass/numeric_types.h>
+
+#include "flash.h"
+#include "static_switch.h"
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...)                                   \
+  TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
+              #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+void set_params_fprop(Flash_fwd_params& params,
+                      // sizes
+                      const size_t b, const size_t seqlen_q,
+                      const size_t seqlen_k, const size_t seqlen_q_rounded,
+                      const size_t seqlen_k_rounded, const size_t h,
+                      const size_t h_k, const size_t d, const size_t d_rounded,
+                      // device pointers
+                      const at::Tensor q, const at::Tensor k,
+                      const at::Tensor v, at::Tensor out, void* cu_seqlens_q_d,
+                      void* cu_seqlens_k_d, void* seqused_k, void* p_d,
+                      void* softmax_lse_d, float p_dropout, float softmax_scale,
+                      int window_size_left, int window_size_right,
+                      const float softcap, bool seqlenq_ngroups_swapped = false,
+                      const bool unpadded_lse = false) {
+  // Reset the parameters
+  params = {};
+
+  params.is_bf16 = q.dtype() == torch::kBFloat16;
+
+  // Set the pointers and strides.
+  params.q_ptr = q.data_ptr();
+  params.k_ptr = k.data_ptr();
+  params.v_ptr = v.data_ptr();
+  // All stride are in elements, not bytes.
+  params.q_row_stride = q.stride(-3);
+  params.k_row_stride = k.stride(-3);
+  params.v_row_stride = v.stride(-3);
+  params.q_head_stride = q.stride(-2);
+  params.k_head_stride = k.stride(-2);
+  params.v_head_stride = v.stride(-2);
+  params.o_ptr = out.data_ptr();
+  params.o_row_stride = out.stride(-3);
+  params.o_head_stride = out.stride(-2);
+
+  if (cu_seqlens_q_d == nullptr) {
+    params.q_batch_stride = q.stride(0);
+    params.k_batch_stride = k.stride(0);
+    params.v_batch_stride = v.stride(0);
+    params.o_batch_stride = out.stride(0);
+    if (seqlenq_ngroups_swapped) {
+      params.q_batch_stride *= seqlen_q;
+      params.o_batch_stride *= seqlen_q;
+    }
+  }
+
+  params.cu_seqlens_q = static_cast<int*>(cu_seqlens_q_d);
+  params.cu_seqlens_k = static_cast<int*>(cu_seqlens_k_d);
+  params.seqused_k = static_cast<int*>(seqused_k);
+
+  // P = softmax(QK^T)
+  params.p_ptr = p_d;
+
+  // Softmax sum
+  params.softmax_lse_ptr = softmax_lse_d;
+
+  // Set the dimensions.
+  params.b = b;
+  params.h = h;
+  params.h_k = h_k;
+  params.h_h_k_ratio = h / h_k;
+  params.seqlen_q = seqlen_q;
+  params.seqlen_k = seqlen_k;
+  params.seqlen_q_rounded = seqlen_q_rounded;
+  params.seqlen_k_rounded = seqlen_k_rounded;
+  params.d = d;
+  params.d_rounded = d_rounded;
+
+// Set the different scale values.
+#ifdef FLASHATTENTION_DISABLE_SOFTCAP
+  TORCH_CHECK(softcap <= 0.0,
+              "This flash attention build does not support softcap.");
+#endif
+  if (softcap > 0.0) {
+    params.softcap = softmax_scale / softcap;
+    params.scale_softmax = softcap;
+    params.scale_softmax_log2 = softcap * M_LOG2E;
+  } else {
+    // Remove potential NaN
+    params.softcap = 0.0;
+    params.scale_softmax = softmax_scale;
+    params.scale_softmax_log2 = softmax_scale * M_LOG2E;
+  }
+
+  // Set this to probability of keeping an element to simplify things.
+  params.p_dropout = 1.f - p_dropout;
+  // Convert p from float to int so we don't have to convert the random uint to
+  // float to compare. [Minor] We want to round down since when we do the
+  // comparison we use <= instead of < params.p_dropout_in_uint =
+  // uint32_t(std::floor(params.p_dropout * 4294967295.0));
+  // params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout *
+  // 65535.0));
+  params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
+  params.rp_dropout = 1.f / params.p_dropout;
+  params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
+  TORCH_CHECK(p_dropout < 1.f);
+#ifdef FLASHATTENTION_DISABLE_DROPOUT
+  TORCH_CHECK(p_dropout == 0.0f,
+              "This flash attention build does not support dropout.");
+#endif
+
+  // Causal is the special case where window_size_right == 0 and
+  // window_size_left < 0. Local is the more general case where
+  // window_size_right >= 0 or window_size_left >= 0.
+  params.is_causal = window_size_left < 0 && window_size_right == 0;
+
+  if (window_size_left < 0 && window_size_right >= 0) {
+    window_size_left = seqlen_k;
+  }
+  if (window_size_left >= 0 && window_size_right < 0) {
+    window_size_right = seqlen_k;
+  }
+  params.window_size_left = window_size_left;
+  params.window_size_right = window_size_right;
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      params.is_causal || (window_size_left < 0 && window_size_right < 0),
+      "This flash attention build does not support local attention.");
+#endif
+
+  params.is_seqlens_k_cumulative = true;
+
+#ifdef FLASHATTENTION_DISABLE_UNEVEN_K
+  TORCH_CHECK(d == d_rounded,
+              "This flash attention build does not support headdim not being a "
+              "multiple of 32.");
+#endif
+
+  params.unpadded_lse = unpadded_lse;
+  params.seqlenq_ngroups_swapped = seqlenq_ngroups_swapped;
+}
+
+void run_mha_fwd(Flash_fwd_params& params, cudaStream_t stream,
+                 bool force_split_kernel = false) {
+  FP16_SWITCH(!params.is_bf16, [&] {
+    HEADDIM_SWITCH(params.d, [&] {
+      BOOL_SWITCH(params.is_causal, Is_causal, [&] {
+        if (params.num_splits <= 1 &&
+            !force_split_kernel) {  // If we don't set it num_splits == 0
+          run_mha_fwd_<elem_type, kHeadDim, Is_causal>(params, stream);
+        } else {
+          run_mha_fwd_splitkv_dispatch<elem_type, kHeadDim, Is_causal>(params,
+                                                                       stream);
+        }
+      });
+    });
+  });
+}
+
+// Find the number of splits that maximizes the occupancy. For example, if we
+// have batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency =
+// 0.89) is better than having 3 splits (efficiency = 0.67). However, we also
+// don't want too many splits as that would incur more HBM reads/writes. So we
+// find the best efficiency, then find the smallest number of splits that gets
+// 85% of the best efficiency.
+inline int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs,
+                                int num_n_blocks, int max_splits) {
+  // If we have enough to almost fill the SMs, then just use 1 split
+  if (batch_nheads_mblocks >= 0.8f * num_SMs) {
+    return 1;
+  }
+  max_splits = std::min({max_splits, num_SMs, num_n_blocks});
+  float max_efficiency = 0.f;
+  std::vector<float> efficiency;
+  efficiency.reserve(max_splits);
+  auto ceildiv = [](int a, int b) { return (a + b - 1) / b; };
+  // Some splits are not eligible. For example, if we have 64 blocks and choose
+  // 11 splits, we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have
+  // 6 * 11 + (-2) blocks (i.e. it's 11 splits anyway). So we check if the
+  // number of blocks per split is the same as the previous num_splits.
+  auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) {
+    return num_splits == 1 || ceildiv(num_n_blocks, num_splits) !=
+                                  ceildiv(num_n_blocks, num_splits - 1);
+  };
+  for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
+    if (!is_split_eligible(num_splits)) {
+      efficiency.push_back(0.f);
+    } else {
+      float n_waves = float(batch_nheads_mblocks * num_splits) / num_SMs;
+      float eff = n_waves / ceil(n_waves);
+      // printf("num_splits = %d, eff = %f\n", num_splits, eff);
+      if (eff > max_efficiency) {
+        max_efficiency = eff;
+      }
+      efficiency.push_back(eff);
+    }
+  }
+  for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
+    if (!is_split_eligible(num_splits)) {
+      continue;
+    }
+    if (efficiency[num_splits - 1] >= 0.85 * max_efficiency) {
+      // printf("num_splits chosen = %d\n", num_splits);
+      return num_splits;
+    }
+  }
+  return 1;
+}
+
+std::tuple<at::Tensor, at::Tensor> set_params_splitkv(
+    Flash_fwd_params& params, const int batch_size, const int num_heads,
+    const int head_size, const int max_seqlen_k, const int max_seqlen_q,
+    const int head_size_rounded, const float p_dropout, const int num_splits,
+    cudaDeviceProp* dprops, struct c10::TensorOptions opts) {
+  // This needs to match with run_mha_fwd_splitkv_dispatch
+  const int block_n = head_size <= 64 ? 256 : (head_size <= 128 ? 128 : 64);
+  const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n;
+  // Technically kBlockM = 64 only for the splitKV kernels, not the standard
+  // kernel. In any case we don't expect seqlen_q to be larger than 64 for
+  // inference.
+  const int num_m_blocks = (max_seqlen_q + 64 - 1) / 64;
+  params.num_splits = num_splits;
+  at::Tensor softmax_lse_accum;
+  at::Tensor out_accum;
+
+  if (p_dropout == 0.0f) {  // SplitKV is not implemented for dropout
+    if (num_splits < 1) {
+      // We multiply number of SMs by 2 to hard-code the fact that we're using
+      // 128 threads per block.
+      params.num_splits = num_splits_heuristic(
+          batch_size * num_heads * num_m_blocks,
+          dprops->multiProcessorCount * 2, num_n_blocks, 128);
+    }
+    if (params.num_splits > 1) {
+      softmax_lse_accum =
+          torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q},
+                       opts.dtype(at::kFloat));
+      out_accum = torch::empty({params.num_splits, batch_size, num_heads,
+                                max_seqlen_q, head_size_rounded},
+                               opts.dtype(at::kFloat));
+      params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr();
+      params.oaccum_ptr = out_accum.data_ptr();
+    }
+    TORCH_CHECK(params.num_splits <= 128, "num_splits > 128 not supported");
+  }
+
+  return std::make_tuple(softmax_lse_accum, out_accum);
+}
+
+void set_params_alibi(Flash_fwd_params& params,
+                      const c10::optional<at::Tensor>& alibi_slopes_,
+                      int batch_size, int num_heads) {
+#ifdef FLASHATTENTION_DISABLE_ALIBI
+  TORCH_CHECK(!alibi_slopes_.has_value(),
+              "This flash attention build does not support alibi.");
+  params.alibi_slopes_ptr = nullptr;
+#else
+  if (alibi_slopes_.has_value()) {
+    auto alibi_slopes = alibi_slopes_.value();
+    TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32,
+                "ALiBi slopes must have dtype fp32");
+    CHECK_DEVICE(alibi_slopes);
+    TORCH_CHECK(alibi_slopes.stride(-1) == 1,
+                "ALiBi slopes tensor must have contiguous last dimension");
+    TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) ||
+                alibi_slopes.sizes() ==
+                    torch::IntArrayRef({batch_size, num_heads}));
+    params.alibi_slopes_ptr = alibi_slopes.data_ptr();
+    params.alibi_slopes_batch_stride =
+        alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
+  } else {
+    params.alibi_slopes_ptr = nullptr;
+  }
+#endif
+}
+
+std::vector<at::Tensor> mha_fwd(
+    at::Tensor& q,        // batch_size x seqlen_q x num_heads x head_size
+    const at::Tensor& k,  // batch_size x seqlen_k x num_heads_k x head_size
+    const at::Tensor& v,  // batch_size x seqlen_k x num_heads_k x head_size
+    const c10::optional<at::Tensor>&
+        out_,  // batch_size x seqlen_q x num_heads x head_size
+    const c10::optional<at::Tensor>&
+        alibi_slopes_,  // num_heads or batch_size x num_heads
+    const double p_dropout, const double softmax_scale, bool is_causal,
+    int64_t window_size_left, int64_t window_size_right, const double softcap,
+    const bool return_softmax, c10::optional<at::Generator> gen_) {
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
+  bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
+  bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
+  TORCH_CHECK(is_sm90 || is_sm8x,
+              "FlashAttention only supports Ampere GPUs or newer.");
+  // We will support Turing in the near future
+  // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports
+  // Turing GPUs or newer.");
+
+  auto q_dtype = q.dtype();
+  TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+              "FlashAttention only support fp16 and bf16 data type");
+  if (q_dtype == torch::kBFloat16) {
+    TORCH_CHECK(is_sm90 || is_sm8x,
+                "bfloat16 is only supported on Ampere GPUs or newer");
+  }
+  TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+  TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(k);
+  CHECK_DEVICE(v);
+
+  TORCH_CHECK(q.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(k.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(v.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+
+  const auto sizes = q.sizes();
+
+  const int batch_size = sizes[0];
+  int seqlen_q = sizes[1];
+  int num_heads = sizes[2];
+  const int head_size_og = sizes[3];
+  const int seqlen_k = k.size(1);
+  const int num_heads_k = k.size(2);
+  TORCH_CHECK(batch_size > 0, "batch size must be positive");
+  TORCH_CHECK(
+      head_size_og <= 256,
+      "FlashAttention forward only supports head dimension at most 256");
+  TORCH_CHECK(
+      num_heads % num_heads_k == 0,
+      "Number of heads in key/value must divide number of heads in query");
+
+  if (softcap > 0.f) {
+    TORCH_CHECK(p_dropout == 0.f,
+                "Softcapping does not support dropout for now");
+  }
+
+  if (window_size_left >= seqlen_k) {
+    window_size_left = -1;
+  }
+  if (window_size_right >= seqlen_k) {
+    window_size_right = -1;
+  }
+
+  // causal=true is the same as causal=false in this case
+  if (seqlen_q == 1 && !alibi_slopes_.has_value()) {
+    is_causal = false;
+  }
+  if (is_causal) {
+    window_size_right = 0;
+  }
+
+  // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups,
+  // nheads_kv, d) in this case H/t Daniel Haziza
+  const int seqlenq_ngroups_swapped =
+      seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 &&
+      window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 &&
+      !alibi_slopes_.has_value();
+  const int ngroups = num_heads / num_heads_k;
+  if (seqlenq_ngroups_swapped) {
+    q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og})
+            .transpose(1, 2);
+    seqlen_q = ngroups;
+    num_heads = num_heads_k;
+  }
+
+  CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og);
+  CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og);
+  CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og);
+
+  at::Tensor q_padded, k_padded, v_padded;
+  if (head_size_og % 8 != 0) {
+    q_padded = torch::nn::functional::pad(
+        q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    k_padded = torch::nn::functional::pad(
+        k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    v_padded = torch::nn::functional::pad(
+        v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+  } else {
+    q_padded = q;
+    k_padded = k;
+    v_padded = v;
+  }
+
+  at::Tensor out;
+  if (out_.has_value()) {
+    out = out_.value();
+    TORCH_CHECK(out.dtype() == q_dtype,
+                "Output must have the same dtype as inputs");
+    CHECK_DEVICE(out);
+    TORCH_CHECK(out.stride(-1) == 1,
+                "Output tensor must have contiguous last dimension");
+    CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], head_size_og);
+    if (seqlenq_ngroups_swapped) {
+      out = out.reshape({batch_size, num_heads_k, ngroups, head_size_og})
+                .transpose(1, 2);
+    }
+    if (head_size_og % 8 != 0) {
+      out = torch::empty_like(q_padded);
+    }
+  } else {
+    out = torch::empty_like(q_padded);
+  }
+
+  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+  const int head_size = round_multiple(head_size_og, 8);
+  const int head_size_rounded = round_multiple(head_size, 32);
+  const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+  const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+
+  auto opts = q.options();
+
+  auto softmax_lse =
+      torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
+  at::Tensor p;
+  // Only return softmax if there's dropout to reduce compilation time
+  if (return_softmax) {
+    TORCH_CHECK(p_dropout > 0.0f,
+                "return_softmax is only supported when p_dropout > 0.0");
+    p = torch::empty(
+        {batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
+  }
+
+  Flash_fwd_params params;
+  set_params_fprop(params, batch_size, seqlen_q, seqlen_k, seqlen_q_rounded,
+                   seqlen_k_rounded, num_heads, num_heads_k, head_size,
+                   head_size_rounded, q_padded, k_padded, v_padded, out,
+                   /*cu_seqlens_q_d=*/nullptr,
+                   /*cu_seqlens_k_d=*/nullptr,
+                   /*seqused_k=*/nullptr,
+                   return_softmax ? p.data_ptr() : nullptr,
+                   softmax_lse.data_ptr(), p_dropout, softmax_scale,
+                   window_size_left, window_size_right, softcap);
+
+  // Keep references to these tensors to extend their lifetime
+  at::Tensor softmax_lse_accum, out_accum;
+  std::tie(softmax_lse_accum, out_accum) = set_params_splitkv(
+      params, batch_size, num_heads, head_size, seqlen_k, seqlen_q,
+      head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts);
+
+  // number of times random will be generated per thread, to offset philox
+  // counter in the THC random state. We use a custom RNG that increases the
+  // offset by batch_size * nheads * 32.
+  int64_t counter_offset = params.b * params.h * 32;
+  auto options =
+      torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+  auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
+  // Forward kernel will populate memory with the seed and offset.
+  params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
+
+  if (p_dropout > 0.0) {
+    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+        gen_, at::cuda::detail::getDefaultCUDAGenerator());
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    params.philox_args = gen->philox_cuda_state(counter_offset);
+  }
+
+  set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+  if (seqlen_k > 0) {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    run_mha_fwd(params, stream);
+  } else {
+    // If seqlen_k == 0, then we have an empty tensor. We need to set the output
+    // to 0.
+    out.zero_();
+    softmax_lse.fill_(std::numeric_limits<float>::infinity());
+  }
+
+  at::Tensor out_padded = out;
+  if (head_size_og % 8 != 0) {
+    out = out.index(
+        {"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
+    if (out_.has_value()) {
+      out_.value().copy_(out);
+    }
+  }
+
+  if (seqlenq_ngroups_swapped) {
+    out = out.transpose(1, 2).reshape(
+        {batch_size, 1, num_heads_k * seqlen_q, head_size_og});
+    out_padded = out_padded.transpose(1, 2).reshape(
+        {batch_size, 1, num_heads_k * seqlen_q, head_size_og});
+    q_padded = q_padded.transpose(1, 2).reshape(
+        {batch_size, 1, num_heads_k * seqlen_q, head_size_og});
+    softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
+  }
+  return {out,        q_padded,    k_padded, v_padded,
+          out_padded, softmax_lse, p,        rng_state};
+}
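The seqlen_q <-> num_heads swap used above (and undone in the epilogue just before the return) is pure shape bookkeeping. A minimal standalone libtorch sketch of the same round trip, with made-up sizes, assuming nothing beyond reshape/transpose:

#include <torch/torch.h>
#include <iostream>

int main() {
  // Decode-shaped query: batch 2, one query token, 32 query heads over 4 KV heads, head_size 128.
  const int64_t b = 2, num_heads = 32, num_heads_k = 4, d = 128;
  const int64_t ngroups = num_heads / num_heads_k;  // 8
  auto q = torch::randn({b, 1, num_heads, d});
  // Swap: fold the query-head groups into the seqlen_q axis so the kernel sees (b, ngroups, h_k, d).
  auto q_swapped = q.reshape({b, num_heads_k, ngroups, d}).transpose(1, 2);
  std::cout << q_swapped.sizes() << "\n";  // [2, 8, 4, 128]
  // Un-swap, the same way the epilogue above restores `out`.
  auto restored = q_swapped.transpose(1, 2).reshape({b, 1, num_heads_k * ngroups, d});
  std::cout << restored.sizes() << "\n";   // [2, 1, 32, 128]
  return 0;
}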
+
+std::vector<at::Tensor> mha_varlen_fwd(
+    at::Tensor&
+        q,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+    const at::Tensor& k,  // total_k x num_heads_k x head_size, total_k :=
+                          // \sum_{i=0}^{b} s_i or num_blocks x page_block_size
+                          // x num_heads_k x head_size if there's a block_table.
+    const at::Tensor& v,  // total_k x num_heads_k x head_size, total_k :=
+                          // \sum_{i=0}^{b} s_i or num_blocks x page_block_size
+                          // x num_heads_k x head_size if there's a block_table.
+    const c10::optional<at::Tensor>&
+        out_,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+    const at::Tensor& cu_seqlens_q,  // b+1
+    const at::Tensor& cu_seqlens_k,  // b+1
+    const c10::optional<at::Tensor>&
+        seqused_k,  // b. If given, only this many elements of each batch
+                    // element's keys are used.
+    const c10::optional<at::Tensor>&
+        block_table_,  // batch_size x max_num_blocks_per_seq
+    const c10::optional<at::Tensor>&
+        alibi_slopes_,  // num_heads or b x num_heads
+    int64_t max_seqlen_q, const int64_t max_seqlen_k, const double p_dropout,
+    const double softmax_scale, const bool zero_tensors, bool is_causal,
+    int64_t window_size_left, int64_t window_size_right, const double softcap,
+    const bool return_softmax, c10::optional<at::Generator> gen_) {
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
+  bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
+  bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
+  TORCH_CHECK(is_sm90 || is_sm8x,
+              "FlashAttention only supports Ampere GPUs or newer.");
+  // We will support Turing in the near future
+  // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports
+  // Turing GPUs or newer.");
+
+  auto q_dtype = q.dtype();
+  TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+              "FlashAttention only support fp16 and bf16 data type");
+  if (q_dtype == torch::kBFloat16) {
+    TORCH_CHECK(is_sm90 || is_sm8x,
+                "bfloat16 is only supported on Ampere GPUs or newer");
+  }
+  TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+  TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+  TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32,
+              "cu_seqlens_q must have dtype int32");
+  TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32,
+              "cu_seqlens_k must have dtype int32");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(k);
+  CHECK_DEVICE(v);
+  CHECK_DEVICE(cu_seqlens_q);
+  CHECK_DEVICE(cu_seqlens_k);
+
+  at::Tensor block_table;
+  const bool paged_KV = block_table_.has_value();
+  if (paged_KV) {
+    block_table = block_table_.value();
+    CHECK_DEVICE(block_table);
+    TORCH_CHECK(block_table.dtype() == torch::kInt32,
+                "block_table must have dtype torch.int32");
+    TORCH_CHECK(block_table.stride(-1) == 1,
+                "block_table must have contiguous last dimension");
+  }
+
+  TORCH_CHECK(q.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(k.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(v.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  CHECK_CONTIGUOUS(cu_seqlens_q);
+  CHECK_CONTIGUOUS(cu_seqlens_k);
+
+  const auto sizes = q.sizes();
+
+  const int batch_size = cu_seqlens_q.numel() - 1;
+  int num_heads = sizes[1];
+  const int head_size_og = sizes[2];
+  const int num_heads_k = paged_KV ? k.size(2) : k.size(1);
+
+  if (softcap > 0.f) {
+    TORCH_CHECK(p_dropout == 0.f,
+                "Softcapping does not support dropout for now");
+  }
+
+  const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
+  const int num_blocks = !paged_KV ? 0 : k.size(0);
+  const int page_block_size = !paged_KV ? 1 : k.size(1);
+  TORCH_CHECK(!paged_KV || page_block_size % 16 == 0,
+              "Paged KV cache block size must be divisible by 16");
+
+  if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) {
+    is_causal = false;
+  }  // causal=true is the same as causal=false in this case
+  if (is_causal) {
+    window_size_right = 0;
+  }
+
+  void* cu_seqlens_q_d = cu_seqlens_q.data_ptr();
+
+  // Faster to transpose q from (b, 1, (nheads_kv * ngroups), d) to
+  // (b, ngroups, nheads_kv, d) in this case. H/t Daniel Haziza.
+  const int seqlenq_ngroups_swapped =
+      max_seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 &&
+      window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 &&
+      !alibi_slopes_.has_value();
+  const int ngroups = num_heads / num_heads_k;
+  if (seqlenq_ngroups_swapped) {
+    q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og})
+            .transpose(1, 2)
+            .reshape({batch_size * ngroups, num_heads_k, head_size_og});
+    max_seqlen_q = ngroups;
+    num_heads = num_heads_k;
+    cu_seqlens_q_d = nullptr;
+  }
+
+  const int total_q = q.sizes()[0];
+
+  TORCH_CHECK(batch_size > 0, "batch size must be positive");
+  TORCH_CHECK(
+      head_size_og <= 256,
+      "FlashAttention forward only supports head dimension at most 256");
+  TORCH_CHECK(
+      num_heads % num_heads_k == 0,
+      "Number of heads in key/value must divide number of heads in query");
+
+  if (window_size_left >= max_seqlen_k) {
+    window_size_left = -1;
+  }
+  if (window_size_right >= max_seqlen_k) {
+    window_size_right = -1;
+  }
+
+  CHECK_SHAPE(q, total_q, num_heads, head_size_og);
+  if (!paged_KV) {
+    const int total_k = k.size(0);
+    CHECK_SHAPE(k, total_k, num_heads_k, head_size_og);
+    CHECK_SHAPE(v, total_k, num_heads_k, head_size_og);
+  } else {
+    CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size_og);
+    CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, head_size_og);
+    CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq);
+  }
+
+  CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
+  CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
+  if (seqused_k.has_value()) {
+    auto seqused_k_ = seqused_k.value();
+    TORCH_CHECK(seqused_k_.dtype() == torch::kInt32,
+                "seqused_k must have dtype int32");
+    TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device");
+    TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous");
+    CHECK_SHAPE(seqused_k_, batch_size);
+  }
+
+  at::Tensor q_padded, k_padded, v_padded;
+  if (head_size_og % 8 != 0) {
+    q_padded = torch::nn::functional::pad(
+        q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    k_padded = torch::nn::functional::pad(
+        k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    v_padded = torch::nn::functional::pad(
+        v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+  } else {
+    q_padded = q;
+    k_padded = k;
+    v_padded = v;
+  }
+
+  at::Tensor out;
+  if (out_.has_value()) {
+    out = out_.value();
+    TORCH_CHECK(out.dtype() == q_dtype,
+                "Output must have the same dtype as inputs");
+    CHECK_DEVICE(out);
+    TORCH_CHECK(out.stride(-1) == 1,
+                "Output tensor must have contiguous last dimension");
+    CHECK_SHAPE(out, sizes[0], sizes[1], head_size_og);
+    if (seqlenq_ngroups_swapped) {
+      out = out.reshape({batch_size, num_heads_k, ngroups, head_size_og})
+                .transpose(1, 2)
+                .reshape({batch_size * ngroups, num_heads_k, head_size_og});
+    }
+    if (head_size_og % 8 != 0) {
+      out = torch::empty_like(q_padded);
+    }
+  } else {
+    out = torch::empty_like(q_padded);
+  }
+
+  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+  const int head_size = round_multiple(head_size_og, 8);
+  const int head_size_rounded = round_multiple(head_size, 32);
+  const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
+  const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+
+  auto opts = q.options();
+  auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
+  at::Tensor p;
+  // Only return softmax if there's dropout, to reduce compilation time
+  if (return_softmax) {
+    TORCH_CHECK(p_dropout > 0.0f,
+                "return_softmax is only supported when p_dropout > 0.0");
+    p = torch::empty(
+        {batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
+  }
+
+  if (zero_tensors) {
+    out.zero_();
+    softmax_lse.fill_(-std::numeric_limits<float>::infinity());
+    if (return_softmax) {
+      p.zero_();
+    }
+  }
+
+  Flash_fwd_params params;
+  set_params_fprop(
+      params, batch_size, max_seqlen_q, max_seqlen_k, seqlen_q_rounded,
+      seqlen_k_rounded, num_heads, num_heads_k, head_size, head_size_rounded,
+      q_padded, k_padded, v_padded, out, cu_seqlens_q_d,
+      cu_seqlens_k.data_ptr(),
+      seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr,
+      return_softmax ? p.data_ptr() : nullptr, softmax_lse.data_ptr(),
+      p_dropout, softmax_scale, window_size_left, window_size_right, softcap,
+      seqlenq_ngroups_swapped,
+      /*unpadded_lse*/ true);
+  params.total_q = total_q;
+
+  if (paged_KV) {
+    params.block_table = block_table.data_ptr<int>();
+    params.block_table_batch_stride = block_table.stride(0);
+    params.k_batch_stride = k_padded.stride(0);
+    params.v_batch_stride = v_padded.stride(0);
+  }
+  params.page_block_size = page_block_size;
+  // Keep references to these tensors to extend their lifetime
+  at::Tensor softmax_lse_accum, out_accum;
+  if (seqlenq_ngroups_swapped) {
+    // Only apply split-k for decoding
+    std::tie(softmax_lse_accum, out_accum) = set_params_splitkv(
+        params, batch_size, num_heads, head_size, max_seqlen_k, max_seqlen_q,
+        head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts);
+  }
+
+  // number of times random will be generated per thread, to offset philox
+  // counter in the THC random state. We use a custom RNG that increases the
+  // offset by batch_size * nheads * 32.
+  int64_t counter_offset = params.b * params.h * 32;
+  auto options =
+      torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+  auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
+  // Forward kernel will populate memory with the seed and offset.
+  params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
+
+  if (p_dropout > 0.0) {
+    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+        gen_, at::cuda::detail::getDefaultCUDAGenerator());
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    params.philox_args = gen->philox_cuda_state(counter_offset);
+  }
+
+  set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+  if (max_seqlen_k > 0) {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    run_mha_fwd(params, stream, paged_KV);
+  } else {
+    // If seqlen_k == 0, then we have an empty tensor. We need to set the output
+    // to 0.
+    out.zero_();
+    softmax_lse.fill_(std::numeric_limits<float>::infinity());
+  }
+
+  at::Tensor out_padded = out;
+  if (head_size_og % 8 != 0) {
+    out = out.index(
+        {"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
+    if (out_.has_value()) {
+      out_.value().copy_(out);
+    }
+  }
+
+  if (seqlenq_ngroups_swapped) {
+    int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_k,
+                             head_size_og};
+    int64_t size_after[] = {batch_size, num_heads_k * max_seqlen_q,
+                            head_size_og};
+    out = out.reshape(size_before).transpose(1, 2).reshape(size_after);
+    out_padded =
+        out_padded.reshape(size_before).transpose(1, 2).reshape(size_after);
+    q_padded =
+        q_padded.reshape(size_before).transpose(1, 2).reshape(size_after);
+    softmax_lse = softmax_lse.reshape({num_heads * max_seqlen_q, batch_size});
+  }
+
+  return {out,        q_padded,    k_padded, v_padded,
+          out_padded, softmax_lse, p,        rng_state};
+}
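Both entry points rely on the same rounding convention: the head dimension is padded to the next multiple of 8 (and rounded further for the kernel's internal layouts), while sequence lengths are rounded to multiples of 128. A small standalone illustration with arbitrary values:

#include <cstdio>

int main() {
  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
  const int head_size_og = 68;  // not a multiple of 8, so q/k/v get padded on the last dim
  const int pad = head_size_og % 8 != 0 ? 8 - head_size_og % 8 : 0;  // 4
  const int head_size = round_multiple(head_size_og, 8);             // 72
  const int head_size_rounded = round_multiple(head_size, 32);       // 96
  const int seqlen_q_rounded = round_multiple(100, 128);             // 128
  std::printf("pad=%d head_size=%d head_size_rounded=%d seqlen_q_rounded=%d\n",
              pad, head_size, head_size_rounded, seqlen_q_rounded);
  return 0;
}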
+
+std::vector<at::Tensor> mha_fwd_kvcache(
+    at::Tensor& q,  // batch_size x seqlen_q x num_heads x head_size
+    const at::Tensor&
+        kcache,  // batch_size_c x seqlen_k x num_heads_k x head_size or
+                 // num_blocks x page_block_size x num_heads_k x head_size if
+                 // there's a block_table.
+    const at::Tensor&
+        vcache,  // batch_size_c x seqlen_k x num_heads_k x head_size or
+                 // num_blocks x page_block_size x num_heads_k x head_size if
+                 // there's a block_table.
+    const c10::optional<at::Tensor>&
+        k_,  // batch_size x seqlen_knew x num_heads_k x head_size
+    const c10::optional<at::Tensor>&
+        v_,  // batch_size x seqlen_knew x num_heads_k x head_size
+    const c10::optional<at::Tensor>& seqlens_k_,  // batch_size
+    const c10::optional<at::Tensor>&
+        rotary_cos_,  // seqlen_ro x (rotary_dim / 2)
+    const c10::optional<at::Tensor>&
+        rotary_sin_,  // seqlen_ro x (rotary_dim / 2)
+    const c10::optional<at::Tensor>&
+        cache_batch_idx_,  // indices to index into the KV cache
+    const c10::optional<at::Tensor>&
+        block_table_,  // batch_size x max_num_blocks_per_seq
+    const c10::optional<at::Tensor>&
+        alibi_slopes_,  // num_heads or batch_size x num_heads
+    const c10::optional<at::Tensor>&
+        out_,  // batch_size x seqlen_q x num_heads x head_size
+    const double softmax_scale, bool is_causal, int64_t window_size_left,
+    int64_t window_size_right, const double softcap,
+    bool is_rotary_interleaved,  // if true, rotary combines indices 0 & 1, else
+                                 // indices 0 & rotary_dim / 2
+    int64_t num_splits) {
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
+  bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
+  bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
+  TORCH_CHECK(is_sm90 || is_sm8x,
+              "FlashAttention only supports Ampere GPUs or newer.");
+  // We will support Turing in the near future
+  // TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports
+  // Turing GPUs or newer.");
+
+  auto q_dtype = q.dtype();
+  TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+              "FlashAttention only support fp16 and bf16 data type");
+  if (q_dtype == torch::kBFloat16) {
+    TORCH_CHECK(is_sm90 || is_sm8x,
+                "bfloat16 is only supported on Ampere GPUs or newer");
+  }
+  TORCH_CHECK(kcache.dtype() == q_dtype,
+              "query and key must have the same dtype");
+  TORCH_CHECK(vcache.dtype() == q_dtype,
+              "query and value must have the same dtype");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(kcache);
+  CHECK_DEVICE(vcache);
+
+  TORCH_CHECK(q.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(kcache.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(vcache.stride(-1) == 1,
+              "Input tensor must have contiguous last dimension");
+
+  at::Tensor block_table;
+  const bool paged_KV = block_table_.has_value();
+  if (paged_KV) {
+    TORCH_CHECK(!cache_batch_idx_.has_value(),
+                "Paged KVcache does not support cache_batch_idx");
+    block_table = block_table_.value();
+    CHECK_DEVICE(block_table);
+    TORCH_CHECK(block_table.dtype() == torch::kInt32,
+                "block_table must have dtype torch.int32");
+    TORCH_CHECK(block_table.stride(-1) == 1,
+                "block_table must have contiguous last dimension");
+  }
+
+  const auto sizes = q.sizes();
+
+  const int batch_size = sizes[0];
+  int seqlen_q = sizes[1];
+  const int seqlen_q_og = seqlen_q;
+  int num_heads = sizes[2];
+  const int num_heads_og = num_heads;
+  const int head_size_og = sizes[3];
+
+  const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
+  const int num_blocks = !paged_KV ? 0 : kcache.size(0);
+  const int page_block_size = !paged_KV ? 1 : kcache.size(1);
+  TORCH_CHECK(!paged_KV || page_block_size % 16 == 0,
+              "Paged KV cache block size must be divisible by 16");
+  const int seqlen_k =
+      !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size;
+  const int num_heads_k = kcache.size(2);
+  const int batch_size_c = !paged_KV ? kcache.size(0) : batch_size;
+  TORCH_CHECK(batch_size > 0, "batch size must be positive");
+  TORCH_CHECK(
+      head_size_og <= 256,
+      "FlashAttention forward only supports head dimension at most 256");
+  TORCH_CHECK(
+      num_heads % num_heads_k == 0,
+      "Number of heads in key/value must divide number of heads in query");
+
+  // causal=true is the same as causal=false in this case
+  if (seqlen_q == 1 && !alibi_slopes_.has_value()) {
+    is_causal = false;
+  }
+  if (is_causal) {
+    window_size_right = 0;
+  }
+
+  // Faster to transpose q from (b, 1, (nheads_kv * ngroups), d) to
+  // (b, ngroups, nheads_kv, d) in this case. H/t Daniel Haziza.
+  const int seqlenq_ngroups_swapped =
+      seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 &&
+      window_size_right < 0 && head_size_og % 8 == 0 &&
+      !alibi_slopes_.has_value();
+  if (seqlenq_ngroups_swapped) {
+    const int ngroups = num_heads / num_heads_k;
+    q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og})
+            .transpose(1, 2);
+    seqlen_q = ngroups;
+    num_heads = num_heads_k;
+  }
+
+  if (window_size_left >= seqlen_k) {
+    window_size_left = -1;
+  }
+  if (window_size_right >= seqlen_k) {
+    window_size_right = -1;
+  }
+
+  CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og);
+  if (!paged_KV) {
+    CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og);
+    CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, head_size_og);
+  } else {
+    CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og);
+    CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, head_size_og);
+    CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq);
+  }
+
+  at::Tensor q_padded, kcache_padded, vcache_padded;
+  if (head_size_og % 8 != 0) {
+    q_padded = torch::nn::functional::pad(
+        q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    kcache_padded = torch::nn::functional::pad(
+        kcache,
+        torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    vcache_padded = torch::nn::functional::pad(
+        vcache,
+        torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+  } else {
+    q_padded = q;
+    kcache_padded = kcache;
+    vcache_padded = vcache;
+  }
+
+  at::Tensor out;
+  if (out_.has_value()) {
+    out = out_.value();
+    TORCH_CHECK(out.dtype() == q_dtype,
+                "Output must have the same dtype as inputs");
+    CHECK_DEVICE(out);
+    TORCH_CHECK(out.stride(-1) == 1,
+                "Output tensor must have contiguous last dimension");
+    CHECK_SHAPE(out, batch_size, seqlen_q_og, num_heads_og, head_size_og);
+    if (head_size_og % 8 != 0) {
+      out = torch::empty_like(q_padded);
+    } else if (seqlenq_ngroups_swapped) {
+      out = out.reshape({batch_size, num_heads, seqlen_q, head_size_og})
+                .transpose(1, 2);
+    }
+  } else {
+    out = torch::empty_like(q_padded);
+  }
+
+  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+  const int head_size = round_multiple(head_size_og, 8);
+  const int head_size_rounded = round_multiple(head_size, 32);
+  const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+  const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+
+  auto opts = q.options();
+
+  auto softmax_lse =
+      torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
+
+  Flash_fwd_params params;
+  set_params_fprop(params, batch_size, seqlen_q, seqlen_k, seqlen_q_rounded,
+                   seqlen_k_rounded, num_heads, num_heads_k, head_size,
+                   head_size_rounded, q_padded, kcache_padded, vcache_padded,
+                   out,
+                   /*cu_seqlens_q_d=*/nullptr,
+                   /*cu_seqlens_k_d=*/nullptr,
+                   /*seqused_k=*/nullptr,
+                   /*p_ptr=*/nullptr, softmax_lse.data_ptr(),
+                   /*p_dropout=*/0.f, softmax_scale, window_size_left,
+                   window_size_right, softcap);
+
+  at::Tensor k, v, k_padded, v_padded;
+  if (k_.has_value()) {
+    TORCH_CHECK(v_.has_value(),
+                "If key is supplied, value must also be passed in");
+    TORCH_CHECK(seqlens_k_.has_value(),
+                "If key is supplied, seqlens_k must also be passed in");
+    TORCH_CHECK(seqlen_q <= seqlen_k,
+                "If key is supplied, it must have seqlen <= the seqlen of the "
+                "KV cache");
+    k = k_.value();
+    v = v_.value();
+    TORCH_CHECK(k.dtype() == q_dtype, "Key must have the same dtype as query");
+    TORCH_CHECK(v.dtype() == q_dtype,
+                "Value must have the same dtype as query");
+    CHECK_DEVICE(k);
+    CHECK_DEVICE(v);
+    TORCH_CHECK(k.stride(-1) == 1,
+                "Key tensor must have contiguous last dimension");
+    TORCH_CHECK(v.stride(-1) == 1,
+                "Value tensor must have contiguous last dimension");
+    int seqlen_knew = k.size(1);
+    CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og);
+    CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, head_size_og);
+    if (head_size_og % 8 != 0) {
+      k_padded = torch::nn::functional::pad(
+          k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+      v_padded = torch::nn::functional::pad(
+          v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    } else {
+      k_padded = k;
+      v_padded = v;
+    }
+    params.seqlen_knew = seqlen_knew;
+    params.knew_ptr = k_padded.data_ptr();
+    params.vnew_ptr = v_padded.data_ptr();
+    // All strides are in elements, not bytes.
+    params.knew_batch_stride = k_padded.stride(0);
+    params.vnew_batch_stride = v_padded.stride(0);
+    params.knew_row_stride = k_padded.stride(-3);
+    params.vnew_row_stride = v_padded.stride(-3);
+    params.knew_head_stride = k_padded.stride(-2);
+    params.vnew_head_stride = v_padded.stride(-2);
+  }
+
+  if (seqlens_k_.has_value()) {
+    auto seqlens_k = seqlens_k_.value();
+    TORCH_CHECK(seqlens_k.dtype() == torch::kInt32,
+                "seqlens_k must have dtype int32");
+    CHECK_DEVICE(seqlens_k);
+    CHECK_CONTIGUOUS(seqlens_k);
+    CHECK_SHAPE(seqlens_k, batch_size);
+    params.cu_seqlens_k = static_cast<int*>(seqlens_k.data_ptr());
+  }
+  params.is_seqlens_k_cumulative = !(seqlens_k_.has_value());
+
+  if (rotary_cos_.has_value()) {
+    TORCH_CHECK(k_.has_value(),
+                "If rotary cos/sin are provided, new key / value to be "
+                "appended to KV cache must also be provided");
+    auto rotary_cos = rotary_cos_.value();
+    CHECK_DEVICE(rotary_cos);
+    params.rotary_dim = rotary_cos.size(1) * 2;
+    TORCH_CHECK(params.rotary_dim <= head_size,
+                "rotary_dim must be <= headdim");
+    TORCH_CHECK(
+        params.rotary_dim % 16 == 0,
+        "Only rotary dimensions divisible by 16 are currently supported");
+    const int seqlen_ro = rotary_cos.size(0);
+    TORCH_CHECK(seqlen_ro >= seqlen_k,
+                "cos/sin seqlen must be at least the seqlen of KV cache");
+    CHECK_SHAPE(rotary_cos, seqlen_ro, params.rotary_dim / 2);
+    CHECK_CONTIGUOUS(rotary_cos);
+    TORCH_CHECK(rotary_cos.scalar_type() == q_dtype,
+                "rotary_cos must have the same dtype as query");
+
+    TORCH_CHECK(rotary_sin_.has_value(),
+                "If rotary cos is provided, rotary sin must also be provided");
+    auto rotary_sin = rotary_sin_.value();
+    CHECK_DEVICE(rotary_sin);
+    CHECK_SHAPE(rotary_sin, seqlen_ro, params.rotary_dim / 2);
+    CHECK_CONTIGUOUS(rotary_sin);
+    TORCH_CHECK(rotary_sin.scalar_type() == q_dtype,
+                "rotary_cos must have the same dtype as query");
+    params.rotary_cos_ptr = rotary_cos.data_ptr();
+    params.rotary_sin_ptr = rotary_sin.data_ptr();
+    params.is_rotary_interleaved = is_rotary_interleaved;
+  } else {
+    params.rotary_dim = 0;
+  }
+
+  if (cache_batch_idx_.has_value()) {
+    auto cache_batch_idx = cache_batch_idx_.value();
+    CHECK_DEVICE(cache_batch_idx);
+    CHECK_CONTIGUOUS(cache_batch_idx);
+    TORCH_CHECK(cache_batch_idx.scalar_type() == torch::kInt32,
+                "cache_batch_idx must have dtype int32");
+    params.cache_batch_idx = reinterpret_cast<int*>(cache_batch_idx.data_ptr());
+  }
+
+  // Keep references to these tensors to extend their lifetime
+  at::Tensor softmax_lse_accum, out_accum;
+  std::tie(softmax_lse_accum, out_accum) = set_params_splitkv(
+      params, batch_size, num_heads, head_size, seqlen_k, seqlen_q,
+      head_size_rounded, /*dropout*/ 0.f, num_splits, dprops, opts);
+
+  if (paged_KV) {
+    params.block_table = block_table.data_ptr<int>();
+    params.block_table_batch_stride = block_table.stride(0);
+  }
+  params.page_block_size = page_block_size;
+
+  set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+  // Only split kernel supports appending to KV cache, or indexing to the cache
+  // with cache_batch_idx, or paged KV cache
+  run_mha_fwd(params, stream, /*force_split_kernel=*/k_.has_value() ||
+                                  cache_batch_idx_.has_value() || paged_KV);
+
+  if (head_size_og % 8 != 0) {
+    out = out.index(
+        {"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
+    if (out_.has_value()) {
+      out_.value().copy_(out);
+    }
+    if (k_.has_value()) {
+      // It's expensive to copy the KV cache here for the case where head size
+      // is not divisible by 8, but we don't expect to hit this case in practice.
+      // This is just so that the code works for that case.
+      kcache.copy_(kcache_padded.index(
+          {"...",
+           torch::indexing::Slice(torch::indexing::None, head_size_og)}));
+      vcache.copy_(vcache_padded.index(
+          {"...",
+           torch::indexing::Slice(torch::indexing::None, head_size_og)}));
+    }
+  }
+
+  if (seqlenq_ngroups_swapped) {
+    out = out.transpose(1, 2).reshape(
+        {batch_size, 1, num_heads_k * seqlen_q, head_size_og});
+    softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
+  }
+  return {out, softmax_lse};
+}
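When a block_table is supplied, seqlen_k above is max_num_blocks_per_seq * page_block_size and the table maps each sequence's logical blocks to physical blocks of the cache. A hypothetical helper (names invented here for illustration, not taken from the kernels) showing that indexing:

#include <cstdio>

// Hypothetical: locate token `pos` of sequence `b` in a paged KV cache laid out as
// [num_blocks, page_block_size, num_heads_k, head_size].
struct PagedPos { int physical_block; int in_block_offset; };

PagedPos locate(const int* block_table, int block_table_batch_stride, int b, int pos,
                int page_block_size) {
  const int logical_block = pos / page_block_size;
  return {block_table[b * block_table_batch_stride + logical_block],
          pos % page_block_size};
}

int main() {
  const int block_table[] = {5, 2, 9};  // one sequence spanning three physical blocks
  const PagedPos p = locate(block_table, 3, 0, 35, 16);  // block size 16 (divisible by 16)
  std::printf("block=%d offset=%d\n", p.physical_block, p.in_block_offset);  // block=9 offset=3
  return 0;
}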

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim128_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 128, true>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim128<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim128_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 128, false>(Flash_fwd_params& params,
+                                                   cudaStream_t stream) {
+  run_mha_fwd_hdim128<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim128_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 128, true>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim128<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim128_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 128, false>(Flash_fwd_params& params,
+                                               cudaStream_t stream) {
+  run_mha_fwd_hdim128<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim160_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 160, true>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim160<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim160_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 160, false>(Flash_fwd_params& params,
+                                                   cudaStream_t stream) {
+  run_mha_fwd_hdim160<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim160_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 160, true>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim160<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim160_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 160, false>(Flash_fwd_params& params,
+                                               cudaStream_t stream) {
+  run_mha_fwd_hdim160<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim192_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 192, true>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim192<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim192_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 192, false>(Flash_fwd_params& params,
+                                                   cudaStream_t stream) {
+  run_mha_fwd_hdim192<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim192_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 192, true>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim192<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim192_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 192, false>(Flash_fwd_params& params,
+                                               cudaStream_t stream) {
+  run_mha_fwd_hdim192<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim224_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 224, true>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim224<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim224_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 224, false>(Flash_fwd_params& params,
+                                                   cudaStream_t stream) {
+  run_mha_fwd_hdim224<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim224_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 224, true>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim224<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim224_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 224, false>(Flash_fwd_params& params,
+                                               cudaStream_t stream) {
+  run_mha_fwd_hdim224<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim256_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 256, true>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim256<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim256_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 256, false>(Flash_fwd_params& params,
+                                                   cudaStream_t stream) {
+  run_mha_fwd_hdim256<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim256_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 256, true>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim256<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim256_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 256, false>(Flash_fwd_params& params,
+                                               cudaStream_t stream) {
+  run_mha_fwd_hdim256<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim32_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 32, true>(Flash_fwd_params& params,
+                                                 cudaStream_t stream) {
+  run_mha_fwd_hdim32<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim32_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 32, false>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim32<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim32_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 32, true>(Flash_fwd_params& params,
+                                             cudaStream_t stream) {
+  run_mha_fwd_hdim32<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim32_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 32, false>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim32<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim64_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 64, true>(Flash_fwd_params& params,
+                                                 cudaStream_t stream) {
+  run_mha_fwd_hdim64<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim64_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 64, false>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim64<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim64_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 64, true>(Flash_fwd_params& params,
+                                             cudaStream_t stream) {
+  run_mha_fwd_hdim64<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim64_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 64, false>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim64<cutlass::half_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim96_bf16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 96, true>(Flash_fwd_params& params,
+                                                 cudaStream_t stream) {
+  run_mha_fwd_hdim96<cutlass::bfloat16_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim96_bf16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::bfloat16_t, 96, false>(Flash_fwd_params& params,
+                                                  cudaStream_t stream) {
+  run_mha_fwd_hdim96<cutlass::bfloat16_t, false>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim96_fp16_causal_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 96, true>(Flash_fwd_params& params,
+                                             cudaStream_t stream) {
+  run_mha_fwd_hdim96<cutlass::half_t, true>(params, stream);
+}

+ 11 - 0
kernels/flash_attn/flash_fwd_hdim96_fp16_sm80.cu

@@ -0,0 +1,11 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up
+// compilation. This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template <>
+void run_mha_fwd_<cutlass::half_t, 96, false>(Flash_fwd_params& params,
+                                              cudaStream_t stream) {
+  run_mha_fwd_hdim96<cutlass::half_t, false>(params, stream);
+}
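Each file above supplies one explicit instantiation of run_mha_fwd_<element, head_dim, is_causal>, so only the specializations that are actually needed get compiled, and each one compiles in its own translation unit. A hedged sketch of the kind of head-dimension dispatch that would pick among them (the real dispatcher is run_mha_fwd, defined elsewhere in this diff; this is not its code):

#include "flash.h"  // assumes the Flash_fwd_params / run_mha_fwd_ declarations from this diff

template <typename Element, bool Is_causal>
void dispatch_headdim(Flash_fwd_params& params, cudaStream_t stream, int headdim) {
  // head_size has already been rounded up to a multiple of 8 and capped at 256,
  // so each bucket below maps onto one of the compiled kernels above.
  if (headdim <= 32) {
    run_mha_fwd_<Element, 32, Is_causal>(params, stream);
  } else if (headdim <= 64) {
    run_mha_fwd_<Element, 64, Is_causal>(params, stream);
  } else if (headdim <= 96) {
    run_mha_fwd_<Element, 96, Is_causal>(params, stream);
  } else if (headdim <= 128) {
    run_mha_fwd_<Element, 128, Is_causal>(params, stream);
  } else if (headdim <= 160) {
    run_mha_fwd_<Element, 160, Is_causal>(params, stream);
  } else if (headdim <= 192) {
    run_mha_fwd_<Element, 192, Is_causal>(params, stream);
  } else if (headdim <= 224) {
    run_mha_fwd_<Element, 224, Is_causal>(params, stream);
  } else {
    run_mha_fwd_<Element, 256, Is_causal>(params, stream);
  }
}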

+ 1715 - 0
kernels/flash_attn/flash_fwd_kernel.h

@@ -0,0 +1,1715 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+#include <cutlass/cutlass.h>
+#include <cutlass/array.h>
+#include <cutlass/numeric_types.h>
+
+#include "block_info.h"
+#include "kernel_traits.h"
+#include "utils.h"
+#include "softmax.h"
+#include "mask.h"
+#include "dropout.h"
+#include "rotary.h"
+
+namespace flash {
+
+using namespace cute;
+
+template <typename Engine, typename Layout>
+__forceinline__ __device__ void apply_softcap(Tensor<Engine, Layout>& tensor,
+                                              const float softcap) {
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = cutlass::fast_tanh(tensor(i) * softcap);
+  }
+}
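apply_softcap only performs the tanh; the end-to-end soft-capping of a raw attention score is the smooth squashing below, assuming (this is an assumption about set_params_fprop, defined earlier in this diff) that the host folds softmax_scale / cap into the value this kernel receives and folds the cap itself back into the softmax scale:

#include <cmath>

// Reference semantics only (a sketch, not the kernel's code path): the raw score
// is squashed smoothly into (-cap, cap) instead of growing without bound.
inline float softcap_reference(float score, float softmax_scale, float cap) {
  return cap * std::tanh(score * softmax_scale / cap);
}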
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename ElementAccum, typename Params, int kBlockM, bool Is_even_MN>
+__forceinline__ __device__ auto get_lse_tile(
+    const Params& params, const int bidb, const int bidh, const int m_block,
+    const BlockInfo</*Varlen=*/!Is_even_MN>& binfo) {
+  // When params.unpadded_lse is false, LSE is written as (b, h, seqlen_q) -
+  // this is the non-varlen path. Otherwise, when
+  // params.seqlenq_ngroups_swapped is true, it is written as (h, seqlen_q, b)
+  // to account for the seqlen_q <-> h swapping trick. Otherwise, it's written
+  // as (h, b, seqlen_q).
+  const bool varlen_q = params.unpadded_lse && !params.seqlenq_ngroups_swapped;
+  auto lse_offset = varlen_q ? binfo.q_offset(params.seqlen_q, 1, bidb) : 0;
+  auto gmem_ptr_lse = make_gmem_ptr(
+      reinterpret_cast<ElementAccum*>(params.softmax_lse_ptr) + lse_offset);
+
+  auto lse_shape = varlen_q ? make_shape(1, params.h, params.total_q)
+                            : make_shape(params.b, params.h, params.seqlen_q);
+  auto lse_stride =
+      params.seqlenq_ngroups_swapped
+          ? make_stride(1, params.seqlen_q * params.b, params.b)
+          : (params.unpadded_lse
+                 ? make_stride(params.h * params.total_q, params.total_q, 1)
+                 : make_stride(params.h * params.seqlen_q, params.seqlen_q, 1));
+
+  auto lse_layout = make_layout(lse_shape, lse_stride);
+  Tensor mLSE = make_tensor(gmem_ptr_lse, lse_layout);
+  auto mLSE_slice = varlen_q ? mLSE(0, bidh, _) : mLSE(bidb, bidh, _);
+  return local_tile(mLSE_slice, Shape<Int<kBlockM>>{}, make_coord(m_block));
+}
+
+template <typename Kernel_traits, bool Is_dropout, bool Is_causal,
+          bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K,
+          bool Is_softcap, bool Return_softmax, typename Params>
+inline __device__ void compute_attn_1rowblock(const Params& params,
+                                              const int bidb, const int bidh,
+                                              const int m_block) {
+  using Element = typename Kernel_traits::Element;
+  using ElementAccum = typename Kernel_traits::ElementAccum;
+  using index_t = typename Kernel_traits::index_t;
+
+  // Shared memory.
+  extern __shared__ char smem_[];
+
+  // The thread index.
+  const int tidx = threadIdx.x;
+
+  constexpr int kBlockM = Kernel_traits::kBlockM;
+  constexpr int kBlockN = Kernel_traits::kBlockN;
+  constexpr int kHeadDim = Kernel_traits::kHeadDim;
+  constexpr int kNWarps = Kernel_traits::kNWarps;
+
+  auto seed_offset = at::cuda::philox::unpack(params.philox_args);
+  flash::Dropout dropout(std::get<0>(seed_offset), std::get<1>(seed_offset),
+                         params.p_dropout_in_uint8_t, bidb, bidh, tidx,
+                         params.h);
+
+  // Save seed and offset for backward, before any early exiting. Otherwise the
+  // 0-th thread block might exit early and no one saves the rng states.
+  if (Is_dropout && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
+      tidx == 0) {
+    params.rng_state[0] = std::get<0>(seed_offset);
+    params.rng_state[1] = std::get<1>(seed_offset);
+  }
+
+  const BlockInfo</*Varlen=*/!Is_even_MN> binfo(params, bidb);
+  if (m_block * kBlockM >= binfo.actual_seqlen_q) return;
+
+  const int n_block_min =
+      !Is_local
+          ? 0
+          : std::max(0, (m_block * kBlockM + binfo.actual_seqlen_k -
+                         binfo.actual_seqlen_q - params.window_size_left) /
+                            kBlockN);
+  int n_block_max = cute::ceil_div(binfo.actual_seqlen_k, kBlockN);
+  if (Is_causal || Is_local) {
+    n_block_max = std::min(
+        n_block_max,
+        cute::ceil_div((m_block + 1) * kBlockM + binfo.actual_seqlen_k -
+                           binfo.actual_seqlen_q + params.window_size_right,
+                       kBlockN));
+    // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) {
+    //     printf("m_block = %d, n_block_max = %d\n", m_block, n_block_max);
+    // }
+  }
+  // We exit early and write 0 to gO and gLSE. This also covers the case where
+  // actual_seqlen_k == 0. Otherwise we might read OOB elements from gK and gV.
+  if ((Is_causal || Is_local || !Is_even_MN) && n_block_max <= n_block_min) {
+    Tensor mO = make_tensor(
+        make_gmem_ptr(
+            reinterpret_cast<Element*>(params.o_ptr) +
+            binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb)),
+        make_shape(binfo.actual_seqlen_q, params.h, params.d),
+        make_stride(params.o_row_stride, params.o_head_stride, _1{}));
+    Tensor gO = local_tile(mO(_, bidh, _), Shape<Int<kBlockM>, Int<kHeadDim>>{},
+                           make_coord(m_block, 0));  // (kBlockM, kHeadDim)
+
+    Tensor gLSE = get_lse_tile<ElementAccum, Params, kBlockM, Is_even_MN>(
+        params, bidb, bidh, m_block, binfo);
+
+    typename Kernel_traits::GmemTiledCopyO gmem_tiled_copy_O;
+    auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(tidx);
+    Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
+    Tensor tOrO = make_tensor<Element>(shape(tOgO));
+    clear(tOrO);
+    // Construct identity layout for sO
+    Tensor cO = make_identity_tensor(make_shape(
+        size<0>(gO), size<1>(gO)));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    // Repeat the partitioning with identity layouts
+    Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
+    Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
+    if (!Is_even_K) {
+#pragma unroll
+      for (int k = 0; k < size(tOpO); ++k) {
+        tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d;
+      }
+    }
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/false,
+                /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO,
+        binfo.actual_seqlen_q - m_block * kBlockM);
+#pragma unroll
+    for (int m = 0; m < size<1>(tOgO); ++m) {
+      const int row = get<0>(tOcO(0, m, 0));
+      if (row < binfo.actual_seqlen_q - m_block * kBlockM &&
+          get<1>(tOcO(0, m, 0)) == 0) {
+        gLSE(row) = INFINITY;
+      }
+    }
+    return;
+  }
+  // if (tidx == 0) { printf("m_block = %d, n_block_min = %d, n_block_max =
+  // %d\n", m_block, n_block_min, n_block_max); }
+
+  // We iterate over the blocks in reverse order. This is because the last block
+  // is the only one that needs masking when we read K and V from global memory.
+  // Moreover, iterating in reverse might save us 1 register (we just need
+  // n_block instead of both n_block and n_block_max).
+
+  const index_t row_offset_p =
+      ((bidb * params.h + bidh) * params.seqlen_q_rounded + m_block * kBlockM) *
+          params.seqlen_k_rounded +
+      (n_block_max - 1) * kBlockN;
+
+  Tensor mQ =
+      make_tensor(make_gmem_ptr(reinterpret_cast<Element*>(params.q_ptr) +
+                                binfo.q_offset(params.q_batch_stride,
+                                               params.q_row_stride, bidb)),
+                  make_shape(binfo.actual_seqlen_q, params.h, params.d),
+                  make_stride(params.q_row_stride, params.q_head_stride, _1{}));
+  Tensor gQ = local_tile(mQ(_, bidh, _), Shape<Int<kBlockM>, Int<kHeadDim>>{},
+                         make_coord(m_block, 0));  // (kBlockM, kHeadDim)
+  Tensor mK =
+      make_tensor(make_gmem_ptr(reinterpret_cast<Element*>(params.k_ptr) +
+                                binfo.k_offset(params.k_batch_stride,
+                                               params.k_row_stride, bidb)),
+                  make_shape(binfo.actual_seqlen_k, params.h_k, params.d),
+                  make_stride(params.k_row_stride, params.k_head_stride, _1{}));
+  Tensor gK = local_tile(mK(_, bidh / params.h_h_k_ratio, _),
+                         Shape<Int<kBlockN>, Int<kHeadDim>>{},
+                         make_coord(_, 0));  // (kBlockN, kHeadDim, nblocksN)
+  Tensor mV =
+      make_tensor(make_gmem_ptr(reinterpret_cast<Element*>(params.v_ptr) +
+                                binfo.k_offset(params.v_batch_stride,
+                                               params.v_row_stride, bidb)),
+                  make_shape(binfo.actual_seqlen_k, params.h_k, params.d),
+                  make_stride(params.v_row_stride, params.v_head_stride, _1{}));
+  Tensor gV = local_tile(mV(_, bidh / params.h_h_k_ratio, _),
+                         Shape<Int<kBlockN>, Int<kHeadDim>>{},
+                         make_coord(_, 0));  // (kBlockN, kHeadDim, nblocksN)
+  Tensor gP = make_tensor(
+      make_gmem_ptr(reinterpret_cast<Element*>(params.p_ptr) + row_offset_p),
+      Shape<Int<kBlockM>, Int<kBlockN>>{},
+      make_stride(params.seqlen_k_rounded, _1{}));
+
+  Tensor sQ = make_tensor(make_smem_ptr(reinterpret_cast<Element*>(smem_)),
+                          typename Kernel_traits::SmemLayoutQ{});
+  // Careful: we're using the same smem for sQ and sK|sV if Share_Q_K_smem.
+  Tensor sK =
+      make_tensor(sQ.data() + (Kernel_traits::Share_Q_K_smem ? 0 : size(sQ)),
+                  typename Kernel_traits::SmemLayoutKV{});
+
+  Tensor sV =
+      make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutKV{});
+  Tensor sVt =
+      make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{});
+  Tensor sVtNoSwizzle =
+      make_tensor(sV.data().get(),
+                  typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{});
+
+  typename Kernel_traits::GmemTiledCopyQKV gmem_tiled_copy_QKV;
+  auto gmem_thr_copy_QKV = gmem_tiled_copy_QKV.get_thread_slice(tidx);
+
+  Tensor tQgQ = gmem_thr_copy_QKV.partition_S(gQ);
+  Tensor tQsQ = gmem_thr_copy_QKV.partition_D(sQ);
+  Tensor tKgK =
+      gmem_thr_copy_QKV.partition_S(gK);  // (KCPY, KCPY_N, KCPY_K, nblocksN)
+  Tensor tKsK = gmem_thr_copy_QKV.partition_D(sK);
+  Tensor tVgV =
+      gmem_thr_copy_QKV.partition_S(gV);  // (VCPY, VCPY_N, VCPY_K, nblocksN)
+  Tensor tVsV = gmem_thr_copy_QKV.partition_D(sV);
+
+  typename Kernel_traits::TiledMma tiled_mma;
+  auto thr_mma = tiled_mma.get_thread_slice(tidx);
+  Tensor tSrQ = thr_mma.partition_fragment_A(sQ);  // (MMA,MMA_M,MMA_K)
+  Tensor tSrK = thr_mma.partition_fragment_B(sK);  // (MMA,MMA_N,MMA_K)
+  Tensor tOrVt =
+      thr_mma.partition_fragment_B(sVtNoSwizzle);  // (MMA, MMA_K,MMA_N)
+
+  Tensor tSgS = thr_mma.partition_C(gP);
+
+  Tensor acc_o = partition_fragment_C(
+      tiled_mma, Shape<Int<kBlockM>, Int<kHeadDim>>{});  // MMA, MMA_M, MMA_K
+
+  //
+  // Copy Atom retiling
+  //
+
+  auto smem_tiled_copy_Q =
+      make_tiled_copy_A(typename Kernel_traits::SmemCopyAtom{}, tiled_mma);
+  auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(tidx);
+  // if (cute::thread0()) {smem_thr_copy_Q.print_all();}
+  Tensor tSsQ = smem_thr_copy_Q.partition_S(sQ);
+  // if (cute::thread0()) {print(tSsQ.layout()); printf("\n");}
+
+  auto smem_tiled_copy_K =
+      make_tiled_copy_B(typename Kernel_traits::SmemCopyAtom{}, tiled_mma);
+  auto smem_thr_copy_K = smem_tiled_copy_K.get_thread_slice(tidx);
+  Tensor tSsK = smem_thr_copy_K.partition_S(sK);
+
+  auto smem_tiled_copy_V = make_tiled_copy_B(
+      typename Kernel_traits::SmemCopyAtomTransposed{}, tiled_mma);
+  auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(tidx);
+  Tensor tOsVt = smem_thr_copy_V.partition_S(sVt);
+
+  //
+  // PREDICATES
+  //
+
+  // // Allocate predicate tensors for m and n
+  // Tensor tQpQ = make_tensor<bool>(make_shape(size<1>(tQsQ), size<2>(tQsQ)),
+  // Stride<_1,_0>{}); Tensor tKVpKV =
+  // make_tensor<bool>(make_shape(size<1>(tKsK), size<2>(tKsK)),
+  // Stride<_1,_0>{});
+
+  // Construct identity layout for sQ and sK
+  Tensor cQ = make_identity_tensor(
+      make_shape(size<0>(sQ), size<1>(sQ)));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+  Tensor cKV = make_identity_tensor(
+      make_shape(size<0>(sK), size<1>(sK)));  // (BLK_N,BLK_K) -> (blk_n,blk_k)
+  // Tensor tScQ = thr_mma.partition_A(cQ);  // (MMA,MMA_M,MMA_K)
+  // if (cute::thread0()) {
+  //     print(tScQ.layout()); printf("\n");
+  //     for (int i = 0; i < size(tScQ); ++i) {
+  //         printf("%d ", get<0>(tScQ(i)));
+  //     }
+  //     printf("\n");
+  //     for (int i = 0; i < size(tScQ); ++i) {
+  //         printf("%d ", get<1>(tScQ(i)));
+  //     }
+  //     printf("\n");
+  // }
+
+  // Repeat the partitioning with identity layouts
+  Tensor tQcQ = gmem_thr_copy_QKV.partition_S(
+      cQ);  // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+  Tensor tKVcKV = gmem_thr_copy_QKV.partition_S(
+      cKV);  // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+
+  // Allocate predicate tensors for k
+  Tensor tQpQ = make_tensor<bool>(make_shape(size<2>(tQsQ)));
+  Tensor tKVpKV = make_tensor<bool>(make_shape(size<2>(tKsK)));
+
+  // Set predicates for k bounds
+  if (!Is_even_K) {
+#pragma unroll
+    for (int k = 0; k < size(tQpQ); ++k) {
+      tQpQ(k) = get<1>(tQcQ(0, 0, k)) < params.d;
+    }
+#pragma unroll
+    for (int k = 0; k < size(tKVpKV); ++k) {
+      tKVpKV(k) = get<1>(tKVcKV(0, 0, k)) < params.d;
+    }
+  }
+
+  // Prologue
+
+  // We don't need to clear the sQ smem tiles since we'll only write out the
+  // valid outputs
+  flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tQgQ, tQsQ, tQcQ,
+                                     tQpQ,
+                                     binfo.actual_seqlen_q - m_block * kBlockM);
+  if (Kernel_traits::Is_Q_in_regs) {
+    cute::cp_async_fence();
+  }
+
+  // // if (cute::thread(1, 0)) { print(tQsQ); }
+  // // Tensor sQNoSwizzle = make_tensor(make_smem_ptr(reinterpret_cast<Element
+  // *>(smem_)), typename Kernel_traits::SmemLayoutQNoSwizzle{});
+  // // if (cute::thread0()) { print(sQNoSwizzle); }
+
+  if (Kernel_traits::Share_Q_K_smem) {
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
+    CUTE_STATIC_ASSERT_V(size<1>(tSsQ) == size<1>(tSrQ_copy_view));  // M
+    cute::copy(smem_tiled_copy_Q, tSsQ, tSrQ_copy_view);
+    __syncthreads();
+  }
+
+  int n_block = n_block_max - 1;
+  // We don't need to clear the sK smem tiles since we'll mask out the scores
+  // anyway.
+  flash::copy<Is_even_MN, Is_even_K>(
+      gmem_tiled_copy_QKV, tKgK(_, _, _, n_block), tKsK, tKVcKV, tKVpKV,
+      binfo.actual_seqlen_k - n_block * kBlockN);
+  cute::cp_async_fence();
+  // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z < 2) { print(tKgK); }
+  // __syncthreads();
+
+  if (Kernel_traits::Is_Q_in_regs && !Kernel_traits::Share_Q_K_smem) {
+    flash::cp_async_wait<1>();
+    __syncthreads();
+    Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
+    CUTE_STATIC_ASSERT_V(size<1>(tSsQ) == size<1>(tSrQ_copy_view));  // M
+    cute::copy(smem_tiled_copy_Q, tSsQ, tSrQ_copy_view);
+  }
+
+  clear(acc_o);
+
+  flash::Softmax<2 * size<1>(acc_o)> softmax;
+
+  const float alibi_slope =
+      !Has_alibi || params.alibi_slopes_ptr == nullptr
+          ? 0.0f
+          : reinterpret_cast<float*>(params.alibi_slopes_ptr)
+                    [bidb * params.alibi_slopes_batch_stride + bidh] /
+                params.scale_softmax;
+  flash::Mask<Is_causal, Is_local, Has_alibi> mask(
+      binfo.actual_seqlen_k, binfo.actual_seqlen_q, params.window_size_left,
+      params.window_size_right, alibi_slope);
+
+  // For performance reasons, we separate out two kinds of iterations:
+  // those that need masking on S, and those that don't.
+  // We need masking on S for the very last block when the K and V length is
+  // not a multiple of kBlockN. We also need masking on S if it's causal, for
+  // the last ceil_div(kBlockM, kBlockN) blocks. We will have at least 1
+  // "masking" iteration.
+
+  // If not even_N, then seqlen_k might end in the middle of a block. In that
+  // case we need to mask 2 blocks (e.g. when kBlockM == kBlockN), not just 1.
+  constexpr int n_masking_steps =
+      (!Is_causal && !Is_local)
+          ? 1
+          : ((Is_even_MN && Is_causal) ? cute::ceil_div(kBlockM, kBlockN)
+                                       : cute::ceil_div(kBlockM, kBlockN) + 1);
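+  // E.g. with kBlockM = 128 and kBlockN = 64 (tile sizes chosen only for
+  // illustration), a causal, evenly-sized problem runs ceil_div(128, 64) = 2
+  // masking iterations; an uneven seqlen_k adds one more for the partially
+  // filled last K/V block.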
+#pragma unroll
+  for (int masking_step = 0; masking_step < n_masking_steps;
+       ++masking_step, --n_block) {
+    Tensor acc_s = partition_fragment_C(
+        tiled_mma,
+        Shape<Int<kBlockM>, Int<kBlockN>>{});  // (MMA=4, MMA_M, MMA_N)
+    clear(acc_s);
+    flash::cp_async_wait<0>();
+    __syncthreads();
+
+    // Advance gV
+    if (masking_step > 0) {
+      flash::copy</*Is_even_MN=*/true, Is_even_K>(
+          gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tKVcKV, tKVpKV);
+    } else {
+      // Clear the smem tiles to account for predicated off loads
+      flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/true>(
+          gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tKVcKV, tKVpKV,
+          binfo.actual_seqlen_k - n_block * kBlockN);
+    }
+    cute::cp_async_fence();
+
+    flash::gemm</*A_in_regs=*/Kernel_traits::Is_Q_in_regs>(
+        acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_tiled_copy_Q,
+        smem_tiled_copy_K, smem_thr_copy_Q, smem_thr_copy_K);
+    // if (cute::thread0()) { print(acc_s); }
+    if constexpr (Is_softcap) {
+      apply_softcap(acc_s, params.softcap);
+    }
+
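+    // The mask gets this thread's first accumulator row: tidx / 32 picks the
+    // warp's 16-row slab of the M tile, (tidx % 32) / 4 is the thread's row
+    // within it (SM80 m16n8k16 accumulator layout), and kNWarps * 16 is the
+    // row stride between successive MMA_M tiles.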
+    mask.template apply_mask<Is_causal, Is_even_MN>(
+        acc_s, n_block * kBlockN,
+        m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4, kNWarps * 16);
+
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    if (n_block > n_block_min) {
+      flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV,
+                                                  tKgK(_, _, _, n_block - 1),
+                                                  tKsK, tKVcKV, tKVpKV);
+      // This cp_async_fence needs to be in the if block, otherwise the
+      // synchronization isn't right and we get race conditions.
+      cute::cp_async_fence();
+    }
+
+    // TODO: when we have key_padding_mask we'll need to Check_inf
+    masking_step == 0
+        ? softmax.template softmax_rescale_o<
+              /*Is_first=*/true, /*Check_inf=*/Is_causal || Is_local>(
+              acc_s, acc_o, params.scale_softmax_log2)
+        : softmax.template softmax_rescale_o<
+              /*Is_first=*/false, /*Check_inf=*/Is_causal || Is_local>(
+              acc_s, acc_o, params.scale_softmax_log2);
+
+    // Convert acc_s from fp32 to fp16/bf16
+    Tensor rP = flash::convert_type<Element>(acc_s);
+    int block_row_idx = m_block * (kBlockM / 16) + tidx / 32;
+    int block_col_idx = n_block * (kBlockN / 32);
+    if (Return_softmax) {
+      Tensor rP_drop = make_fragment_like(rP);
+      cute::copy(rP, rP_drop);
+      dropout.template apply_dropout</*encode_dropout_in_sign_bit=*/true>(
+          rP_drop, block_row_idx, block_col_idx, kNWarps);
+      cute::copy(rP_drop, tSgS);
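+      // gP spans seqlen_k_rounded columns and n_block decreases each
+      // iteration, so the softmax-dump pointer below is rewound by kBlockN
+      // columns rather than advanced.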
+      tSgS.data() = tSgS.data() + (-kBlockN);
+    }
+    if (Is_dropout) {
+      dropout.apply_dropout(rP, block_row_idx, block_col_idx, kNWarps);
+    }
+
+    // Reshape rP from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+    // if using m16n8k16 or (4, MMA_M, MMA_N) if using m16n8k8.
+    Tensor tOrP = make_tensor(
+        rP.data(),
+        flash::convert_layout_acc_Aregs<Kernel_traits::TiledMma>(rP.layout()));
+    // if (cute::thread0()) { print(tOrP); }
+    flash::gemm_rs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_tiled_copy_V,
+                   smem_thr_copy_V);
+    // if (cute::thread0()) { print(scores); }
+
+    // This check is at the end of the loop since we always have at least 1
+    // iteration
+    if (n_masking_steps > 1 && n_block <= n_block_min) {
+      --n_block;
+      break;
+    }
+  }
+
+  // These are the iterations where we don't need masking on S
+  for (; n_block >= n_block_min; --n_block) {
+    Tensor acc_s = partition_fragment_C(
+        tiled_mma,
+        Shape<Int<kBlockM>, Int<kBlockN>>{});  // (MMA=4, MMA_M, MMA_N)
+    clear(acc_s);
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    flash::copy</*Is_even_MN=*/true, Is_even_K>(
+        gmem_tiled_copy_QKV, tVgV(_, _, _, n_block), tVsV, tKVcKV, tKVpKV);
+    cute::cp_async_fence();
+
+    flash::gemm</*A_in_regs=*/Kernel_traits::Is_Q_in_regs>(
+        acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_tiled_copy_Q,
+        smem_tiled_copy_K, smem_thr_copy_Q, smem_thr_copy_K);
+    if constexpr (Is_softcap) {
+      apply_softcap(acc_s, params.softcap);
+    }
+
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    if (n_block > n_block_min) {
+      flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV,
+                                                  tKgK(_, _, _, n_block - 1),
+                                                  tKsK, tKVcKV, tKVpKV);
+      // This cp_async_fence needs to be in the if block, otherwise the
+      // synchronization isn't right and we get race conditions.
+      cute::cp_async_fence();
+    }
+
+    mask.template apply_mask</*Causal_mask=*/false>(
+        acc_s, n_block * kBlockN,
+        m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4, kNWarps * 16);
+
+    softmax
+        .template softmax_rescale_o</*Is_first=*/false, /*Check_inf=*/Is_local>(
+            acc_s, acc_o, params.scale_softmax_log2);
+
+    Tensor rP = flash::convert_type<Element>(acc_s);
+    int block_row_idx = m_block * (kBlockM / 16) + tidx / 32;
+    int block_col_idx = n_block * (kBlockN / 32);
+    if (Return_softmax) {
+      Tensor rP_drop = make_fragment_like(rP);
+      cute::copy(rP, rP_drop);
+      dropout.template apply_dropout</*encode_dropout_in_sign_bit=*/true>(
+          rP_drop, block_row_idx, block_col_idx, kNWarps);
+      cute::copy(rP_drop, tSgS);
+      tSgS.data() = tSgS.data() + (-kBlockN);
+    }
+    if (Is_dropout) {
+      dropout.apply_dropout(rP, block_row_idx, block_col_idx, kNWarps);
+    }
+
+    // Reshape rP from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+    // if using m16n8k16 or (4, MMA_M, MMA_N) if using m16n8k8.
+    Tensor tOrP = make_tensor(
+        rP.data(),
+        flash::convert_layout_acc_Aregs<Kernel_traits::TiledMma>(rP.layout()));
+    flash::gemm_rs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_tiled_copy_V,
+                   smem_thr_copy_V);
+  }
+
+  // Epilogue
+
+  Tensor lse = softmax.template normalize_softmax_lse<Is_dropout>(
+      acc_o, params.scale_softmax, params.rp_dropout);
+
+  // Convert acc_o from fp32 to fp16/bf16
+  Tensor rO = flash::convert_type<Element>(acc_o);
+  Tensor sO = make_tensor(
+      sQ.data(), typename Kernel_traits::SmemLayoutO{});  // (SMEM_M,SMEM_N)
+  // Partition sO to match the accumulator partitioning
+  auto smem_tiled_copy_O =
+      make_tiled_copy_C(typename Kernel_traits::SmemCopyAtomO{}, tiled_mma);
+  auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(tidx);
+  Tensor taccOrO =
+      smem_thr_copy_O.retile_S(rO);  // ((Atom,AtomNum), MMA_M, MMA_N)
+  Tensor taccOsO =
+      smem_thr_copy_O.partition_D(sO);  // ((Atom,AtomNum),PIPE_M,PIPE_N)
+
+  // sO has the same size as sQ, so we don't need to sync here.
+  if (Kernel_traits::Share_Q_K_smem) {
+    __syncthreads();
+  }
+
+  cute::copy(smem_tiled_copy_O, taccOrO, taccOsO);
+
+  Tensor mO =
+      make_tensor(make_gmem_ptr(reinterpret_cast<Element*>(params.o_ptr) +
+                                binfo.q_offset(params.o_batch_stride,
+                                               params.o_row_stride, bidb)),
+                  make_shape(binfo.actual_seqlen_q, params.h, params.d),
+                  make_stride(params.o_row_stride, params.o_head_stride, _1{}));
+  Tensor gO = local_tile(mO(_, bidh, _), Shape<Int<kBlockM>, Int<kHeadDim>>{},
+                         make_coord(m_block, 0));  // (kBlockM, kHeadDim)
+  Tensor gLSE = get_lse_tile<ElementAccum, Params, kBlockM, Is_even_MN>(
+      params, bidb, bidh, m_block, binfo);
+
+  typename Kernel_traits::GmemTiledCopyO gmem_tiled_copy_O;
+  auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(tidx);
+  Tensor tOsO =
+      gmem_thr_copy_O.partition_S(sO);  // ((Atom,AtomNum),ATOM_M,ATOM_N)
+  Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
+
+  __syncthreads();
+
+  Tensor tOrO = make_tensor<Element>(shape(tOgO));
+  cute::copy(gmem_tiled_copy_O, tOsO, tOrO);
+
+  Tensor caccO = make_identity_tensor(
+      Shape<Int<kBlockM>, Int<kHeadDim>>{});  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+  Tensor taccOcO = thr_mma.partition_C(caccO);  // (MMA,MMA_M,MMA_K)
+  static_assert(decltype(size<0>(taccOcO))::value == 4);
+  // Convert to ((2, 2), MMA_M, MMA_K) then take only the row indices.
+  Tensor taccOcO_row =
+      logical_divide(taccOcO, Shape<_2>{})(make_coord(0, _), _, 0);
+  CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row));  // MMA_M
+  if (get<1>(taccOcO_row(0)) == 0) {
+#pragma unroll
+    for (int mi = 0; mi < size(lse); ++mi) {
+      const int row = get<0>(taccOcO_row(mi));
+      if (row < binfo.actual_seqlen_q - m_block * kBlockM) {
+        gLSE(row) = lse(mi);
+      }
+    }
+  }
+
+  // Construct identity layout for sO
+  Tensor cO = make_identity_tensor(
+      make_shape(size<0>(sO), size<1>(sO)));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+  // Repeat the partitioning with identity layouts
+  Tensor tOcO =
+      gmem_thr_copy_O.partition_D(cO);  // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+  Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
+  if (!Is_even_K) {
+#pragma unroll
+    for (int k = 0; k < size(tOpO); ++k) {
+      tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d;
+    }
+  }
+  // Clear_OOB_K must be false since we don't want to write zeros to gmem
+  flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/false,
+              /*Clear_OOB_K=*/false>(gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO,
+                                     binfo.actual_seqlen_q - m_block * kBlockM);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Kernel_traits, bool Is_causal, bool Is_local, bool Has_alibi,
+          bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Split,
+          bool Append_KV, typename Params>
+inline __device__ void compute_attn_1rowblock_splitkv(
+    const Params& params, const int bidb, const int bidh, const int m_block,
+    const int n_split_idx, const int num_n_splits) {
+  using Element = typename Kernel_traits::Element;
+  using ElementAccum = typename Kernel_traits::ElementAccum;
+  using index_t = typename Kernel_traits::index_t;
+
+  // Shared memory.
+  extern __shared__ char smem_[];
+
+  // The thread index.
+  const int tidx = threadIdx.x;
+
+  constexpr int kBlockM = Kernel_traits::kBlockM;
+  constexpr int kBlockN = Kernel_traits::kBlockN;
+  constexpr int kHeadDim = Kernel_traits::kHeadDim;
+  constexpr int kNWarps = Kernel_traits::kNWarps;
+
+  using GmemTiledCopyO =
+      std::conditional_t<!Split, typename Kernel_traits::GmemTiledCopyO,
+                         typename Kernel_traits::GmemTiledCopyOaccum>;
+  using ElementO = std::conditional_t<!Split, Element, ElementAccum>;
+
+  const BlockInfo</*Varlen=*/!Is_even_MN> binfo(params, bidb);
+  // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) {
+  // printf("Is_even_MN = %d, is_cumulativ = %d, seqlen_k_cache = %d,
+  // actual_seqlen_k = %d\n", Is_even_MN, params.is_seqlens_k_cumulative,
+  // binfo.seqlen_k_cache, binfo.actual_seqlen_k); } if (threadIdx.x == 0 &&
+  // blockIdx.y == 1 && blockIdx.z == 0) { printf("params.knew_ptr = %p,
+  // seqlen_k_cache + seqlen_knew = %d\n", params.knew_ptr, binfo.seqlen_k_cache
+  // + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew)); }
+  if (m_block * kBlockM >= binfo.actual_seqlen_q) return;
+
+  const int n_blocks_per_split =
+      ((binfo.actual_seqlen_k + kBlockN - 1) / kBlockN + num_n_splits - 1) /
+      num_n_splits;
+  const int n_block_min =
+      !Is_local ? n_split_idx * n_blocks_per_split
+                : std::max(n_split_idx * n_blocks_per_split,
+                           (m_block * kBlockM + binfo.actual_seqlen_k -
+                            binfo.actual_seqlen_q - params.window_size_left) /
+                               kBlockN);
+  int n_block_max = std::min(cute::ceil_div(binfo.actual_seqlen_k, kBlockN),
+                             (n_split_idx + 1) * n_blocks_per_split);
+  if (Is_causal || Is_local) {
+    n_block_max = std::min(
+        n_block_max,
+        cute::ceil_div((m_block + 1) * kBlockM + binfo.actual_seqlen_k -
+                           binfo.actual_seqlen_q + params.window_size_right,
+                       kBlockN));
+  }
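+  // E.g. (illustrative numbers only) with actual_seqlen_k = 1024, kBlockN = 64
+  // and num_n_splits = 4, there are 16 K/V blocks and n_blocks_per_split = 4,
+  // so split index 1 covers n_block in [4, 8) (further reduced by the
+  // causal/local handling above when applicable).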
+  if (n_block_min >=
+      n_block_max) {  // This also covers the case where n_block_max <= 0
+    // We exit early and write 0 to gOaccum and -inf to gLSEaccum.
+    // Otherwise we might read OOB elements from gK and gV,
+    // or get wrong results when we combine gOaccum from different blocks.
+    const index_t row_offset_o =
+        binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb) +
+        m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride;
+    const index_t row_offset_oaccum =
+        (((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q +
+         m_block * kBlockM) *
+        params.d_rounded;
+    const index_t row_offset_lseaccum =
+        ((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q +
+        m_block * kBlockM;
+    Tensor gOaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementO*>(Split ? params.oaccum_ptr
+                                                        : params.o_ptr) +
+                      (Split ? row_offset_oaccum : row_offset_o)),
+        Shape<Int<kBlockM>, Int<kHeadDim>>{},
+        make_stride(Split ? kHeadDim : params.o_row_stride, _1{}));
+    Tensor gLSEaccum = make_tensor(
+        make_gmem_ptr(
+            reinterpret_cast<ElementAccum*>(Split ? params.softmax_lseaccum_ptr
+                                                  : params.softmax_lse_ptr) +
+            row_offset_lseaccum),
+        Shape<Int<kBlockM>>{}, Stride<_1>{});
+
+    GmemTiledCopyO gmem_tiled_copy_Oaccum;
+    auto gmem_thr_copy_Oaccum = gmem_tiled_copy_Oaccum.get_thread_slice(tidx);
+    Tensor tOgOaccum = gmem_thr_copy_Oaccum.partition_D(gOaccum);
+    Tensor tOrOaccum = make_tensor<ElementO>(shape(tOgOaccum));
+    clear(tOrOaccum);
+    // Construct identity layout for sO
+    Tensor cO = make_identity_tensor(make_shape(
+        size<0>(gOaccum), size<1>(gOaccum)));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    // Repeat the partitioning with identity layouts
+    Tensor tOcO = gmem_thr_copy_Oaccum.partition_D(cO);
+    Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgOaccum)));
+    if (!Is_even_K) {
+#pragma unroll
+      for (int k = 0; k < size(tOpO); ++k) {
+        tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d;
+      }
+    }
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/false,
+                /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_Oaccum, tOrOaccum, tOgOaccum, tOcO, tOpO,
+        binfo.actual_seqlen_q - m_block * kBlockM);
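+    // Writing -INFINITY for an empty split means it contributes exp(-inf) = 0
+    // weight when the per-split LSEs are combined; the non-split path writes
+    // +INFINITY, matching the early exit of the non-split kernel above.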
+#pragma unroll
+    for (int m = 0; m < size<1>(tOgOaccum); ++m) {
+      const int row = get<0>(tOcO(0, m, 0));
+      if (row < binfo.actual_seqlen_q - m_block * kBlockM &&
+          get<1>(tOcO(0, m, 0)) == 0) {
+        gLSEaccum(row) = Split ? -INFINITY : INFINITY;
+      }
+    }
+    return;
+  }
+
+  // We iterate over the blocks in reverse order. This is because the last block
+  // is the only one that needs masking when we read K and V from global memory.
+  // Moreover, iterating in reverse might save us 1 register (we just need
+  // n_block instead of both n_block and n_block_max).
+
+  // We move K and V to the last block.
+  const int bidb_cache =
+      params.cache_batch_idx == nullptr ? bidb : params.cache_batch_idx[bidb];
+  const int* block_table =
+      params.block_table == nullptr
+          ? nullptr
+          : params.block_table + bidb * params.block_table_batch_stride;
+  const index_t row_offset_k =
+      block_table == nullptr
+          ? binfo.k_offset(params.k_batch_stride, params.k_row_stride,
+                           bidb_cache) +
+                (n_block_max - 1) * kBlockN * params.k_row_stride +
+                (bidh / params.h_h_k_ratio) * params.k_head_stride
+          : (bidh / params.h_h_k_ratio) *
+                params.k_head_stride;  // block addresses are later resolved
+                                       // per-thread
+
+  const index_t row_offset_v =
+      block_table == nullptr
+          ? binfo.k_offset(params.v_batch_stride, params.v_row_stride,
+                           bidb_cache) +
+                (n_block_max - 1) * kBlockN * params.v_row_stride +
+                (bidh / params.h_h_k_ratio) * params.v_head_stride
+          : (bidh / params.h_h_k_ratio) * params.v_head_stride;
+
+  Tensor mQ =
+      make_tensor(make_gmem_ptr(reinterpret_cast<Element*>(params.q_ptr) +
+                                binfo.q_offset(params.q_batch_stride,
+                                               params.q_row_stride, bidb)),
+                  make_shape(binfo.actual_seqlen_q, params.h, params.d),
+                  make_stride(params.q_row_stride, params.q_head_stride, _1{}));
+  Tensor gQ = local_tile(mQ(_, bidh, _), Shape<Int<kBlockM>, Int<kHeadDim>>{},
+                         make_coord(m_block, 0));  // (kBlockM, kHeadDim)
+  Tensor gK = make_tensor(
+      make_gmem_ptr(reinterpret_cast<Element*>(params.k_ptr) + row_offset_k),
+      Shape<Int<kBlockN>, Int<kHeadDim>>{},
+      make_stride(params.k_row_stride, _1{}));
+  // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) { printf("k_ptr
+  // = %p, row_offset_k = %d, gK_ptr = %p\n", params.k_ptr, row_offset_k,
+  // gK.data()); }
+  Tensor gV = make_tensor(
+      make_gmem_ptr(reinterpret_cast<Element*>(params.v_ptr) + row_offset_v),
+      Shape<Int<kBlockN>, Int<kHeadDim>>{},
+      make_stride(params.v_row_stride, _1{}));
+  Tensor sQ = make_tensor(make_smem_ptr(reinterpret_cast<Element*>(smem_)),
+                          typename Kernel_traits::SmemLayoutQ{});
+  Tensor sK =
+      make_tensor(sQ.data() + size(sQ), typename Kernel_traits::SmemLayoutKV{});
+  Tensor sV =
+      make_tensor(sK.data() + size(sK), typename Kernel_traits::SmemLayoutKV{});
+  Tensor sVt =
+      make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{});
+  Tensor sVtNoSwizzle =
+      make_tensor(sV.data().get(),
+                  typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{});
+
+  typename Kernel_traits::GmemTiledCopyQKV gmem_tiled_copy_Q;
+  auto gmem_thr_copy_Q = gmem_tiled_copy_Q.get_thread_slice(tidx);
+  typename Kernel_traits::GmemTiledCopyQKVPaged gmem_tiled_copy_KV;
+  auto gmem_thr_copy_KV = gmem_tiled_copy_KV.get_thread_slice(tidx);
+
+  Tensor tQgQ = gmem_thr_copy_Q.partition_S(gQ);
+  Tensor tQsQ = gmem_thr_copy_Q.partition_D(sQ);
+
+  Tensor tKgK_ = gmem_thr_copy_KV.partition_S(gK);  // (KCPY, KCPY_N, KCPY_K)
+  Tensor tKsK_ = gmem_thr_copy_KV.partition_D(sK);
+  Tensor tVgV_ = gmem_thr_copy_KV.partition_S(gV);  // (VCPY, VCPY_N, VCPY_K)
+  Tensor tVsV_ = gmem_thr_copy_KV.partition_D(sV);
+
+  Tensor tKgK = make_tensor(tKgK_.data(), reshape_thread_tile(tKgK_.layout()));
+  Tensor tKsK = make_tensor(tKsK_.data(), reshape_thread_tile(tKsK_.layout()));
+  Tensor tVgV = make_tensor(tVgV_.data(), reshape_thread_tile(tVgV_.layout()));
+  Tensor tVsV = make_tensor(tVsV_.data(), reshape_thread_tile(tVsV_.layout()));
+
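+  // With a paged KV cache (block_table != nullptr), the per-thread K/V base
+  // pointers are resolved through the block table (starting from the last K/V
+  // block) instead of using the flat row_offset_k / row_offset_v computed
+  // above.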
+  if (block_table != nullptr) {
+    tKgK.data() =
+        gK.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                        tidx, n_block_max, params.page_block_size, block_table,
+                        params.k_batch_stride, params.k_row_stride);
+    tVgV.data() =
+        gV.data() + flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                        tidx, n_block_max, params.page_block_size, block_table,
+                        params.v_batch_stride, params.v_row_stride);
+  }
+
+  typename Kernel_traits::TiledMma tiled_mma;
+  auto thr_mma = tiled_mma.get_thread_slice(tidx);
+  Tensor tSrQ = thr_mma.partition_fragment_A(sQ);  // (MMA,MMA_M,MMA_K)
+  Tensor tSrK = thr_mma.partition_fragment_B(sK);  // (MMA,MMA_N,MMA_K)
+  Tensor tOrVt =
+      thr_mma.partition_fragment_B(sVtNoSwizzle);  // (MMA, MMA_K,MMA_N)
+
+  Tensor acc_o = partition_fragment_C(
+      tiled_mma, Shape<Int<kBlockM>, Int<kHeadDim>>{});  // MMA, MMA_M, MMA_K
+
+  //
+  // Copy Atom retiling
+  //
+
+  auto smem_tiled_copy_Q =
+      make_tiled_copy_A(typename Kernel_traits::SmemCopyAtom{}, tiled_mma);
+  auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(tidx);
+  Tensor tSsQ = smem_thr_copy_Q.partition_S(sQ);
+
+  auto smem_tiled_copy_K =
+      make_tiled_copy_B(typename Kernel_traits::SmemCopyAtom{}, tiled_mma);
+  auto smem_thr_copy_K = smem_tiled_copy_K.get_thread_slice(tidx);
+  Tensor tSsK = smem_thr_copy_K.partition_S(sK);
+
+  auto smem_tiled_copy_V = make_tiled_copy_B(
+      typename Kernel_traits::SmemCopyAtomTransposed{}, tiled_mma);
+  auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(tidx);
+  Tensor tOsVt = smem_thr_copy_V.partition_S(sVt);
+
+  // PREDICATES
+  //
+
+  // // Allocate predicate tensors for m and n
+  // Tensor tQpQ = make_tensor<bool>(make_shape(size<1>(tQsQ), size<2>(tQsQ)),
+  // Stride<_1,_0>{}); Tensor tKVpKV =
+  // make_tensor<bool>(make_shape(size<1>(tKsK), size<2>(tKsK)),
+  // Stride<_1,_0>{});
+
+  // Construct identity layout for sQ and sK
+  Tensor cQ = make_identity_tensor(
+      make_shape(size<0>(sQ), size<1>(sQ)));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+  Tensor cKV = make_identity_tensor(
+      make_shape(size<0>(sK), size<1>(sK)));  // (BLK_N,BLK_K) -> (blk_n,blk_k)
+
+  // Repeat the partitioning with identity layouts
+  Tensor tQcQ =
+      gmem_thr_copy_Q.partition_S(cQ);  // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+  Tensor tKVcKV_ = gmem_thr_copy_KV.partition_S(
+      cKV);  // (BCPY,BCPY_N,BCPY_K) -> (blk_n,blk_k)
+  Tensor tKVcKV =
+      make_tensor(tKVcKV_.data(), reshape_thread_tile(tKVcKV_.layout()));
+
+  // Allocate predicate tensors for k
+  Tensor tQpQ = make_tensor<bool>(make_shape(size<2>(tQsQ)));
+  Tensor tKVpKV = make_tensor<bool>(make_shape(size<2>(tKsK)));
+
+  // Set predicates for k bounds
+  if (!Is_even_K) {
+#pragma unroll
+    for (int k = 0; k < size(tQpQ); ++k) {
+      tQpQ(k) = get<1>(tQcQ(0, 0, k)) < params.d;
+    }
+#pragma unroll
+    for (int k = 0; k < size(tKVpKV); ++k) {
+      tKVpKV(k) = get<1>(tKVcKV(0, 0, k)) < params.d;
+    }
+  }
+
+  // Prologue
+
+  // Copy from Knew to K, optionally apply rotary embedding.
+  if constexpr (Append_KV) {
+    typename Kernel_traits::GmemTiledCopyRotcossinPaged gmem_tiled_copy_rotary;
+    auto gmem_thr_copy_rotary = gmem_tiled_copy_rotary.get_thread_slice(tidx);
+    typename Kernel_traits::GmemTiledCopyRotcossinContPaged
+        gmem_tiled_copy_rotary_cont;
+    auto gmem_thr_copy_rotary_cont =
+        gmem_tiled_copy_rotary_cont.get_thread_slice(tidx);
+
+    // Even if we have MQA / GQA, all threadblocks responsible for the same KV
+    // head are writing to gmem. Technically it's a race condition, but they all
+    // write the same content anyway, and it's safe. We want to do this so that
+    // all threadblocks can proceed right after they finish writing the KV
+    // cache.
+    const index_t row_offset_cossin =
+        ((n_block_max - 1) * kBlockN) * (params.rotary_dim / 2);
+    Tensor gCos = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_cos_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockN>, Int<kHeadDim / 2>>{},
+        make_stride(params.rotary_dim / 2, _1{}));
+    Tensor gSin = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_sin_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockN>, Int<kHeadDim / 2>>{},
+        make_stride(params.rotary_dim / 2, _1{}));
+    Tensor gCosCont = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_cos_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockN>, Int<kHeadDim>>{},
+        make_stride(params.rotary_dim / 2, _1{}));
+    Tensor gSinCont = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_sin_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockN>, Int<kHeadDim>>{},
+        make_stride(params.rotary_dim / 2, _1{}));
+
+    Tensor tRgCos_ = gmem_thr_copy_rotary.partition_S(gCos);
+    Tensor tRgSin_ = gmem_thr_copy_rotary.partition_S(gSin);
+    Tensor tRgCosCont_ = gmem_thr_copy_rotary_cont.partition_S(gCosCont);
+    Tensor tRgSinCont_ = gmem_thr_copy_rotary_cont.partition_S(gSinCont);
+
+    Tensor tRgCos =
+        make_tensor(tRgCos_.data(), reshape_thread_tile(tRgCos_.layout()));
+    Tensor tRgSin =
+        make_tensor(tRgSin_.data(), reshape_thread_tile(tRgSin_.layout()));
+    Tensor tRgCosCont = make_tensor(
+        tRgCosCont_.data(), reshape_flatten_thread_tile(tRgCosCont_.layout()));
+    Tensor tRgSinCont = make_tensor(
+        tRgSinCont_.data(), reshape_flatten_thread_tile(tRgSinCont_.layout()));
+
+    // if (cute::thread(0, 0)) { printf("rotary_cos_ptr = %p, gCos.data() = %p,
+    // tRgCos.data() = %p, rotary_dim = %d\n", params.rotary_cos_ptr,
+    // gCos.data(), tRgCos.data(), params.rotary_dim); } if (cute::thread(8, 0))
+    // { print_tensor(gCos); } if (cute::thread(0, 0)) { print_tensor(tRgCos); }
+
+    const index_t row_offset_knew =
+        binfo.k_offset(params.knew_batch_stride, params.knew_row_stride, bidb) +
+        ((n_block_max - 1) * kBlockN) * params.knew_row_stride +
+        (bidh / params.h_h_k_ratio) * params.knew_head_stride;
+    const index_t row_offset_vnew =
+        binfo.k_offset(params.vnew_batch_stride, params.vnew_row_stride, bidb) +
+        ((n_block_max - 1) * kBlockN) * params.vnew_row_stride +
+        (bidh / params.h_h_k_ratio) * params.vnew_head_stride;
+    // Subtract seqlen_k_cache * row stride so that conceptually gK and gKnew
+    // "line up". When we access them, e.g. if gK has 128 rows and gKnew has 64
+    // rows, we access gK[:128] and gKNew[128:128 + 64]. This maps to accessing
+    // the first 64 rows of knew_ptr.
+    Tensor gKnew = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.knew_ptr) +
+                      row_offset_knew -
+                      binfo.seqlen_k_cache * params.knew_row_stride),
+        Shape<Int<kBlockN>, Int<kHeadDim>>{},
+        make_stride(params.knew_row_stride, _1{}));
+    // if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) {
+    // printf("knew_ptr = %p, row_offset_knew = %d, gKnew_ptr = %p\n",
+    // params.knew_ptr, row_offset_knew, gKnew.data()); }
+    Tensor gVnew = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.vnew_ptr) +
+                      row_offset_vnew -
+                      binfo.seqlen_k_cache * params.vnew_row_stride),
+        Shape<Int<kBlockN>, Int<kHeadDim>>{},
+        make_stride(params.vnew_row_stride, _1{}));
+    typename Kernel_traits::GmemTiledCopyQKVPaged gmem_tiled_copy_KV_new;
+    auto gmem_thr_copy_KV_new = gmem_tiled_copy_KV_new.get_thread_slice(tidx);
+    Tensor tKgKnew_ =
+        gmem_thr_copy_KV_new.partition_S(gKnew);  // (KCPY, KCPY_N, KCPY_K)
+    Tensor tVgVnew_ =
+        gmem_thr_copy_KV_new.partition_S(gVnew);  // (VCPY, VCPY_N, VCPY_K)
+
+    auto tKgKnew =
+        make_tensor(tKgKnew_.data(), reshape_thread_tile(tKgKnew_.layout()));
+    auto tVgVnew =
+        make_tensor(tVgVnew_.data(), reshape_thread_tile(tVgVnew_.layout()));
+
+    const int n_block_copy_min =
+        std::max(n_block_min, binfo.seqlen_k_cache / kBlockN);
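+    // Blocks entirely below seqlen_k_cache already hold cached K/V, so the
+    // append loop only needs to go down to block seqlen_k_cache / kBlockN.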
+    auto tKgK_data = tKgK.data();
+    auto tVgV_data = tVgV.data();
+    for (int n_block = n_block_max - 1; n_block >= n_block_copy_min;
+         n_block--) {
+      flash::copy_w_min_idx<Is_even_K>(
+          tVgVnew, tVgV, tKVcKV, tKVpKV,
+          binfo.actual_seqlen_k - n_block * kBlockN,
+          binfo.seqlen_k_cache - n_block * kBlockN);
+      tVgVnew.data() =
+          tVgVnew.data() + (-int(kBlockN * params.vnew_row_stride));
+      if (params.rotary_dim == 0) {
+        flash::copy_w_min_idx<Is_even_K>(
+            tKgKnew, tKgK, tKVcKV, tKVpKV,
+            binfo.actual_seqlen_k - n_block * kBlockN,
+            binfo.seqlen_k_cache - n_block * kBlockN);
+      } else {
+        if (params.is_rotary_interleaved) {
+          // Don't clear OOB_K because we're writing to global memory
+          flash::copy_rotary_interleaved<Is_even_K, /*Clear_OOB_K=*/false>(
+              tKgKnew, tKgK, tRgCos, tRgSin, tKVcKV,
+              binfo.actual_seqlen_k - n_block * kBlockN,
+              binfo.seqlen_k_cache - n_block * kBlockN, params.d,
+              params.rotary_dim);
+          tRgCos.data() =
+              tRgCos.data() + (-int(kBlockN * params.rotary_dim / 2));
+          tRgSin.data() =
+              tRgSin.data() + (-int(kBlockN * params.rotary_dim / 2));
+        } else {
+          // Don't clear OOB_K because we're writing to global memory
+          flash::copy_rotary_contiguous<Is_even_K, /*Clear_OOB_K=*/false>(
+              tKgKnew, tKgK, tRgCosCont, tRgSinCont, tKVcKV,
+              binfo.actual_seqlen_k - n_block * kBlockN,
+              binfo.seqlen_k_cache - n_block * kBlockN, params.d,
+              params.rotary_dim);
+          tRgCosCont.data() =
+              tRgCosCont.data() + (-int(kBlockN * params.rotary_dim / 2));
+          tRgSinCont.data() =
+              tRgSinCont.data() + (-int(kBlockN * params.rotary_dim / 2));
+        }
+      }
+      tKgKnew.data() =
+          tKgKnew.data() + (-int(kBlockN * params.knew_row_stride));
+      if (block_table == nullptr) {
+        tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
+        tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
+      } else {
+        if (n_block > n_block_copy_min) {
+          tVgV.data() =
+              gV.data() +
+              flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                  tidx, n_block, params.page_block_size, block_table,
+                  params.v_batch_stride, params.v_row_stride);
+          tKgK.data() =
+              gK.data() +
+              flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                  tidx, n_block, params.page_block_size, block_table,
+                  params.k_batch_stride, params.k_row_stride);
+        }
+      }
+    }
+    // Need this before we can read in K again, so that we'll see the updated K
+    // values.
+    __syncthreads();
+    tKgK.data() = tKgK_data;
+    tVgV.data() = tVgV_data;
+  }
+
+  // Read Q from gmem to smem, optionally apply rotary embedding.
+  if (!Append_KV || params.rotary_dim == 0) {
+    // We don't need to clear the sQ smem tiles since we'll only write out the
+    // valid outputs
+    flash::copy<Is_even_MN, Is_even_K>(
+        gmem_tiled_copy_Q, tQgQ, tQsQ, tQcQ, tQpQ,
+        binfo.actual_seqlen_q - m_block * kBlockM);
+  } else {
+    typename Kernel_traits::GmemTiledCopyRotcossin gmem_tiled_copy_rotary;
+    auto gmem_thr_copy_rotary = gmem_tiled_copy_rotary.get_thread_slice(tidx);
+    typename Kernel_traits::GmemTiledCopyRotcossinCont
+        gmem_tiled_copy_rotary_cont;
+    auto gmem_thr_copy_rotary_cont =
+        gmem_tiled_copy_rotary_cont.get_thread_slice(tidx);
+    const index_t row_offset_cossin =
+        (binfo.seqlen_k_cache +
+         (Is_causal || Is_local ? m_block * kBlockM : 0)) *
+        (params.rotary_dim / 2);
+    // If not causal, all the queries get the same cos/sin, taken at
+    // location seqlen_k_cache. We do this by setting the row stride of gCos /
+    // gSin to 0.
+    Tensor gCos = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_cos_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockM>, Int<kHeadDim / 2>>{},
+        make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{}));
+    Tensor gSin = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_sin_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockM>, Int<kHeadDim / 2>>{},
+        make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{}));
+    Tensor gCosCont = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_cos_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockM>, Int<kHeadDim>>{},
+        make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{}));
+    Tensor gSinCont = make_tensor(
+        make_gmem_ptr(reinterpret_cast<Element*>(params.rotary_sin_ptr) +
+                      row_offset_cossin),
+        Shape<Int<kBlockM>, Int<kHeadDim>>{},
+        make_stride(Is_causal || Is_local ? params.rotary_dim / 2 : 0, _1{}));
+    Tensor tRgCos = gmem_thr_copy_rotary.partition_S(gCos);
+    Tensor tRgSin = gmem_thr_copy_rotary.partition_S(gSin);
+    Tensor tRgCosCont = gmem_thr_copy_rotary_cont.partition_S(gCosCont);
+    Tensor tRgSinCont = gmem_thr_copy_rotary_cont.partition_S(gSinCont);
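+    // is_rotary_interleaved selects between rotating adjacent (even, odd)
+    // dimension pairs and rotating (i, i + rotary_dim / 2) pairs; the latter
+    // ("contiguous") path uses the *Cont tensors above.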
+    if (params.is_rotary_interleaved) {
+      flash::copy_rotary_interleaved<Is_even_K>(
+          tQgQ, tQsQ, tRgCos, tRgSin, tQcQ,
+          binfo.actual_seqlen_q - m_block * kBlockM, 0, params.d,
+          params.rotary_dim);
+    } else {
+      flash::copy_rotary_contiguous<Is_even_K>(
+          tQgQ, tQsQ, tRgCosCont, tRgSinCont, tQcQ,
+          binfo.actual_seqlen_q - m_block * kBlockM, 0, params.d,
+          params.rotary_dim);
+    }
+  }
+
+  int n_block = n_block_max - 1;
+  // We don't need to clear the sK smem tiles since we'll mask out the scores
+  // anyway.
+  flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_KV, tKgK, tKsK, tKVcKV,
+                                     tKVpKV,
+                                     binfo.actual_seqlen_k - n_block * kBlockN);
+  cute::cp_async_fence();
+
+  // flash::cp_async_wait<0>();
+  // __syncthreads();
+  // if (tidx == 0 && blockIdx.y == 0 && blockIdx.z == 0) { print(tKsK); }
+  // __syncthreads();
+
+  clear(acc_o);
+
+  flash::Softmax<2 * size<1>(acc_o)> softmax;
+
+  const float alibi_slope =
+      !Has_alibi ? 0.0f
+                 : reinterpret_cast<float*>(params.alibi_slopes_ptr)
+                           [bidb * params.alibi_slopes_batch_stride + bidh] /
+                       params.scale_softmax;
+  flash::Mask<Is_causal, Is_local, Has_alibi> mask(
+      binfo.actual_seqlen_k, binfo.actual_seqlen_q, params.window_size_left,
+      params.window_size_right, alibi_slope);
+
+  // For performance reasons, we separate out two kinds of iterations:
+  // those that need masking on S, and those that don't.
+  // We need masking on S for the very last block when the K and V length is
+  // not a multiple of kBlockN. We also need masking on S if it's causal, for
+  // the last ceil_div(kBlockM, kBlockN) blocks. We will have at least 1
+  // "masking" iteration.
+
+  // If not even_N, then seqlen_k might end in the middle of a block. In that
+  // case we need to mask 2 blocks (e.g. when kBlockM == kBlockN), not just 1.
+  constexpr int n_masking_steps =
+      (!Is_causal && !Is_local)
+          ? 1
+          : ((Is_even_MN && Is_causal) ? cute::ceil_div(kBlockM, kBlockN)
+                                       : cute::ceil_div(kBlockM, kBlockN) + 1);
+#pragma unroll
+  for (int masking_step = 0; masking_step < n_masking_steps;
+       ++masking_step, --n_block) {
+    Tensor acc_s = partition_fragment_C(
+        tiled_mma,
+        Shape<Int<kBlockM>, Int<kBlockN>>{});  // (MMA=4, MMA_M, MMA_N)
+    clear(acc_s);
+    flash::cp_async_wait<0>();
+    __syncthreads();
+
+    // Advance gV
+    if (masking_step > 0) {
+      if (block_table == nullptr) {
+        tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
+      } else {
+        tVgV.data() =
+            gV.data() +
+            flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                tidx, n_block + 1, params.page_block_size, block_table,
+                params.v_batch_stride, params.v_row_stride);
+      }
+      flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tVgV,
+                                                  tVsV, tKVcKV, tKVpKV);
+    } else {
+      // Clear the smem tiles to account for predicated off loads
+      flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/true>(
+          gmem_tiled_copy_KV, tVgV, tVsV, tKVcKV, tKVpKV,
+          binfo.actual_seqlen_k - n_block * kBlockN);
+    }
+    cute::cp_async_fence();
+
+    flash::gemm(acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_tiled_copy_Q,
+                smem_tiled_copy_K, smem_thr_copy_Q, smem_thr_copy_K);
+    // if (cute::thread0()) { print(acc_s); }
+    if constexpr (Is_softcap) {
+      apply_softcap(acc_s, params.softcap);
+    }
+
+    mask.template apply_mask<Is_causal, Is_even_MN>(
+        acc_s, n_block * kBlockN,
+        m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4, kNWarps * 16);
+
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    // if (tidx == 0 && blockIdx.y == 0 && blockIdx.z == 0) { print(tVsV); }
+    // __syncthreads();
+
+    if (n_block > n_block_min) {
+      // Advance gK
+      if (block_table == nullptr) {
+        tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
+      } else {
+        tKgK.data() = gK.data() +
+                      flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                          tidx, n_block, params.page_block_size, block_table,
+                          params.k_batch_stride, params.k_row_stride);
+      }
+      flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tKgK,
+                                                  tKsK, tKVcKV, tKVpKV);
+      // This cp_async_fence needs to be in the if block, otherwise the
+      // synchronization isn't right and we get race conditions.
+      cute::cp_async_fence();
+    }
+
+    // We have key_padding_mask so we'll need to Check_inf
+    masking_step == 0
+        ? softmax.template softmax_rescale_o</*Is_first=*/true,
+                                             /*Check_inf=*/Is_causal ||
+                                                 Is_local || !Is_even_MN>(
+              acc_s, acc_o, params.scale_softmax_log2)
+        : softmax.template softmax_rescale_o</*Is_first=*/false,
+                                             /*Check_inf=*/Is_causal ||
+                                                 Is_local || !Is_even_MN>(
+              acc_s, acc_o, params.scale_softmax_log2);
+    // if (cute::thread0()) { print(scores_max); print(scores_sum);
+    // print(scores); }
+
+    // Convert acc_s from fp32 to fp16/bf16
+    Tensor rP = flash::convert_type<Element>(acc_s);
+    // Reshape rP from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+    // if using m16n8k16 or (4, MMA_M, MMA_N) if using m16n8k8.
+    Tensor tOrP = make_tensor(
+        rP.data(),
+        flash::convert_layout_acc_Aregs<Kernel_traits::TiledMma>(rP.layout()));
+
+    flash::gemm_rs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_tiled_copy_V,
+                   smem_thr_copy_V);
+
+    // This check is at the end of the loop since we always have at least 1
+    // iteration
+    if (n_masking_steps > 1 && n_block <= n_block_min) {
+      --n_block;
+      break;
+    }
+  }
+
+  // These are the iterations where we don't need masking on S
+  for (; n_block >= n_block_min; --n_block) {
+    Tensor acc_s = partition_fragment_C(
+        tiled_mma,
+        Shape<Int<kBlockM>, Int<kBlockN>>{});  // (MMA=4, MMA_M, MMA_N)
+    clear(acc_s);
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    // Advance gV
+    if (block_table == nullptr) {
+      tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
+    } else {
+      tVgV.data() = gV.data() +
+                    flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                        tidx, n_block + 1, params.page_block_size, block_table,
+                        params.v_batch_stride, params.v_row_stride);
+    }
+    flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tVgV, tVsV,
+                                                tKVcKV, tKVpKV);
+    cute::cp_async_fence();
+
+    flash::gemm(acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_tiled_copy_Q,
+                smem_tiled_copy_K, smem_thr_copy_Q, smem_thr_copy_K);
+    if constexpr (Is_softcap) {
+      apply_softcap(acc_s, params.softcap);
+    }
+
+    flash::cp_async_wait<0>();
+    __syncthreads();
+    if (n_block > n_block_min) {
+      // Advance gK
+      if (block_table == nullptr) {
+        tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
+      } else {
+        tKgK.data() = gK.data() +
+                      flash::resolve_thread_kv_page_slice_offset<Kernel_traits>(
+                          tidx, n_block, params.page_block_size, block_table,
+                          params.k_batch_stride, params.k_row_stride);
+      }
+      flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_KV, tKgK,
+                                                  tKsK, tKVcKV, tKVpKV);
+      // This cp_async_fence needs to be in the if block, otherwise the
+      // synchronization isn't right and we get race conditions.
+      cute::cp_async_fence();
+    }
+
+    mask.template apply_mask</*Causal_mask=*/false>(
+        acc_s, n_block * kBlockN,
+        m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4, kNWarps * 16);
+    softmax
+        .template softmax_rescale_o</*Is_first=*/false, /*Check_inf=*/Is_local>(
+            acc_s, acc_o, params.scale_softmax_log2);
+
+    Tensor rP = flash::convert_type<Element>(acc_s);
+    // Reshape rP from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+    // if using m16n8k16 or (4, MMA_M, MMA_N) if using m16n8k8.
+    Tensor tOrP = make_tensor(
+        rP.data(),
+        flash::convert_layout_acc_Aregs<Kernel_traits::TiledMma>(rP.layout()));
+
+    flash::gemm_rs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_tiled_copy_V,
+                   smem_thr_copy_V);
+  }
+
+  // Epilogue
+
+  Tensor lse =
+      softmax.template normalize_softmax_lse</*Is_dropout=*/false, Split>(
+          acc_o, params.scale_softmax);
+  // if (cute::thread0()) { print(lse); }
+
+  Tensor sOaccum =
+      make_tensor(make_smem_ptr(reinterpret_cast<ElementO*>(smem_)),
+                  typename Kernel_traits::SmemLayoutO{});  // (SMEM_M,SMEM_N)
+  // Partition sO to match the accumulator partitioning
+  using SmemTiledCopyO =
+      std::conditional_t<!Split, typename Kernel_traits::SmemCopyAtomO,
+                         typename Kernel_traits::SmemCopyAtomOaccum>;
+  auto smem_tiled_copy_Oaccum = make_tiled_copy_C(SmemTiledCopyO{}, tiled_mma);
+  auto smem_thr_copy_Oaccum = smem_tiled_copy_Oaccum.get_thread_slice(tidx);
+  Tensor rO = flash::convert_type<ElementO>(acc_o);
+  Tensor taccOrOaccum =
+      smem_thr_copy_Oaccum.retile_S(rO);  // ((Atom,AtomNum), MMA_M, MMA_N)
+  Tensor taccOsOaccum = smem_thr_copy_Oaccum.partition_D(
+      sOaccum);  // ((Atom,AtomNum),PIPE_M,PIPE_N)
+
+  // sOaccum is larger than sQ, so we need to syncthreads here
+  // TODO: allocate enough smem for sOaccum
+  if constexpr (Split) {
+    __syncthreads();
+  }
+
+  cute::copy(smem_tiled_copy_Oaccum, taccOrOaccum, taccOsOaccum);
+
+  const index_t row_offset_o =
+      binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb) +
+      m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride;
+  const index_t row_offset_oaccum =
+      (((n_split_idx * params.b + bidb) * params.h + bidh) * params.seqlen_q +
+       m_block * kBlockM) *
+      params.d_rounded;
+  const index_t row_offset_lseaccum =
+      (Split || !params.unpadded_lse
+           ? ((n_split_idx * params.b + bidb) * params.h + bidh) *
+                 params.seqlen_q
+           : bidh * params.total_q + binfo.q_offset(params.seqlen_q, 1, bidb)) +
+      m_block * kBlockM;
+
+  Tensor gOaccum =
+      make_tensor(make_gmem_ptr(reinterpret_cast<ElementO*>(
+                                    Split ? params.oaccum_ptr : params.o_ptr) +
+                                (Split ? row_offset_oaccum : row_offset_o)),
+                  Shape<Int<kBlockM>, Int<kHeadDim>>{},
+                  make_stride(Split ? kHeadDim : params.o_row_stride, _1{}));
+  Tensor gLSEaccum = make_tensor(
+      make_gmem_ptr(
+          reinterpret_cast<ElementAccum*>(Split ? params.softmax_lseaccum_ptr
+                                                : params.softmax_lse_ptr) +
+          row_offset_lseaccum),
+      Shape<Int<kBlockM>>{}, Stride<_1>{});
+  // if (tidx == 0) { printf("row_offset_o = %d, bidh = %d, gOaccum = %p\n",
+  // row_offset_o, bidh, gOaccum.data()); }
+
+  GmemTiledCopyO gmem_tiled_copy_Oaccum;
+  auto gmem_thr_copy_Oaccum = gmem_tiled_copy_Oaccum.get_thread_slice(tidx);
+  Tensor tOsOaccum = gmem_thr_copy_Oaccum.partition_S(
+      sOaccum);  // ((Atom,AtomNum),ATOM_M,ATOM_N)
+  Tensor tOgOaccum = gmem_thr_copy_Oaccum.partition_D(gOaccum);
+
+  __syncthreads();
+
+  Tensor tOrOaccum = make_tensor<ElementO>(shape(tOgOaccum));
+  cute::copy(gmem_tiled_copy_Oaccum, tOsOaccum, tOrOaccum);
+
+  Tensor caccO = make_identity_tensor(
+      Shape<Int<kBlockM>, Int<kHeadDim>>{});  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+  Tensor taccOcO = thr_mma.partition_C(caccO);  // (MMA,MMA_M,MMA_K)
+  static_assert(decltype(size<0>(taccOcO))::value == 4);
+  // Convert to ((2, 2), MMA_M, MMA_K) then take only the row indices.
+  Tensor taccOcO_row =
+      logical_divide(taccOcO, Shape<_2>{})(make_coord(0, _), _, 0);
+  CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row));  // MMA_M
+  if (get<1>(taccOcO_row(0)) == 0) {
+#pragma unroll
+    for (int mi = 0; mi < size(lse); ++mi) {
+      const int row = get<0>(taccOcO_row(mi));
+      if (row < binfo.actual_seqlen_q - m_block * kBlockM) {
+        gLSEaccum(row) = lse(mi);
+      }
+    }
+  }
+
+  // Construct identity layout for sO
+  Tensor cO = make_identity_tensor(make_shape(
+      size<0>(sOaccum), size<1>(sOaccum)));  // (BLK_M,BLK_K) -> (blk_m,blk_k)
+  // Repeat the partitioning with identity layouts
+  Tensor tOcO = gmem_thr_copy_Oaccum.partition_D(
+      cO);  // (ACPY,ACPY_M,ACPY_K) -> (blk_m,blk_k)
+  Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgOaccum)));
+  if (!Is_even_K) {
+#pragma unroll
+    for (int k = 0; k < size(tOpO); ++k) {
+      tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d;
+    }
+  }
+  // Clear_OOB_K must be false since we don't want to write zeros to gmem
+  flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/false,
+              /*Clear_OOB_K=*/false>(gmem_tiled_copy_Oaccum, tOrOaccum,
+                                     tOgOaccum, tOcO, tOpO,
+                                     binfo.actual_seqlen_q - m_block * kBlockM);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Kernel_traits, bool Is_dropout, bool Is_causal,
+          bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K,
+          bool Is_softcap, bool Return_softmax, typename Params>
+inline __device__ void compute_attn(const Params& params) {
+  const int m_block = blockIdx.x;
+  // The block index for the batch.
+  const int bidb = blockIdx.y;
+  // The block index for the head.
+  const int bidh = blockIdx.z;
+
+  // We want the fwd and bwd to generate the same dropout pattern (RNG),
+  // without restricting them to have the same number of threads or having to
+  // traverse the attention matrix in the same order. In the Philox RNG, we use
+  // the offset to store the batch, head, and the lane id (within a warp). We
+  // use the subsequence to store the location of the 16 x 32 blocks within the
+  // attention matrix. This way, as long as we have the batch, head, and the
+  // location of the 16 x 32 block within the attention matrix, we can generate
+  // the exact same dropout pattern.
+
+  flash::compute_attn_1rowblock<Kernel_traits, Is_dropout, Is_causal, Is_local,
+                                Has_alibi, Is_even_MN, Is_even_K, Is_softcap,
+                                Return_softmax>(params, bidb, bidh, m_block);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Kernel_traits, bool Is_causal, bool Is_local, bool Has_alibi,
+          bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Split,
+          bool Append_KV, typename Params>
+inline __device__ void compute_attn_splitkv(const Params& params) {
+  const int m_block = blockIdx.x;
+  // The block index for the batch.
+  const int bidb = Split ? blockIdx.z / params.h : blockIdx.y;
+  // The block index for the head.
+  const int bidh = Split ? blockIdx.z - bidb * params.h : blockIdx.z;
+  const int n_split_idx = Split ? blockIdx.y : 0;
+  const int num_n_splits = Split ? gridDim.y : 1;
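+  // Grid mapping: with Split, blockIdx.y indexes the KV split and blockIdx.z
+  // flattens (batch, head); without Split, blockIdx.y is the batch and
+  // blockIdx.z is the head.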
+  flash::compute_attn_1rowblock_splitkv<Kernel_traits, Is_causal, Is_local,
+                                        Has_alibi, Is_even_MN, Is_even_K,
+                                        Is_softcap, Split, Append_KV>(
+      params, bidb, bidh, m_block, n_split_idx, num_n_splits);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Kernel_traits, int kBlockM, int Log_max_splits,
+          bool Is_even_K, typename Params>
+inline __device__ void combine_attn_seqk_parallel(const Params& params) {
+  using Element = typename Kernel_traits::Element;
+  using ElementAccum = typename Kernel_traits::ElementAccum;
+  using index_t = typename Kernel_traits::index_t;
+  constexpr int kMaxSplits = 1 << Log_max_splits;
+  constexpr int kHeadDim = Kernel_traits::kHeadDim;
+  constexpr int kNThreads = Kernel_traits::kNThreads;
+
+  static_assert(kMaxSplits <= 128, "kMaxSplits must be <= 128");
+  static_assert(kBlockM == 4 || kBlockM == 8 || kBlockM == 16 || kBlockM == 32,
+                "kBlockM must be 4, 8, 16 or 32");
+  static_assert(kNThreads == 128, "We assume that each block has 128 threads");
+
+  // Shared memory.
+  // kBlockM + 1 instead of kBlockM to reduce bank conflicts.
+  __shared__ ElementAccum sLSE[kMaxSplits][kBlockM + 1];
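+  // With kBlockM a power of two, threads walking down a column of sLSE would
+  // keep hitting the same few 4-byte banks; padding each row to kBlockM + 1
+  // floats staggers consecutive rows across banks.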
+
+  // The thread and block index.
+  const int tidx = threadIdx.x;
+  const int bidx = blockIdx.x;
+
+  const index_t lse_size = params.b * params.h * params.seqlen_q;
+
+  const index_t row_offset_lse = bidx * kBlockM;
+  Tensor gLSEaccum = make_tensor(
+      make_gmem_ptr(
+          reinterpret_cast<ElementAccum*>(params.softmax_lseaccum_ptr) +
+          row_offset_lse),
+      Shape<Int<kMaxSplits>, Int<kBlockM>>{}, make_stride(lse_size, _1{}));
+
+  // LSE format is different depending on params.unpadded_lse and
+  // params.seqlenq_ngroups_swapped, see comment in get_lse_tile. This tensor's
+  // layout maps row_offset_lse to {bidb, bidh, q_offset}.
+  Tensor gLSE = make_tensor(
+      make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.softmax_lse_ptr) +
+                    row_offset_lse),
+      Shape<Int<kBlockM>>{}, Stride<_1>{});
+
+  // This layout maps row_offset_lse to {bidh, q_offset, bidb} or {bidh, bidb,
+  // q_offset}.
+  Layout flat_layout = make_layout(lse_size);
+  Layout orig_layout =
+      make_layout(make_shape(params.seqlen_q, params.h, params.b));
+  auto transposed_stride =
+      params.seqlenq_ngroups_swapped
+          ? make_stride(params.b, params.seqlen_q * params.b, 1)
+          : make_stride(1, params.seqlen_q * params.b, params.seqlen_q);
+  Layout remapped_layout = make_layout(
+      make_shape(params.seqlen_q, params.h, params.b), transposed_stride);
+  Layout final_layout = cute::composition(
+      remapped_layout, cute::composition(orig_layout, flat_layout));
+
+  Tensor gLSE_unpadded = make_tensor(
+      make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.softmax_lse_ptr)),
+      final_layout);
+
+  constexpr int kNLsePerThread =
+      (kMaxSplits * kBlockM + kNThreads - 1) / kNThreads;
+
+  // Read the LSE values from gmem and store them in shared memory, then
+  // transpose them.
+  constexpr int kRowsPerLoadLSE = kNThreads / kBlockM;
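+  // For example, with kNThreads = 128, kBlockM = 4 and kMaxSplits = 128:
+  // kNLsePerThread = (128 * 4 + 127) / 128 = 4 and kRowsPerLoadLSE = 128 / 4 =
+  // 32, so each iteration of the loop below covers 32 split-rows of 4 LSE
+  // columns.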
+#pragma unroll
+  for (int l = 0; l < kNLsePerThread; ++l) {
+    const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
+    const int col = tidx % kBlockM;
+    ElementAccum lse =
+        (row < params.num_splits && col < lse_size - bidx * kBlockM)
+            ? gLSEaccum(row, col)
+            : -INFINITY;
+    if (row < kMaxSplits) {
+      sLSE[row][col] = lse;
+    }
+    // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse
+    // = %f\n", tidx, row, col, lse); }
+  }
+  // if (bidx == 1 && tidx < 32) { printf("tidx = %d, row_offset_lse = %d, lse =
+  // %f\n", tidx, row_offset_lse, lse_accum(0)); }
+  __syncthreads();
+  Tensor lse_accum = make_tensor<ElementAccum>(Shape<Int<kNLsePerThread>>{});
+  constexpr int kRowsPerLoadTranspose = std::min(kRowsPerLoadLSE, kMaxSplits);
+  // To make sure that kMaxSplits is within 1 warp: we decide how many elements
+  // within kMaxSplits each thread should hold. If kMaxSplits = 16, then each
+  // thread holds 2 elements (128 threads, kBlockM rows, so each time we load we
+  // can load 128 / kBlockM rows).
+  // constexpr int kThreadsPerSplit = kMaxSplits / kRowsPerLoadTranspose;
+  // static_assert(kThreadsPerSplit <= 32);
+  static_assert(kRowsPerLoadTranspose <= 32);
+  static_assert(kNLsePerThread * kRowsPerLoadTranspose <= kMaxSplits);
+#pragma unroll
+  for (int l = 0; l < kNLsePerThread; ++l) {
+    const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
+    const int col = tidx / kRowsPerLoadTranspose;
+    lse_accum(l) =
+        (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -INFINITY;
+    // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse
+    // = %f\n", tidx, row, col, lse_accum(l)); }
+  }
+
+  // Compute the logsumexp of the LSE along the split dimension.
+  ElementAccum lse_max = lse_accum(0);
+#pragma unroll
+  for (int l = 1; l < kNLsePerThread; ++l) {
+    lse_max = max(lse_max, lse_accum(l));
+  }
+  MaxOp<float> max_op;
+  lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
+  lse_max =
+      lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+  float lse_sum = expf(lse_accum(0) - lse_max);
+#pragma unroll
+  for (int l = 1; l < kNLsePerThread; ++l) {
+    lse_sum += expf(lse_accum(l) - lse_max);
+  }
+  SumOp<float> sum_op;
+  lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
+  // For the case where all local lse == -INFINITY, we want to set lse_logsum to
+  // INFINITY. Otherwise lse_logsum is log(0.0) = -INFINITY and we get NaN when
+  // we do lse_accum(l) - lse_logsum.
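+  // The value computed below is lse_logsum = log(sum over splits of
+  // exp(lse_split)), evaluated stably as lse_max + log(sum(exp(lse_split -
+  // lse_max))).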
+  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum)
+                                ? INFINITY
+                                : logf(lse_sum) + lse_max;
+  // if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f,
+  // lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); }
+  if (tidx % kRowsPerLoadTranspose == 0 &&
+      tidx / kRowsPerLoadTranspose < kBlockM) {
+    if (params.unpadded_lse) {
+      const index_t lse_offset = row_offset_lse + tidx / kRowsPerLoadTranspose;
+      if (lse_offset < lse_size) {
+        gLSE_unpadded(lse_offset) = lse_logsum;
+      }
+    } else {
+      gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum;
+    }
+  }
+// Store the scales exp(lse - lse_logsum) in shared memory.
+#pragma unroll
+  for (int l = 0; l < kNLsePerThread; ++l) {
+    const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
+    const int col = tidx / kRowsPerLoadTranspose;
+    if (row < params.num_splits && col < kBlockM) {
+      sLSE[row][col] = expf(lse_accum(l) - lse_logsum);
+    }
+  }
+  __syncthreads();
+
+  const index_t row_offset_oaccum = bidx * kBlockM * params.d_rounded;
+  Tensor gOaccum = make_tensor(
+      make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.oaccum_ptr) +
+                    row_offset_oaccum),
+      Shape<Int<kBlockM>, Int<kHeadDim>>{}, Stride<Int<kHeadDim>, _1>{});
+  constexpr int kBlockN = kNThreads / kBlockM;
+  using GmemLayoutAtomOaccum =
+      Layout<Shape<Int<kBlockM>, Int<kBlockN>>, Stride<Int<kBlockN>, _1>>;
+  using GmemTiledCopyOaccum = decltype(make_tiled_copy(
+      Copy_Atom<DefaultCopy, ElementAccum>{}, GmemLayoutAtomOaccum{},
+      Layout<Shape<_1, _4>>{}));  // Val layout, 4 vals per store
+  GmemTiledCopyOaccum gmem_tiled_copy_Oaccum;
+  auto gmem_thr_copy_Oaccum = gmem_tiled_copy_Oaccum.get_thread_slice(tidx);
+  Tensor tOgOaccum = gmem_thr_copy_Oaccum.partition_S(gOaccum);
+  Tensor tOrO = make_tensor<ElementAccum>(shape(tOgOaccum));
+  Tensor tOrOaccum = make_tensor<ElementAccum>(shape(tOgOaccum));
+  clear(tOrO);
+
+  // Predicates
+  Tensor cOaccum = make_identity_tensor(Shape<Int<kBlockM>, Int<kHeadDim>>{});
+  // Repeat the partitioning with identity layouts
+  Tensor tOcOaccum = gmem_thr_copy_Oaccum.partition_S(cOaccum);
+  Tensor tOpOaccum = make_tensor<bool>(make_shape(size<2>(tOgOaccum)));
+  if (!Is_even_K) {
+#pragma unroll
+    for (int k = 0; k < size(tOpOaccum); ++k) {
+      tOpOaccum(k) = get<1>(tOcOaccum(0, 0, k)) < params.d;
+    }
+  }
+  // Load Oaccum in, then scale and accumulate into O.
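+  // i.e. O = sum over splits s of exp(lse_s - lse_logsum) * Oaccum_s, with the
+  // per-row scales read from sLSE computed above.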
+  for (int split = 0; split < params.num_splits; ++split) {
+    flash::copy</*Is_even_MN=*/false, Is_even_K>(
+        gmem_tiled_copy_Oaccum, tOgOaccum, tOrOaccum, tOcOaccum, tOpOaccum,
+        params.b * params.h * params.seqlen_q - bidx * kBlockM);
+#pragma unroll
+    for (int m = 0; m < size<1>(tOrOaccum); ++m) {
+      int row = get<0>(tOcOaccum(0, m, 0));
+      ElementAccum lse_scale = sLSE[split][row];
+#pragma unroll
+      for (int k = 0; k < size<2>(tOrOaccum); ++k) {
+#pragma unroll
+        for (int i = 0; i < size<0>(tOrOaccum); ++i) {
+          tOrO(i, m, k) += lse_scale * tOrOaccum(i, m, k);
+        }
+      }
+      // if (cute::thread0()) { printf("lse_scale = %f, %f\n", sLSE[split][0],
+      // sLSE[split][1]); print(tOrOaccum); }
+    }
+    tOgOaccum.data() = tOgOaccum.data() +
+                       params.b * params.h * params.seqlen_q * params.d_rounded;
+  }
+  // if (cute::thread0()) { print_tensor(tOrO); }
+
+  Tensor rO = flash::convert_type<Element>(tOrO);
+// Write to gO
+#pragma unroll
+  for (int m = 0; m < size<1>(rO); ++m) {
+    const int idx = bidx * kBlockM + get<0>(tOcOaccum(0, m, 0));
+    if (idx < params.b * params.h * params.seqlen_q) {
+      const int batch_idx = idx / (params.h * params.seqlen_q);
+      const int head_idx =
+          (idx - batch_idx * (params.h * params.seqlen_q)) / params.seqlen_q;
+      // The index to the rows of Q
+      const int row = idx - batch_idx * (params.h * params.seqlen_q) -
+                      head_idx * params.seqlen_q;
+      auto o_ptr = reinterpret_cast<Element*>(params.o_ptr) +
+                   batch_idx * params.o_batch_stride +
+                   head_idx * params.o_head_stride + row * params.o_row_stride;
+#pragma unroll
+      for (int k = 0; k < size<2>(rO); ++k) {
+        if (Is_even_K || tOpOaccum(k)) {
+          const int col = get<1>(tOcOaccum(0, m, k));
+          Tensor gO = make_tensor(make_gmem_ptr(o_ptr + col),
+                                  Shape<Int<decltype(size<0>(rO))::value>>{},
+                                  Stride<_1>{});
+          // TODO: Should check if this is using vectorized store, but it seems
+          // pretty fast
+          copy(rO(_, m, k), gO);
+          // if (bidx == 0 && tidx == 0) { printf("tidx = %d, idx = %d,
+          // batch_idx = %d, head_idx = %d, row = %d, col = %d\n", tidx, idx,
+          // batch_idx, head_idx, row, col); print(rO(_, m, k)); print(gO); }
+          // reinterpret_cast<uint64_t *>(o_ptr)[col / 4] =
+          // recast<uint64_t>(rO)(0, m, k);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace flash

+ 356 - 0
kernels/flash_attn/flash_fwd_launch_template.h

@@ -0,0 +1,356 @@
+/******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "static_switch.h"
+#include "flash.h"
+#include "flash_fwd_kernel.h"
+
+// Determine if the architecture supports FLASH and define a macro to handle parameter modifiers
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#define ARCH_SUPPORTS_FLASH
+#define KERNEL_PARAM_MODIFIER __grid_constant__
+#else
+#define KERNEL_PARAM_MODIFIER
+#endif
+
+// Define a macro for unsupported architecture handling to centralize the error message
+#define FLASH_UNSUPPORTED_ARCH printf("FATAL: FlashAttention requires building with sm version sm80-sm90, but was built for < 8.0!");
+
+// Use a macro to clean up kernel definitions
+#define DEFINE_FLASH_FORWARD_KERNEL(kernelName, ...) \
+template<typename Kernel_traits, __VA_ARGS__> \
+__global__ void kernelName(KERNEL_PARAM_MODIFIER const Flash_fwd_params params)
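+// For example, DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_kernel, bool Is_causal) expands to
+//   template<typename Kernel_traits, bool Is_causal>
+//   __global__ void flash_fwd_kernel(KERNEL_PARAM_MODIFIER const Flash_fwd_params params)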
+
+DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_kernel, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Return_softmax) {
+    #if defined(ARCH_SUPPORTS_FLASH)
+        static_assert(!(Is_causal && Is_local)); // Enforce constraints
+        flash::compute_attn<Kernel_traits, Is_dropout, Is_causal, Is_local, Has_alibi, Is_even_MN, Is_even_K, Is_softcap, Return_softmax>(params);
+    #else
+        FLASH_UNSUPPORTED_ARCH
+    #endif
+}
+
+DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_splitkv_kernel, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Is_softcap, bool Split, bool Append_KV) {
+    #if defined(ARCH_SUPPORTS_FLASH)
+        flash::compute_attn_splitkv<Kernel_traits, Is_causal, Is_local, Has_alibi, Is_even_MN, Is_even_K, Is_softcap, Split, Append_KV>(params);
+    #else
+        FLASH_UNSUPPORTED_ARCH
+    #endif
+}
+
+DEFINE_FLASH_FORWARD_KERNEL(flash_fwd_splitkv_combine_kernel, int kBlockM, int Log_max_splits, bool Is_even_K) {
+    static_assert(Log_max_splits >= 1);
+    flash::combine_attn_seqk_parallel<Kernel_traits, kBlockM, Log_max_splits, Is_even_K>(params);
+}
+
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
+void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr size_t smem_size = Kernel_traits::kSmemSize;
+    // printf("smem_size = %d\n", smem_size);
+
+    // Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
+    // https://github.com/kokkos/kokkos-kernels/issues/349
+    // https://github.com/HazyResearch/flash-attention/issues/21
+
+    const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM;
+    dim3 grid(num_m_block, params.b, params.h);
+    const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0;
+    const bool is_even_K = params.d == Kernel_traits::kHeadDim;
+    const bool return_softmax = params.p_ptr != nullptr;
+    BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] {
+        EVENK_SWITCH(is_even_K, IsEvenKConst, [&] {
+            LOCAL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !Is_causal, Is_local, [&] {
+                BOOL_SWITCH(return_softmax, ReturnSoftmaxConst, [&] {
+                    ALIBI_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] {
+                        SOFTCAP_SWITCH(params.softcap > 0.0, Is_softcap, [&] {
+                            // Will only return softmax if dropout, to reduce compilation time.
+                            // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
+                            // If return_softmax, set IsEvenMNConst to false to reduce number of templates
+                            // If head dim > 128, set IsEvenMNConst to false to reduce number of templates
+                            // If Is_local, set Is_causal to false
+                            auto kernel = &flash_fwd_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && !ReturnSoftmaxConst && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap, ReturnSoftmaxConst && Is_dropout && !Is_softcap>;
+                            // auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, false, true, true, false>;
+                            // printf("IsEvenMNConst = %d, IsEvenKConst = %d, Is_local = %d, Is_causal = %d, ReturnSoftmaxConst = %d, Is_dropout = %d\n", int(IsEvenMNConst), int(IsEvenKConst), int(Is_local), int(Is_causal), int(ReturnSoftmaxConst), int(Is_dropout));
+                            // auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, true, true, false>;
+                            if (smem_size >= 48 * 1024) {
+                                C10_CUDA_CHECK(cudaFuncSetAttribute(
+                                    kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+                            }
+                            // int ctas_per_sm;
+                            // cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+                            //     &ctas_per_sm, kernel, Kernel_traits::kNThreads, smem_size);
+                            // printf("smem_size = %d, CTAs per SM = %d\n", int(smem_size), ctas_per_sm);
+                            kernel<<<grid, Kernel_traits::kNThreads, smem_size, stream>>>(params);
+                            C10_CUDA_KERNEL_LAUNCH_CHECK();
+                        });
+                    });
+                });
+            });
+        });
+    });
+}
+
+template<typename Kernel_traits, bool Is_causal>
+void run_flash_splitkv_fwd(Flash_fwd_params &params, cudaStream_t stream) {
+    static_assert(!Kernel_traits::Is_Q_in_regs, "SplitKV implementation does not support Is_Q_in_regs");
+    static_assert(!Kernel_traits::Share_Q_K_smem, "SplitKV implementation does not support Share_Q_K_smem");
+    constexpr size_t smem_size = Kernel_traits::kSmemSize;
+    const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM;
+    dim3 grid(num_m_block, params.num_splits > 1 ? params.num_splits : params.b, params.num_splits > 1 ? params.b * params.h : params.h);
+    const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0;
+    const bool is_even_K = params.d == Kernel_traits::kHeadDim;
+    BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] {
+        EVENK_SWITCH(is_even_K, IsEvenKConst, [&] {
+            LOCAL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !Is_causal, Is_local, [&] {
+                BOOL_SWITCH(params.num_splits > 1, Split, [&] {
+                    BOOL_SWITCH(params.knew_ptr != nullptr, Append_KV, [&] {
+                        ALIBI_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] {
+                            SOFTCAP_SWITCH(params.softcap > 0.0, Is_softcap, [&] {
+                                // If Append_KV, then we must have seqlen_offsets, which means cu_seqlens_k != nullptr.
+                                // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
+                                // If Is_local, set Is_causal to false
+                                auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && !Append_KV && IsEvenKConst && !Is_local && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap, Split, Append_KV>;
+                                // auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, false, true, Split, Append_KV>;
+                                // auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, false, IsEvenKConst>;
+                                if (smem_size >= 48 * 1024) {
+                                    C10_CUDA_CHECK(cudaFuncSetAttribute(
+                                        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+                                }
+                                kernel<<<grid, Kernel_traits::kNThreads, smem_size, stream>>>(params);
+                                C10_CUDA_KERNEL_LAUNCH_CHECK();
+                            });
+                        });
+                    });
+                });
+            });
+        });
+    });
+    if (params.num_splits > 1) {
+        // We want kBlockM to be as small as possible for more parallelism.
+        // With 128 threads we can load 512 elements at a time, so if headdim is divisible by 128, kBlockM = 4.
+        // If headdim is divisible by 64, then we set kBlockM = 8, etc.
+        constexpr static int kBlockM = Kernel_traits::kHeadDim % 128 == 0 ? 4 : (Kernel_traits::kHeadDim % 64 == 0 ? 8 : 16);
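+        // i.e. 128 threads * 4 floats per vectorized access = 512 elements per
+        // pass, so kBlockM = 512 / 128 = 4 rows when headdim % 128 == 0,
+        // 512 / 64 = 8 rows when headdim % 64 == 0, else 16.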
+        dim3 grid_combine((params.b * params.h * params.seqlen_q + kBlockM - 1) / kBlockM);
+        EVENK_SWITCH(is_even_K, IsEvenKConst, [&] {
+            if (params.num_splits <= 2) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 1, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            } else if (params.num_splits <= 4) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 2, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            } else if (params.num_splits <= 8) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 3, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            } else if (params.num_splits <= 16) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 4, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            } else if (params.num_splits <= 32) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 5, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            } else if (params.num_splits <= 64) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 6, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            } else if (params.num_splits <= 128) {
+                flash_fwd_splitkv_combine_kernel<Kernel_traits, kBlockM, 7, IsEvenKConst><<<grid_combine, Kernel_traits::kNThreads, 0, stream>>>(params);
+            }
+            C10_CUDA_KERNEL_LAUNCH_CHECK();
+        });
+    }
+}
+
+template<typename T, int Headdim, bool Is_causal>
+void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int kBlockM = 64;  // Fixed for all head dimensions
+    // TD [2023-08-28]: nvcc segfaults for headdim 96 with block size 64 x 256,
+    // and for headdim 192 with block size 64 x 128.
+    // Also for headdim 160 with block size 64 x 128 after the rotary addition.
+    constexpr static int kBlockN = Headdim <= 64 ? 256 : (Headdim <= 128 ? 128 : 64);
+    run_flash_splitkv_fwd<Flash_fwd_kernel_traits<Headdim, kBlockM, kBlockN, 4, false, false, T>, Is_causal>(params, stream);
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim32(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 32;
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim64(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 64;
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        if constexpr(!Is_dropout) {
+            // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower
+            // Using block size (64 x 256) is 27% slower for seqlen=2k
+            // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, true, T>, Is_dropout, Is_causal>(params, stream);
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, true, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        }
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim96(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 96;
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    bool is_sm8x = dprops->major == 8 && dprops->minor > 0;
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square).
+        if (is_sm8x) {
+            if constexpr(!Is_causal) {
+                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            } else {
+                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            }
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        }
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, true, T>, Is_dropout, Is_causal>(params, stream);
+        // These two are always slower
+        // run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 128, 4, true, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<96, 64, 128, 4, true, T>>(params, stream);
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 128;
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    bool is_sm8x = dprops->major == 8 && dprops->minor > 0;
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        if constexpr(!Is_dropout) {
+            // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square),
+            // and 128 x 32 (48 KB smem) is the fastest for non-causal since we get 2 CTAs per SM.
+            if (is_sm8x) {
+                if constexpr(!Is_causal) {
+                    run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+                } else {
+                    run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+                }
+            } else {
+                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            }
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, true, true, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 128, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // The 1st ones are good for H100 and A100.
+            // The 2nd one is good for A6000 because we get slightly better occupancy.
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, true, false, T>, Is_dropout, Is_causal>(params, stream);
+            // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, true, true, T>, Is_dropout, Is_causal>(params, stream);
+        }
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim160(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 160;
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    bool is_sm8x = dprops->major == 8 && dprops->minor > 0;
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        // For A100, H100, 128 x 32 is the fastest.
+        // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square),
+        // and 128 x 64 with 8 warps is the fastest for non-causal.
+        if (is_sm8x) {
+            if constexpr(!Is_causal) {
+                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            } else {
+                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+            }
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        }
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, true, T>, Is_dropout, Is_causal>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 128, 4, false, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 8, false, T>>(params, stream);
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim192(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 192;
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        if constexpr(!Is_dropout) {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        }
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 128, 4, false, T>>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 8, false, T>>(params, stream);
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim224(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 224;
+    int device;
+    cudaGetDevice(&device);
+    int max_smem_per_block;
+    cudaError status_ = cudaDeviceGetAttribute(
+        &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
+    if (status_ != cudaSuccess) {
+      C10_CUDA_CHECK(status_);
+    }
+    // printf("max_smem_per_block = %d\n", max_smem_per_block);
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        if (max_smem_per_block >= 2 * Headdim * (128 + 2 * 64)) {  // 112 KB
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        }
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        // We can't do 128 x 32 with 8 warps because with headdim 224, kBlockKSmem = 32.
+        // If we have N = 32, there are only 1024 elements to load at once, where each load
+        // is 8 elements. This means we can only use 128 threads and not 256 threads.
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+    });
+}
+
+template<typename T, bool Is_causal>
+void run_mha_fwd_hdim256(Flash_fwd_params &params, cudaStream_t stream) {
+    constexpr static int Headdim = 256;
+    int device;
+    cudaGetDevice(&device);
+    int max_smem_per_sm, max_smem_per_block;
+    cudaError status_ = cudaDeviceGetAttribute(
+        &max_smem_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, device);
+    status_ = cudaDeviceGetAttribute(
+        &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
+    if (status_ != cudaSuccess) {
+      C10_CUDA_CHECK(status_);
+    }
+    // printf("max_smem_per_sm = %d, max_smem_per_block = %d\n", max_smem_per_sm, max_smem_per_block);
+    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
+        // For A100, we want to run with 128 x 64 (128KB smem).
+        // For H100 we want to run with 64 x 64 (96KB smem) since then we can get 2 CTAs per SM.
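+        // The thresholds below: 2 * 256 * (128 + 2 * 64) B = 128 KB is the smem
+        // for the 128 x 64 tile, and 4 * 256 * (64 + 2 * 64) B = 192 KB is two
+        // 96 KB CTAs of the 64 x 64 tile on one SM.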
+        if (max_smem_per_block >= 2 * Headdim * (128 + 2 * 64) && max_smem_per_sm < 4 * Headdim * (64 + 2 * 64)) {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        } else {
+            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        }
+        // 64 KB
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
+        // 96 KB
+        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
+    });
+}

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim128_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim128_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim128_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim128_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim160_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim160_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim160_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim160_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim192_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim192_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim192_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim192_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim224_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim224_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim224_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim224_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim256_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim256_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim256_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim256_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim32_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim32_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim32_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim32_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim64_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim64_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim64_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim64_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim96_bf16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim96_bf16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim96_fp16_causal_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream);

+ 7 - 0
kernels/flash_attn/flash_fwd_split_hdim96_fp16_sm80.cu

@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream);

+ 180 - 0
kernels/flash_attn/kernel_traits.h

@@ -0,0 +1,180 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/layout.h"
+#include <cutlass/numeric_types.h>
+
+using namespace cute;
+
+template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename elem_type=cutlass::half_t>
+struct Flash_kernel_traits {
+
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
+    using Element = elem_type;
+    static constexpr bool Has_cp_async = true;
+#else
+    using Element = cutlass::half_t;
+    static constexpr bool Has_cp_async = false;
+#endif
+
+    using ElementAccum = float;
+    using index_t = int64_t;
+
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
+    using MMA_Atom_Arch = std::conditional_t<
+        std::is_same_v<elem_type, cutlass::half_t>,
+        MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
+        MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>
+    >;
+#else
+    using MMA_Atom_Arch = MMA_Atom<SM75_16x8x8_F32F16F16F32_TN>;
+#endif
+
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 750
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, elem_type>;
+    using SmemCopyAtomTransposed = Copy_Atom<SM75_U16x8_LDSM_T, elem_type>;
+#else
+    using SmemCopyAtom = Copy_Atom<DefaultCopy, elem_type>;
+    using SmemCopyAtomTransposed = Copy_Atom<DefaultCopy, elem_type>;
+#endif
+};
+
+// If Share_Q_K_smem is true, that forces Is_Q_in_regs to be true
+template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, bool Is_Q_in_regs_=false, bool Share_Q_K_smem_=false, typename elem_type=cutlass::half_t,
+         typename Base=Flash_kernel_traits<kHeadDim_, kBlockM_, kBlockN_, kNWarps_, elem_type> >
+struct Flash_fwd_kernel_traits : public Base {
+    using Element = typename Base::Element;
+    using ElementAccum = typename Base::ElementAccum;
+    using index_t = typename Base::index_t;
+    static constexpr bool Has_cp_async = Base::Has_cp_async;
+    using SmemCopyAtom = typename Base::SmemCopyAtom;
+    using SmemCopyAtomTransposed = typename Base::SmemCopyAtomTransposed;
+
+    static constexpr bool Share_Q_K_smem = Share_Q_K_smem_;
+    static constexpr bool Is_Q_in_regs = Is_Q_in_regs_ || Share_Q_K_smem;
+
+    // The number of threads.
+    static constexpr int kNWarps = kNWarps_;
+    static constexpr int kNThreads = kNWarps * 32;
+
+    static constexpr int kBlockM = kBlockM_;
+    static constexpr int kBlockN = kBlockN_;
+    static constexpr int kHeadDim = kHeadDim_;
+    static_assert(kHeadDim % 32 == 0);
+    static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32;
+    static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
+    static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3;
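+    // For example, kHeadDim = 128 gives kBlockKSmem = 64, kBlockKGmem = 128 and
+    // kSwizzle = 3, while kHeadDim = 96 gives kBlockKSmem = kBlockKGmem = 32 and
+    // kSwizzle = 2.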
+
+    using TiledMma = TiledMMA<
+        typename Base::MMA_Atom_Arch,
+        Layout<Shape<Int<kNWarps>,_1,_1>>,  // 4x1x1 or 8x1x1 thread group
+        Tile<Int<16 * kNWarps>, _16, _16>>;
+
+    using SmemLayoutAtomQ = decltype(
+        composition(Swizzle<kSwizzle, 3, 3>{},
+                    // This has to be kBlockKSmem, using kHeadDim gives wrong results for d=128
+                    Layout<Shape<_8, Int<kBlockKSmem>>,
+                           Stride<Int<kBlockKSmem>, _1>>{}));
+    using SmemLayoutQ = decltype(tile_to_shape(
+        SmemLayoutAtomQ{},
+        Shape<Int<kBlockM>, Int<kHeadDim>>{}));
+
+    using SmemLayoutKV = decltype(tile_to_shape(
+        SmemLayoutAtomQ{},
+        Shape<Int<kBlockN>, Int<kHeadDim>>{}));
+
+    // https://github.com/ColfaxResearch/cutlass-kernels/blob/a222587e6d59b93ba704853d3946fb686d8b8892/src/fmha/fmha_forward.cu#L434
+    using SmemLayoutVtransposed = decltype(
+        composition(SmemLayoutKV{}, make_layout(Shape<Int<kHeadDim>, Int<kBlockN>>{}, GenRowMajor{})));
+    using SmemLayoutVtransposedNoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutVtransposed{}));
+
+    using SmemLayoutAtomO = decltype(
+        composition(Swizzle<kSwizzle, 3, 3>{},
+                    Layout<Shape<Int<8>, Int<kBlockKSmem>>,
+                           Stride<Int<kBlockKSmem>, _1>>{}));
+    using SmemLayoutO = decltype(tile_to_shape(
+        SmemLayoutAtomO{},
+        Shape<Int<kBlockM>, Int<kHeadDim>>{}));
+    using SmemCopyAtomO = Copy_Atom<DefaultCopy, Element>;
+    using SmemCopyAtomOaccum = Copy_Atom<DefaultCopy, ElementAccum>;
+
+    static constexpr int kSmemQSize = size(SmemLayoutQ{}) * sizeof(Element);
+    static constexpr int kSmemKVSize = size(SmemLayoutKV{}) * 2 * sizeof(Element);
+    static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKVSize) : kSmemQSize + kSmemKVSize;
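+    // e.g. for fp16 with kHeadDim = 128, kBlockM = 128, kBlockN = 64:
+    // kSmemQSize = 128 * 128 * 2 B = 32 KB and kSmemKVSize = 64 * 128 * 2 * 2 B
+    // = 32 KB, so kSmemSize = 64 KB (or 32 KB when Share_Q_K_smem).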
+
+    static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element);
+    static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad");
+    // Using kBlockKSmem here is 6-10% faster than kBlockKGmem for d=128 because of bank conflicts.
+    // For example, for d=128, smem is split into 2 "pages", each page takes care of columns
+    // 0-63 and 64-127. If we have 16 threads per row for gmem read, when we write to smem,
+    // thread 0 - 7 will write to the first page and thread 8 - 15 will write to the second page,
+    // to the same banks.
+    static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad;
+    static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow");
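+    // e.g. for fp16, kGmemElemsPerLoad = 16 / 2 = 8; with kBlockKSmem = 64 each
+    // row is read by 64 / 8 = 8 threads, so GmemLayoutAtom below is
+    // (kNThreads / 8) x 8 threads.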
+    using GmemLayoutAtom = Layout<Shape <Int<kNThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
+                                  Stride<Int<kGmemThreadsPerRow>, _1>>;
+
+    // We use CACHEGLOBAL instead of CACHEALWAYS for both Q and K/V, since we won't be reading
+    // from the same address by the same threadblock. This is slightly faster.
+    using Gmem_copy_struct = std::conditional_t<
+        Has_cp_async,
+        SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>,
+        DefaultCopy
+    >;
+    using GmemTiledCopyQKV = decltype(
+        make_tiled_copy(Copy_Atom<Gmem_copy_struct, Element>{},
+                        GmemLayoutAtom{},
+                        Layout<Shape<_1, _8>>{}));  // Val layout, 8 vals per read
+
+    // from how many rows does each thread have to fetch
+    static constexpr int kGmemRowsPerThread = kBlockN / (kNThreads / kGmemThreadsPerRow);
+    // Here we assign a contiguous tile to each thread, rather than a 1x8 row every
+    // (kNThreads / kGmemThreadsPerRow) rows, ensuring that the elements assigned to each thread
+    // do not cross a page boundary. This way, each thread need only fetch 1 page index per
+    // mainloop iteration. Rudimentary testing shows no slowdown.
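+    // e.g. kBlockN = 128 with kNThreads = 128 and kGmemThreadsPerRow = 8 gives
+    // kGmemRowsPerThread = 128 / (128 / 8) = 8 contiguous rows per thread.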
+    using GmemTiledCopyQKVPaged = decltype(
+        make_tiled_copy(Copy_Atom<Gmem_copy_struct, Element>{},
+                        GmemLayoutAtom{},
+                        Layout<Shape<Int<kGmemRowsPerThread>, _8>, Stride<_8, _1>>{}));
+    using GmemTiledCopyO = decltype(
+        make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
+                        GmemLayoutAtom{},
+                        Layout<Shape<_1, _8>>{}));  // Val layout, 8 vals per store
+
+    using GmemLayoutAtomOaccum = std::conditional_t<
+        kBlockKSmem == 32,
+        Layout<Shape <_16, _8>,  // Thread layout, 8 threads per row
+               Stride< _8, _1>>,
+        Layout<Shape <_8, _16>,  // Thread layout, 16 threads per row
+               Stride< _16, _1>>
+    >;
+    using GmemTiledCopyOaccum = decltype(
+        make_tiled_copy(Copy_Atom<DefaultCopy, ElementAccum>{},
+                        GmemLayoutAtomOaccum{},
+                        Layout<Shape < _1, _4>>{}));  // Val layout, 4 vals per store
+    using GmemLayoutAtomRotcossin = GmemLayoutAtom;
+    using GmemTiledCopyRotcossin = decltype(
+        make_tiled_copy(Copy_Atom<UniversalCopy<uint64_t>, Element>{},
+                        GmemLayoutAtomRotcossin{},
+                        Layout<Shape < _1, _4>>{}));  // Val layout, 4 vals per load
+    using GmemTiledCopyRotcossinCont = decltype(
+        make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
+                        GmemLayoutAtomRotcossin{},
+                        Layout<Shape < _1, _8>>{}));  // Val layout, 8 vals per load
+    using GmemTiledCopyRotcossinPaged = decltype(
+        make_tiled_copy(Copy_Atom<UniversalCopy<uint64_t>, Element>{},
+                        GmemLayoutAtomRotcossin{},
+                        Layout<Shape<Int<kGmemRowsPerThread>, _4>, Stride<_4, _1>>{}));  // Val layout, 4 vals per load
+    using GmemTiledCopyRotcossinContPaged = decltype(
+        make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
+                        GmemLayoutAtomRotcossin{},
+                        Layout<Shape<Int<kGmemRowsPerThread>, _8>, Stride<_8, _1>>{}));  // Val layout, 8 vals per load
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////

+ 213 - 0
kernels/flash_attn/mask.h

@@ -0,0 +1,213 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+namespace flash {
+
+using namespace cute;
+
+template <typename Engine, typename Layout>
+__forceinline__ __device__ void apply_mask(Tensor<Engine, Layout> &tensor, const int max_seqlen_k,
+                                  const int col_idx_offset_ = 0) {
+    // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+    static_assert(Layout::rank == 2, "Only support 2D Tensor");
+    const int lane_id = threadIdx.x % 32;
+    const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
+    #pragma unroll
+    for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+        const int col_idx_base = col_idx_offset + nj * 8;
+        #pragma unroll
+        for (int j = 0; j < size<1, 0>(tensor); ++j) {
+            const int col_idx = col_idx_base + j;
+            if (col_idx >= max_seqlen_k) {
+                // Without the "make_coord" we get wrong results
+                #pragma unroll
+                for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                    tensor(mi, make_coord(j, nj)) = -INFINITY;
+                }
+            }
+        }
+    }
+}
+
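+// Worked example for apply_mask_local below (illustrative numbers): with
+// max_seqlen_q == max_seqlen_k == 8, window_size_left = 2, window_size_right = 0 and row_idx = 3,
+// col_idx_limit_left = max(0, 3 - 2) = 1 and col_idx_limit_right = min(8, 3 + 1) = 4,
+// so only columns 1..3 remain unmasked for that row.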
+template <bool HasWSLeft=true, typename Engine, typename Layout>
+__forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout> &tensor, const int col_idx_offset_,
+                                        const int max_seqlen_k, const int row_idx_offset,
+                                        const int max_seqlen_q, const int warp_row_stride,
+                                        const int window_size_left, const int window_size_right) {
+    // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+    static_assert(Layout::rank == 2, "Only support 2D Tensor");
+    const int lane_id = threadIdx.x % 32;
+    const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
+    #pragma unroll
+    for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
+        const int row_idx_base = row_idx_offset + mi * warp_row_stride;
+        #pragma unroll
+        for (int i = 0; i < size<0, 0>(tensor); ++i) {
+            const int row_idx = row_idx_base + i * 8;
+            const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left);
+            const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right);
+            #pragma unroll
+            for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                const int col_idx_base = col_idx_offset + nj * 8;
+                #pragma unroll
+                for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                    const int col_idx = col_idx_base + j;
+                    if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
+                        tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                    }
+                }
+            }
+            // if (cute::thread0()) {
+            //     printf("mi = %d, i = %d, row_idx = %d, max_seqlen_k = %d\n", mi, i, row_idx, max_seqlen_k);
+            //     print(tensor(make_coord(i, mi), _));
+            //     // print(tensor(_, j + nj * size<1, 0>(tensor)));
+            // }
+        }
+    }
+}
+
+template <typename Engine, typename Layout>
+__forceinline__ __device__ void apply_mask_causal(Tensor<Engine, Layout> &tensor, const int col_idx_offset_,
+                                         const int max_seqlen_k, const int row_idx_offset,
+                                         const int max_seqlen_q, const int warp_row_stride) {
+    // Causal masking is equivalent to local masking with window_size_left = infinity and window_size_right = 0
+    apply_mask_local</*HasWSLeft=*/false>(tensor, col_idx_offset_, max_seqlen_k, row_idx_offset,
+                                          max_seqlen_q, warp_row_stride, -1, 0);
+}
+
+template <typename Engine0, typename Layout0, typename Engine1, typename Layout1>
+__forceinline__ __device__ void apply_mask_causal_w_idx(
+    Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &idx_rowcol,
+    const int col_idx_offset_, const int max_seqlen_k, const int row_idx_offset)
+{
+    // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+    static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+    static_assert(Layout1::rank == 2, "Only support 2D Tensor");
+    CUTE_STATIC_ASSERT_V(size<0>(tensor) == size<0>(idx_rowcol));
+    CUTE_STATIC_ASSERT_V(size<1>(tensor) == size<1>(idx_rowcol));
+    #pragma unroll
+    for (int mi = 0; mi < size<0>(tensor); ++mi) {
+        const int col_idx_limit = std::min(max_seqlen_k, 1 + row_idx_offset + get<0>(idx_rowcol(mi, 0)));
+        #pragma unroll
+        for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
+            if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
+                tensor(mi, ni) = -INFINITY;
+            }
+        }
+        // if (cute::thread0()) {
+        //     printf("ni = %d, j = %d, col_idx = %d, max_seqlen_k = %d\n", ni, j, col_idx, max_seqlen_k);
+        //     print(tensor(_, make_coord(j, ni)));
+        //     // print(tensor(_, j + ni * size<1, 0>(tensor)));
+        // }
+    }
+}
+
+template <bool Is_causal, bool Is_local, bool Has_alibi>
+struct Mask {
+
+    const int max_seqlen_k, max_seqlen_q;
+    const int window_size_left, window_size_right;
+    const float alibi_slope;
+
+    __forceinline__ __device__ Mask(const int max_seqlen_k, const int max_seqlen_q,
+                                    const int window_size_left, const int window_size_right,
+                                    const float alibi_slope=0.f)
+        : max_seqlen_k(max_seqlen_k)
+        , max_seqlen_q(max_seqlen_q)
+        , window_size_left(window_size_left)
+        , window_size_right(window_size_right)
+        , alibi_slope(!Has_alibi ? 0.0 : alibi_slope) {
+    };
+
+    // Causal_mask: whether this particular iteration needs causal masking
+    template <bool Causal_mask=false, bool Is_even_MN=true, typename Engine, typename Layout>
+    __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout> &tensor_,
+                                               const int col_idx_offset_,
+                                               const int row_idx_offset,
+                                               const int warp_row_stride) {
+        static_assert(!(Causal_mask && Is_local), "Cannot be both causal and local");
+        static_assert(Layout::rank == 3, "Only support 3D Tensor");
+        static_assert(decltype(size<0>(tensor_))::value == 4, "First dimension must be 4");
+        static constexpr bool Need_masking = Has_alibi || Causal_mask || Is_local || !Is_even_MN;
+        // if (cute::thread0()) { printf("Has_alibi = %d, Causal_mask=%d, Is_local=%d, Is_even_MN = %d, Need_masking = %d\n", Has_alibi, Causal_mask, Is_local, Is_even_MN, Need_masking); }
+        if constexpr (Need_masking) {
+            // Reshape tensor_ from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
+            Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_rowcol(tensor_.layout()));
+            // Do we need both row and column indices, or just column indices?
+            static constexpr bool Col_idx_only = !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask;
+            const int lane_id = threadIdx.x % 32;
+            const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
+            if constexpr (Col_idx_only) {
+                #pragma unroll
+                for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                    const int col_idx_base = col_idx_offset + nj * 8;
+                    #pragma unroll
+                    for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                        const int col_idx = col_idx_base + j;
+                        #pragma unroll
+                        for (int mi = 0; mi < size<0>(tensor); ++mi) {
+                            // No causal, no local
+                            if constexpr (Has_alibi) {
+                                tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+                            }
+                            if constexpr (!Is_even_MN) {
+                                if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; }
+                            }
+                        }
+                    }
+                }
+            } else {
+                #pragma unroll
+                for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
+                    const int row_idx_base = row_idx_offset + mi * warp_row_stride;
+                    #pragma unroll
+                    for (int i = 0; i < size<0, 0>(tensor); ++i) {
+                        const int row_idx = row_idx_base + i * 8;
+                        const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left);
+                        const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right);
+                        #pragma unroll
+                        for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                            const int col_idx_base = col_idx_offset + nj * 8;
+                            #pragma unroll
+                            for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                                const int col_idx = col_idx_base + j;
+                                if constexpr (Has_alibi) {
+                                    if constexpr (Is_causal) {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope * col_idx;
+                                    } else {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+
+                                    }
+                                }
+                                if constexpr (Causal_mask) {
+                                    if (col_idx >= col_idx_limit_right) {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                                    }
+                                }
+                                if constexpr (Is_local) {
+                                    if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                                    }
+                                }
+                                if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
+                                    // Causal and Local masking already handle MN masking
+                                    if (col_idx >= max_seqlen_k) {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    };
+
+};
+
+} // namespace flash

+ 51 - 0
kernels/flash_attn/philox.cuh

@@ -0,0 +1,51 @@
+// Pytorch also has an implementation of Philox RNG: https://github.com/pytorch/pytorch/blob/8ca3c881db3e3510fcb7725389f6a0633c9b992c/torch/csrc/jit/tensorexpr/cuda_random.h
+#pragma once
+// Philox CUDA.
+
+namespace flash {
+
+struct ull2 {
+    unsigned long long x;
+    unsigned long long y;
+};
+
+__forceinline__ __device__ uint2 mulhilo32(const unsigned int a, const unsigned int b) {
+    uint2 *res;
+    unsigned long long tmp;
+    asm ("mul.wide.u32 %0, %1, %2;\n\t"
+          : "=l"(tmp)
+          : "r"(a), "r"(b));
+    res = (uint2*)(&tmp);
+    return *res;
+}
+
+__forceinline__ __device__ uint4 philox_single_round(const uint4 ctr, const uint2 key) {
+    constexpr unsigned long kPhiloxSA = 0xD2511F53;
+    constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
+    uint2 res0 = mulhilo32(kPhiloxSA, ctr.x);
+    uint2 res1 = mulhilo32(kPhiloxSB, ctr.z);
+    uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x};
+    return ret;
+}
+
+__forceinline__ __device__ uint4 philox(unsigned long long seed,
+                               unsigned long long subsequence,
+                               unsigned long long offset) {
+    constexpr unsigned long kPhilox10A = 0x9E3779B9;
+    constexpr unsigned long kPhilox10B = 0xBB67AE85;
+    uint2 key = reinterpret_cast<uint2&>(seed);
+    uint4 counter;
+    ull2 *tmp = reinterpret_cast<ull2*>(&counter);
+    tmp->x = offset;
+    tmp->y = subsequence;
+    #pragma unroll
+    for (int i = 0; i < 6; i++) {
+        counter = philox_single_round(counter, key);
+        key.x += (kPhilox10A);
+        key.y += (kPhilox10B);
+    }
+    uint4 output = philox_single_round(counter, key);
+    return output;
+}
+
+} // namespace flash

+ 22 - 0
kernels/flash_attn/registration.h

@@ -0,0 +1,22 @@
+#pragma once
+
+#include <Python.h>
+
+#define _CONCAT(A, B) A##B
+#define CONCAT(A, B) _CONCAT(A, B)
+
+#define _STRINGIFY(A) #A
+#define STRINGIFY(A) _STRINGIFY(A)
+
+// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+// REGISTER_EXTENSION allows the shared library to be loaded and initialized
+// via python's import statement.
+#define REGISTER_EXTENSION(NAME)                                               \
+  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
+    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
+                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
+    return PyModule_Create(&module);                                           \
+  }

+ 152 - 0
kernels/flash_attn/rotary.h

@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+#include "utils.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace flash {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
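+// copy_rotary_interleaved applies interleaved (GPT-J style) rotary embedding: adjacent pairs
+// (x[2i], x[2i+1]) are rotated as
+//     out[2i]     = x[2i] * cos[i] - x[2i+1] * sin[i]
+//     out[2i + 1] = x[2i] * sin[i] + x[2i+1] * cos[i]
+// which is what the fp32 inner loop below computes per thread fragment.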
+template <bool Is_even_K=true, bool Clear_OOB_K=true,
+          typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
+__forceinline__ __device__ void copy_rotary_interleaved(Tensor<Engine0, Layout0> const &S,
+                                               Tensor<Engine1, Layout1> &D,
+                                               Tensor<Engine2, Layout2> const &Cos,
+                                               Tensor<Engine2, Layout2> const &Sin,
+                                               Tensor<Engine3, Layout3> const &identity_MN,
+                                               const int max_MN, const int min_MN,
+                                               const int dim, const int rotary_dim) {
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Cos));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Cos));                     // MMA_K
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Sin));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Sin));                     // MMA_K
+    CUTE_STATIC_ASSERT_V(size<0>(Cos) == size<0>(Sin));                     // MMA_K
+    static_assert(decltype(size<0>(S))::value == decltype(size<0>(Cos))::value * 2);
+    static_assert(decltype(size<0>(Cos))::value % 2 == 0);  // Since we do fast conversion from fp16/bf16 to fp32
+    Tensor rCos = make_fragment_like(Cos);
+    Tensor rSin = make_fragment_like(Sin);
+    Tensor rS = make_fragment_like(S);
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) {
+            #pragma unroll
+            for (int k = 0; k < size<2>(S); ++k) {
+                if (Is_even_K || get<1>(identity_MN(0, 0, k)) < dim) {
+                    cute::copy(S(_, m, k), rS(_, m, k));
+                    if (get<1>(identity_MN(0, 0, k)) < rotary_dim) {
+                        cute::copy(Cos(_, m, k), rCos(_, m, k));
+                        cute::copy(Sin(_, m, k), rSin(_, m, k));
+                        Tensor S_fp32 = convert_type<float>(rS(_, m, k));
+                        Tensor cos_fp32 = convert_type<float>(rCos(_, m, k));
+                        Tensor sin_fp32 = convert_type<float>(rSin(_, m, k));
+                        #pragma unroll
+                        for (int i = 0; i < size<0>(rS) / 2; ++i) {
+                            float real = S_fp32(2 * i) * cos_fp32(i) - S_fp32(2 * i + 1) * sin_fp32(i);
+                            float imag = S_fp32(2 * i) * sin_fp32(i) + S_fp32(2 * i + 1) * cos_fp32(i);
+                            S_fp32(2 * i) = real;
+                            S_fp32(2 * i + 1) = imag;
+                        }
+                        // Unclear why, but a copy is needed here for convert_type to work
+                        Tensor S_fp32_copy = make_fragment_like(S_fp32);
+                        cute::copy(S_fp32, S_fp32_copy);
+                        using T = typename Engine0::value_type;
+                        Tensor S_og_type = convert_type<T>(S_fp32_copy);
+                        cute::copy(S_og_type, rS(_, m, k));
+                    }
+                    cute::copy(rS(_, m, k), D(_, m, k));
+                } else if (Clear_OOB_K) {
+                    cute::clear(D(_, m, k));
+                }
+            }
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
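+// copy_rotary_contiguous applies non-interleaved (GPT-NeoX style) rotary embedding: the rotary
+// span is split into two halves and each element is paired with the one rotary_dim/2 positions
+// away, i.e. (sketch, with i < rotary_dim/2)
+//     out[i]                  = x[i] * cos[i] - x[i + rotary_dim/2] * sin[i]
+//     out[i + rotary_dim/2]   = x[i + rotary_dim/2] * cos[i] + x[i] * sin[i]
+// which the is_left / gS_other logic below implements per thread fragment.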
+template <bool Is_even_K=true, bool Clear_OOB_K=true,
+          typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
+__forceinline__ __device__ void copy_rotary_contiguous(Tensor<Engine0, Layout0> const &S,
+                                              Tensor<Engine1, Layout1> &D,
+                                              Tensor<Engine2, Layout2> const &Cos,
+                                              Tensor<Engine2, Layout2> const &Sin,
+                                              Tensor<Engine3, Layout3> const &identity_MN,
+                                              const int max_MN, const int min_MN,
+                                              const int dim, const int rotary_dim) {
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Cos));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Cos));                     // MMA_K
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Sin));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Sin));                     // MMA_K
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(Cos));                     // MMA
+    CUTE_STATIC_ASSERT_V(size<0>(Cos) == size<0>(Sin));
+    static_assert(decltype(size<0>(Cos))::value % 2 == 0);  // Since we do fast conversion from fp16/bf16 to fp32
+    Tensor rCos = make_fragment_like(Cos);
+    Tensor rSin = make_fragment_like(Sin);
+    Tensor rS = make_fragment_like(S);
+    Tensor rS_other = make_fragment_like(rS(_, 0, 0));
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) {
+            #pragma unroll
+            for (int k = 0; k < size<2>(S); ++k) {
+                if (Is_even_K || get<1>(identity_MN(0, 0, k)) < dim) {
+                    cute::copy(S(_, m, k), rS(_, m, k));
+                    if (get<1>(identity_MN(0, 0, k)) < rotary_dim) {
+                        const bool is_left = get<1>(identity_MN(0, 0, k)) < rotary_dim / 2;
+                        Tensor gS_other = make_tensor(S(_, m, k).data() + (is_left ? rotary_dim / 2 : -rotary_dim / 2), S(_, m, k).layout());
+                        cute::copy(gS_other, rS_other);
+                        // if (cute::thread0()) { print_tensor(rS(_, m, k)); print_tensor(rS_other); }
+                        Tensor gCos = make_tensor(Cos(_, m, k).data() + (is_left ? 0 : -rotary_dim / 2), Cos(_, m, k).layout());
+                        Tensor gSin = make_tensor(Sin(_, m, k).data() + (is_left ? 0 : -rotary_dim / 2), Sin(_, m, k).layout());
+                        cute::copy(gCos, rCos(_, m, k));
+                        cute::copy(gSin, rSin(_, m, k));
+                        // if (cute::thread0()) { print_tensor(rCos(_, m, k)); print_tensor(rSin(_, m, k)); }
+                        Tensor S_fp32 = convert_type<float>(rS(_, m, k));
+                        Tensor S_other_fp32 = convert_type<float>(rS_other);
+                        Tensor cos_fp32 = convert_type<float>(rCos(_, m, k));
+                        Tensor sin_fp32 = convert_type<float>(rSin(_, m, k));
+                        #pragma unroll
+                        for (int i = 0; i < size<0>(rS); ++i) {
+                            S_fp32(i) = S_fp32(i) * cos_fp32(i) + S_other_fp32(i) * (is_left ? -sin_fp32(i) : sin_fp32(i));
+                        }
+                        // Unclear why, but a copy is needed here for convert_type to work
+                        Tensor S_fp32_copy = make_fragment_like(S_fp32);
+                        cute::copy(S_fp32, S_fp32_copy);
+                        using T = typename Engine0::value_type;
+                        Tensor S_og_type = convert_type<T>(S_fp32_copy);
+                        cute::copy(S_og_type, rS(_, m, k));
+                        // if (cute::thread0()) { print_tensor(rS(_, m, k)); }
+                    }
+                    cute::copy(rS(_, m, k), D(_, m, k));
+                } else if (Clear_OOB_K) {
+                    cute::clear(D(_, m, k));
+                }
+            }
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace flash

+ 188 - 0
kernels/flash_attn/softmax.h

@@ -0,0 +1,188 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cmath>
+
+#include <cute/tensor.hpp>
+
+#include <cutlass/numeric_types.h>
+
+#include "philox.cuh"
+#include "utils.h"
+
+namespace flash {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
+__device__ __forceinline__ void thread_reduce_(Tensor<Engine0, Layout0> const &tensor, Tensor<Engine1, Layout1> &summary, Operator &op) {
+    static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+    static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+    CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor));
+    #pragma unroll
+    for (int mi = 0; mi < size<0>(tensor); mi++) {
+        summary(mi) = zero_init ? tensor(mi, 0) : op(summary(mi), tensor(mi, 0));
+        #pragma unroll
+        for (int ni = 1; ni < size<1>(tensor); ni++) {
+            summary(mi) = op(summary(mi), tensor(mi, ni));
+        }
+    }
+}
+
+template<typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
+__device__ __forceinline__ void quad_allreduce_(Tensor<Engine0, Layout0> &dst, Tensor<Engine1, Layout1> &src, Operator &op) {
+    CUTE_STATIC_ASSERT_V(size(dst) == size(src));
+    #pragma unroll
+    for (int i = 0; i < size(dst); i++){
+        dst(i) = Allreduce<4>::run(src(i), op);
+    }
+}
+
+template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
+__device__ __forceinline__ void reduce_(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Layout1> &summary, Operator &op) {
+    thread_reduce_<zero_init>(tensor, summary, op);
+    quad_allreduce_(summary, summary, op);
+}
+
+template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
+__device__ __forceinline__ void reduce_max(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Layout1> &max){
+    MaxOp<float> max_op;
+    reduce_<zero_init>(tensor, max, max_op);
+}
+
+template<bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
+__device__ __forceinline__ void reduce_sum(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Layout1> &sum){
+    SumOp<float> sum_op;
+    thread_reduce_<zero_init>(tensor, sum, sum_op);
+}
+
+// Apply the exp to all the elements.
+template <bool Scale_max=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
+__forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &max, const float scale) {
+    static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+    static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+    CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
+    #pragma unroll
+    for (int mi = 0; mi < size<0>(tensor); ++mi) {
+        // If max is -inf, then all elements must have been -inf (possibly due to masking).
+        // We don't want (-inf - (-inf)) since that would give NaN.
+        // If we don't have float around M_LOG2E the multiplication is done in fp64.
+        const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+        #pragma unroll
+        for (int ni = 0; ni < size<1>(tensor); ++ni)  {
+            // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
+            // max * log_2(e)) This allows the compiler to use the ffma
+            // instruction instead of fadd and fmul separately.
+            // The following macro will disable the use of fma.
+            // See: https://github.com/pytorch/pytorch/issues/121558 for more details
+            // This macro is set in PyTorch and not FlashAttention
+            #ifdef UNFUSE_FMA
+                tensor(mi, ni) = exp2f(__fmul_rn(tensor(mi, ni), scale) - max_scaled);
+            #else
+                tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+            #endif
+        }
+    }
+}
+
+// Apply the exp to all the elements.
+template <bool zero_init=true, typename Engine0, typename Layout0, typename Engine1, typename Layout1>
+__forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> &max, Tensor<Engine1, Layout1> &sum, const float scale) {
+    static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+    static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+    CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
+    #pragma unroll
+    for (int mi = 0; mi < size<0>(tensor); ++mi) {
+        MaxOp<float> max_op;
+        max(mi) = zero_init ? tensor(mi, 0) : max_op(max(mi), tensor(mi, 0));
+        #pragma unroll
+        for (int ni = 1; ni < size<1>(tensor); ni++) {
+            max(mi) = max_op(max(mi), tensor(mi, ni));
+        }
+        max(mi) = Allreduce<4>::run(max(mi), max_op);
+        // If max is -inf, then all elements must have been -inf (possibly due to masking).
+        // We don't want (-inf - (-inf)) since that would give NaN.
+        const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+        sum(mi) = 0;
+        #pragma unroll
+        for (int ni = 0; ni < size<1>(tensor); ++ni)  {
+            // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
+            // max * log_2(e)) This allows the compiler to use the ffma
+            // instruction instead of fadd and fmul separately.
+            tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+            sum(mi) += tensor(mi, ni);
+        }
+        SumOp<float> sum_op;
+        sum(mi) = Allreduce<4>::run(sum(mi), sum_op);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
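+// Sketch of the online-softmax recurrence maintained below: for each row, with running
+// (unscaled) max m and running sum l over the key blocks processed so far,
+//     m_new  = max(m, rowmax(S_block))
+//     acc_o *= exp2((m - m_new) * softmax_scale_log2)
+//     l      = l * exp2((m - m_new) * softmax_scale_log2)
+//              + rowsum(exp2(S_block * softmax_scale_log2 - m_new * softmax_scale_log2))
+// normalize_softmax_lse then divides acc_o by l and returns lse = m * softmax_scale + log(l).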
+template <int kNRows>
+struct Softmax {
+
+    using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
+    TensorT row_max, row_sum;
+
+    __forceinline__ __device__ Softmax() {};
+
+    template<bool Is_first, bool Check_inf=false, typename Tensor0, typename Tensor1>
+    __forceinline__ __device__ void softmax_rescale_o(Tensor0 &acc_s, Tensor1 &acc_o, float softmax_scale_log2) {
+        // Reshape acc_s from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
+        Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout()));
+        static_assert(decltype(size<0>(scores))::value == kNRows);
+        if (Is_first) {
+            flash::template reduce_max</*zero_init=*/true>(scores, row_max);
+            flash::scale_apply_exp2(scores, row_max, softmax_scale_log2);
+            flash::reduce_sum</*zero_init=*/true>(scores, row_sum);
+        } else {
+            Tensor scores_max_prev = make_fragment_like(row_max);
+            cute::copy(row_max, scores_max_prev);
+            flash::template reduce_max</*zero_init=*/false>(scores, row_max);
+            // Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2, MMA_K))
+            Tensor acc_o_rowcol = make_tensor(acc_o.data(), flash::convert_layout_acc_rowcol(acc_o.layout()));
+            static_assert(decltype(size<0>(acc_o_rowcol))::value == kNRows);
+            #pragma unroll
+            for (int mi = 0; mi < size(row_max); ++mi) {
+                float scores_max_cur = !Check_inf
+                    ? row_max(mi)
+                    : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
+                row_sum(mi) *= scores_scale;
+                #pragma unroll
+                for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { acc_o_rowcol(mi, ni) *= scores_scale; }
+            }
+            flash::scale_apply_exp2(scores, row_max, softmax_scale_log2);
+            // We don't do the reduce across threads here since we don't need to use the row_sum.
+            // We do that reduce at the end when we need to normalize the softmax.
+            flash::reduce_sum</*zero_init=*/false>(scores, row_sum);
+        }
+    };
+
+    template<bool Is_dropout=false, bool Split=false, typename Tensor0>
+    __forceinline__ __device__ TensorT normalize_softmax_lse(Tensor0 &acc_o, float softmax_scale, float rp_dropout=1.0) {
+        SumOp<float> sum_op;
+        quad_allreduce_(row_sum, row_sum, sum_op);
+        TensorT lse = make_fragment_like(row_sum);
+        Tensor acc_o_rowcol = make_tensor(acc_o.data(), flash::convert_layout_acc_rowcol(acc_o.layout()));
+        static_assert(decltype(size<0>(acc_o_rowcol))::value == kNRows);
+        #pragma unroll
+        for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
+            float sum = row_sum(mi);
+            float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
+            lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+            float scale = !Is_dropout ? inv_sum : inv_sum * rp_dropout;
+            #pragma unroll
+            for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { acc_o_rowcol(mi, ni) *= scale; }
+        }
+        return lse;
+    };
+};
+
+}  // namespace flash

+ 117 - 0
kernels/flash_attn/static_switch.h

@@ -0,0 +1,117 @@
+// Inspired by
+// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
+// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
+
+#pragma once
+
+/// @param COND       - a boolean expression to switch by
+/// @param CONST_NAME - a name given for the constexpr bool variable.
+/// @param ...       - code to execute for true and false
+///
+/// Usage:
+/// ```
+/// BOOL_SWITCH(flag, BoolConst, [&] {
+///     some_function<BoolConst>(...);
+/// });
+/// ```
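+///
+/// These switches are typically nested so that runtime parameters become template arguments,
+/// e.g. (illustrative sketch, not the exact call site):
+/// ```
+/// FP16_SWITCH(!params.is_bf16, [&] {
+///     HEADDIM_SWITCH(params.d, [&] {
+///         BOOL_SWITCH(params.is_causal, Is_causal, [&] {
+///             run_mha_fwd_<elem_type, kHeadDim, Is_causal>(params, stream);
+///         });
+///     });
+/// });
+/// ```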
+
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+#ifdef FLASHATTENTION_DISABLE_DROPOUT
+  #define DROPOUT_SWITCH(COND, CONST_NAME, ...) \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = false;   \
+    return __VA_ARGS__();                       \
+  }()
+#else
+  #define DROPOUT_SWITCH BOOL_SWITCH
+#endif
+
+#ifdef FLASHATTENTION_DISABLE_ALIBI
+  #define ALIBI_SWITCH(COND, CONST_NAME, ...)   \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = false;   \
+    return __VA_ARGS__();                       \
+  }()
+#else
+  #define ALIBI_SWITCH BOOL_SWITCH
+#endif
+
+#ifdef FLASHATTENTION_DISABLE_UNEVEN_K
+  #define EVENK_SWITCH(COND, CONST_NAME, ...)   \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = true;    \
+    return __VA_ARGS__();                       \
+  }()
+#else
+  #define EVENK_SWITCH BOOL_SWITCH
+#endif
+
+#ifdef FLASHATTENTION_DISABLE_SOFTCAP
+  #define SOFTCAP_SWITCH(COND, CONST_NAME, ...)   \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = false;    \
+    return __VA_ARGS__();                       \
+  }()
+#else
+  #define SOFTCAP_SWITCH BOOL_SWITCH
+#endif
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  #define LOCAL_SWITCH(COND, CONST_NAME, ...)   \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = false;    \
+    return __VA_ARGS__();                       \
+  }()
+#else
+  #define LOCAL_SWITCH BOOL_SWITCH
+#endif
+
+#define FP16_SWITCH(COND, ...)               \
+  [&] {                                      \
+    if (COND) {                              \
+      using elem_type = cutlass::half_t;     \
+      return __VA_ARGS__();                  \
+    } else {                                 \
+      using elem_type = cutlass::bfloat16_t; \
+      return __VA_ARGS__();                  \
+    }                                        \
+  }()
+
+#define HEADDIM_SWITCH(HEADDIM, ...)   \
+  [&] {                                    \
+    if (HEADDIM <= 32) {                   \
+      constexpr static int kHeadDim = 32;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 64) {            \
+      constexpr static int kHeadDim = 64;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 96) {            \
+      constexpr static int kHeadDim = 96;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 128) {           \
+      constexpr static int kHeadDim = 128; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 160) {           \
+      constexpr static int kHeadDim = 160; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 192) {           \
+      constexpr static int kHeadDim = 192; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 224) {           \
+      constexpr static int kHeadDim = 224; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM <= 256) {           \
+      constexpr static int kHeadDim = 256; \
+      return __VA_ARGS__();                \
+    }                                      \
+  }()

+ 440 - 0
kernels/flash_attn/utils.h

@@ -0,0 +1,440 @@
+/******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cuda_fp16.h>
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#include <cuda_bf16.h>
+#endif
+
+#include <cute/tensor.hpp>
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace flash {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+__forceinline__ __device__ uint32_t relu2(const uint32_t x);
+
+template<>
+__forceinline__ __device__ uint32_t relu2<cutlass::half_t>(const uint32_t x) {
+    uint32_t res;
+    const uint32_t zero = 0u;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+    asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
+#else
+    asm volatile( \
+        "{\n" \
+        "\t .reg .f16x2 sela;\n" \
+        "\t set.gtu.u32.f16x2 sela, %1, %2;\n" \
+        "\t and.b32 %0, sela, %1;\n" 
+        "}\n" : "=r"(res) : "r"(x), "r"(zero));
+#endif
+    return res;
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+template<>
+__forceinline__ __device__ uint32_t relu2<cutlass::bfloat16_t>(const uint32_t x) {
+    uint32_t res;
+    const uint32_t zero = 0u;
+    asm volatile("max.bf16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
+    return res;
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+
+template<typename T>
+__forceinline__ __device__ uint32_t convert_relu2(const float2 x);
+
+template<>
+__forceinline__ __device__ uint32_t convert_relu2<cutlass::half_t>(const float2 x) {
+    uint32_t res;
+    const uint32_t a = reinterpret_cast<const uint32_t&>(x.x);
+    const uint32_t b = reinterpret_cast<const uint32_t&>(x.y);
+    asm volatile("cvt.rn.relu.f16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
+    return res;
+}
+
+template<>
+__forceinline__ __device__ uint32_t convert_relu2<cutlass::bfloat16_t>(const float2 x) {
+    uint32_t res;
+    const uint32_t a = reinterpret_cast<const uint32_t&>(x.x);
+    const uint32_t b = reinterpret_cast<const uint32_t&>(x.y);
+    asm volatile("cvt.rn.relu.bf16x2.f32 %0, %1, %2;\n" : "=r"(res) : "r"(b), "r"(a));
+    return res;
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+struct MaxOp {
+__device__ __forceinline__ T operator()(T const & x, T const & y) { return x > y ? x : y; }
+};
+
+template <>
+struct MaxOp<float> {
+// This is slightly faster
+__device__ __forceinline__ float operator()(float const &x, float const &y) { return max(x, y); }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+struct SumOp {
+__device__ __forceinline__ T operator()(T const & x, T const & y) { return x + y; }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<int THREADS>
+struct Allreduce {
+    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
+    template<typename T, typename Operator>
+    static __device__ __forceinline__ T run(T x, Operator &op) {
+        constexpr int OFFSET = THREADS / 2;
+        x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
+        return Allreduce<OFFSET>::run(x, op);
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<>
+struct Allreduce<2> {
+template<typename T, typename Operator> 
+static __device__ __forceinline__ T run(T x, Operator &op) {
+    x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
+    return x;
+}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<bool A_in_regs=false, bool B_in_regs=false, typename Tensor0, typename Tensor1,
+         typename Tensor2, typename Tensor3, typename Tensor4,
+         typename TiledMma, typename TiledCopyA, typename TiledCopyB,
+         typename ThrCopyA, typename ThrCopyB>
+__forceinline__ __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsA,
+                            Tensor4 const& tCsB, TiledMma tiled_mma,
+                            TiledCopyA smem_tiled_copy_A, TiledCopyB smem_tiled_copy_B,
+                            ThrCopyA smem_thr_copy_A, ThrCopyB smem_thr_copy_B) {
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));            // M
+    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
+    if (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{})); }
+    if (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{})); }
+    #pragma unroll
+    for (int i = 0; i < size<2>(tCrA); ++i) {
+        if (i < size<2>(tCrA) - 1) {
+            if (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, i + 1), tCrA_copy_view(_, _, i + 1)); }
+            if (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1)); }
+        }
+        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3,
+         typename TiledMma, typename TiledCopy, typename ThrCopy>
+__forceinline__ __device__ void gemm_rs(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsB,
+                               TiledMma tiled_mma, TiledCopy smem_tiled_copy_B,
+                               ThrCopy smem_thr_copy_B) {
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc));                     // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB));                     // MMA_K
+    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view));            // N
+    cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
+    #pragma unroll
+    for (int i = 0; i < size<2>(tCrA); ++i) {
+        if (i < size<2>(tCrA) - 1) {
+            cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
+        }
+        cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
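+// (With the SM80 16x8 MMA atoms each thread holds 4 accumulator values laid out as 2 rows x 2
+// columns; splitting that mode lets per-element row/column indices be recovered, e.g. for
+// masking and softmax.)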
+template<typename Layout>
+__forceinline__ __device__ auto convert_layout_acc_rowcol(Layout acc_layout) {
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    auto l = logical_divide(acc_layout, Shape<_2>{});  // ((2, 2), MMA_M, MMA_N)
+    return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l)));
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+// if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8.
+template<typename MMA_traits, typename Layout>
+__forceinline__ __device__ auto convert_layout_acc_Aregs(Layout acc_layout) {
+    using X = Underscore;
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    constexpr int mma_shape_K = get<2>(typename MMA_traits::Shape_MNK{});
+    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
+    if constexpr (mma_shape_K == 8) {
+        return acc_layout;
+    } else {
+        auto l = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2)))
+        return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
+template<typename Layout>
+__forceinline__ __device__ auto convert_layout_acc_dropout(Layout acc_layout) {
+    using X = Underscore;
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    auto l = logical_divide(acc_layout, Shape<X, X, _2>{});  // (4, MMA_M, (2, MMA_N / 2)))
+    return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename To_type, typename Engine, typename Layout>
+__forceinline__ __device__ auto convert_type(Tensor<Engine, Layout> const &tensor) {
+    using From_type = typename Engine::value_type;
+    constexpr int numel = decltype(size(tensor))::value;
+    cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
+    // HACK: this requires tensor to be "contiguous"
+    auto frag = convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel> *>(tensor.data()));
+    return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Engine, typename Layout>
+__forceinline__ __device__ void relu_(Tensor<Engine, Layout> &tensor) {
+    constexpr int numel = decltype(size(tensor))::value;
+    static_assert(numel % 2 == 0);
+    using value_t = typename Engine::value_type;
+    // HACK: this requires tensor to be "contiguous"
+    Tensor tensor_uint32 = recast<uint32_t>(tensor);
+    #pragma unroll
+    for (int i = 0; i < size(tensor_uint32); ++i) {
+        tensor_uint32(i) = relu2<value_t>(tensor_uint32(i));
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// On SM80 and above, we can fuse fp32 -> fp16/bf16 conversion and relu into 1 instruction
+template <typename To_type, typename Engine, typename Layout>
+__forceinline__ __device__ auto convert_type_relu(Tensor<Engine, Layout> const &tensor) {
+    using From_type = typename Engine::value_type;
+    static_assert(std::is_same_v<To_type, cutlass::half_t> || std::is_same_v<To_type, cutlass::bfloat16_t>);
+    static_assert(std::is_same_v<float, From_type>);
+    constexpr int numel = decltype(size(tensor))::value;
+    static_assert(numel % 2 == 0);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+    // HACK: this requires tensor to be "contiguous"
+    Tensor tensor_float2 = recast<float2>(tensor);
+    Tensor out_uint32 = make_tensor<uint32_t>(tensor_float2.layout());
+    #pragma unroll
+    for (int i = 0; i < size(out_uint32); ++i) {
+        out_uint32(i) = convert_relu2<To_type>(tensor_float2(i));
+    }
+    Tensor out = make_tensor(make_rmem_ptr<To_type>(out_uint32.data()), tensor.layout());
+#else
+    Tensor out = flash::convert_type<To_type>(tensor);
+    flash::relu_(out);
+#endif
+    return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Blocks until all but N previous cp.async.commit_group operations have committed.
+// This differs from cute::cp_async_wait in that when N = 0 we don't call cp.async.wait_all
+// (which is equivalent to commit_group then wait_group 0).
+// Instead we just call cp.async.wait_group 0, which is slightly faster.
+// https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
+template <int N>
+CUTE_HOST_DEVICE
+void cp_async_wait() {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+    asm volatile("cp.async.wait_group %0;\n" :: "n"(N));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// resolves offset of a slice of a paged kv copy from gmem.
+// assumes that the tensor has already been positioned at the correct head.
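+// Illustrative example (hypothetical values): with kGmemThreadsPerRow = 8, kGmemElemsPerLoad = 8,
+// kGmemRowsPerThread = 8, kBlockN = 128, n_block_max = 1 and page_block_size = 16, thread
+// tidx = 17 gets col_offset = (17 % 8) * 8 = 8 and global_row_offset = (17 / 8) * 8 = 16, i.e.
+// virtual page 1 at page_offset 0, so the result is
+// block_table[1] * page_stride + 0 * row_stride + 8.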
+template <typename Kernel_traits>
+__forceinline__ __device__
+int64_t resolve_thread_kv_page_slice_offset(const int tidx, const int n_block_max, const int page_block_size, 
+                            const int* block_table, const int page_stride, const int row_stride) {
+    constexpr int kGmemThreadsPerRow = Kernel_traits::kGmemThreadsPerRow;
+    constexpr int kGmemRowsPerThread = Kernel_traits::kGmemRowsPerThread;
+    constexpr int kGmemElemsPerLoad = Kernel_traits::kGmemElemsPerLoad;
+    constexpr int kBlockN = Kernel_traits::kBlockN;
+    
+    const int64_t col_offset = tidx % kGmemThreadsPerRow * kGmemElemsPerLoad;
+    const int64_t block_row_offset = tidx / kGmemThreadsPerRow * kGmemRowsPerThread;
+    const int64_t global_row_offset = block_row_offset + (n_block_max - 1) * kBlockN;
+    const int64_t page_offset = global_row_offset % page_block_size;
+    const int64_t virtual_page_idx = global_row_offset / page_block_size;
+
+    return ((int64_t) block_table[virtual_page_idx]) * ((int64_t) page_stride)
+        + page_offset * ((int64_t) row_stride)
+        + col_offset;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Layout reshape function. Given a layout with modes ((v1, v2), m, k), returns (v1, v2, k),         
+// where v2 may be a tuple itself, in the case of swizzled smem-backed thread tiles. This ensures
+// that paged and non-paged copies result in equivalently shaped, if not necessarily strided, tensors.
+template <class Shape, class Stride>
+__forceinline__ __device__
+auto reshape_thread_tile(Layout<Shape, Stride> l) {
+    return make_layout(append(get<0>(l.shape()), get<2>(l.shape())),
+                        append(get<0>(l.stride()), get<2>(l.stride())));
+}
+
+// reshapes and flattens the thread tile layout. A separate function is needed for the case where
+// one of the modes of l is a layout itself and must be flattened, as opposed to keeping it intact
+// for the case of swizzled layouts
+template <class Shape, class Stride>
+__forceinline__ __device__
+auto reshape_flatten_thread_tile(Layout<Shape, Stride> l) {
+    auto mode_0 = filter(flatten(get<0>(l)));
+    return make_layout(append(mode_0.shape(), get<2>(l.shape())),
+                        append(mode_0.stride(), get<2>(l.stride())));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool Is_even_MN=true, bool Is_even_K=true, bool Clear_OOB_MN=false, bool Clear_OOB_K=true,
+          typename TiledCopy, typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
+__forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor<Engine0, Layout0> const &S,
+                            Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
+                            Tensor<Engine3, Layout3> const &predicate_K, const int max_MN=0) {
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
+    // There's no case where !Clear_OOB_K && Clear_OOB_MN
+    static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
+            #pragma unroll
+            for (int k = 0; k < size<2>(S); ++k) {
+                if (Is_even_K || predicate_K(k)) {
+                    cute::copy(tiled_copy, S(_, m, k), D(_, m, k));
+                } else if (Clear_OOB_K) {
+                    cute::clear(D(_, m, k));
+                }
+            }
+        } else if (Clear_OOB_MN) {
+            cute::clear(D(_, m, _));
+        }
+    }
+    // TD [2023-04-13]: Strange that the code below can cause a race condition.
+    // I think it's because the copies are under an if statement.
+    // if (Is_even_K) {
+    //     #pragma unroll
+    //     for (int m = 0; m < size<1>(S); ++m) {
+    //         if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
+    //             copy(tiled_copy, S(_, m, _), D(_, m, _));
+    //         } else if (Clear_OOB_MN) {
+    //             clear(D(_, m, _));
+    //         }
+    //     }
+    // } else {  // It's slightly faster in this case if iterate over K first
+    //     #pragma unroll
+    //     for (int k = 0; k < size<2>(S); ++k) {
+    //         if (predicate_K(k)) {
+    //             #pragma unroll
+    //             for (int m = 0; m < size<1>(S); ++m) {
+    //                 if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
+    //                     copy(tiled_copy, S(_, m, k), D(_, m, k));
+    //                 } else if (Clear_OOB_MN) {
+    //                     clear(D(_, m, k));
+    //                 }
+    //             }
+    //         } else if (Clear_OOB_K) {  // There's no case where !Clear_OOB_K && Clear_OOB_MN
+    //             if (Clear_OOB_MN || Is_even_MN) {
+    //                 clear(D(_, _, k));
+    //             } else {
+    //                 #pragma unroll
+    //                 for (int m = 0; m < size<1>(S); ++m) {
+    //                     if (!(Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN)) {
+    //                         clear(D(_, m, k));
+    //                     }
+    //                 }
+    //             }
+    //         }
+    //     }
+    // }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool Is_even_K=true,
+          typename Engine0, typename Layout0, typename Engine1, typename Layout1,
+          typename Engine2, typename Layout2, typename Engine3, typename Layout3>
+__forceinline__ __device__ void copy_w_min_idx(Tensor<Engine0, Layout0> const &S,
+                                      Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
+                                      Tensor<Engine3, Layout3> const &predicate_K,
+                                      const int max_MN=0, const int min_MN=0) {
+    CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
+    CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D));                     // MMA
+    CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D));                     // MMA_M
+    CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D));                     // MMA_K
+    // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("blockIdx.y = %d, max_MN = %d, min_MN = %d\n", blockIdx.y, max_MN, min_MN); }
+    #pragma unroll
+    for (int m = 0; m < size<1>(S); ++m) {
+        // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("blockIdx.y = %d, m = %d\n", blockIdx.y, get<0>(identity_MN(0, m, 0))); }
+        if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) {
+            // if (threadIdx.x == 0 && blockIdx.z == 0) { printf("Inner loop, blockIdx.y = %d, m = %d\n", blockIdx.y, get<0>(identity_MN(0, m, 0))); }
+            #pragma unroll
+            for (int k = 0; k < size<2>(S); ++k) {
+                if (Is_even_K || predicate_K(k)) {
+                    cute::copy(S(_, m, k), D(_, m, k));
+                }
+            }
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace flash

+ 19 - 0
kernels/torch_bindings.cpp

@@ -3,6 +3,7 @@
 #include "ops.h"
 #include "core/registration.h"
 #include "quantization/quant_ops.h"
+#include "flash_attn/flash_api.h"
 
 #include <torch/library.h>
 
@@ -447,6 +448,24 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor? final_states_out_,"
       "bool silu_activation) -> Tensor");
   ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
+
+  ops.def("fwd(Tensor! q, Tensor k, Tensor v, Tensor!? out, Tensor? alibi_slopes, "
+          "float p_dropout, float softmax_scale, bool is_causal, int window_size_left, int window_size_right, "
+          "float softcap, bool return_softmax, Generator? gen) -> Tensor[]");
+  ops.impl("fwd", torch::kCUDA, &mha_fwd);
+
+  ops.def("varlen_fwd(Tensor! q, Tensor k, Tensor v, Tensor!? out, Tensor cu_seqlens_q, "
+          "Tensor cu_seqlens_k, Tensor? seqused_k, Tensor? block_table, Tensor? alibi_slopes, "
+          "int max_seqlen_q, int max_seqlen_k, float p_dropout, float softmax_scale, bool zero_tensors, "
+          "bool is_causal, int window_size_left, int window_size_right, float softcap, bool return_softmax, "
+          "Generator? gen) -> Tensor[]");
+  ops.impl("varlen_fwd", torch::kCUDA, &mha_varlen_fwd);
+
+  ops.def("fwd_kvcache(Tensor! q, Tensor kcache, Tensor vcache, Tensor? k, Tensor? v, Tensor? seqlens_k, "
+          "Tensor? rotary_cos, Tensor? rotary_sin, Tensor? cache_batch_idx, Tensor? block_table, Tensor? alibi_slopes, "
+          "Tensor!? out, float softmax_scale, bool is_causal, int window_size_left, int window_size_right, "
+          "float softcap, bool is_rotary_interleaved, int num_splits) -> Tensor[]");
+  ops.impl("fwd_kvcache", torch::kCUDA, &mha_fwd_kvcache);
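+
+  // These schemas are registered under TORCH_EXTENSION_NAME, so on the Python side the ops are
+  // reachable as torch.ops.<extension namespace>.{fwd, varlen_fwd, fwd_kvcache}.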
 #endif
 }
 

+ 0 - 1
requirements-cuda.txt

@@ -7,7 +7,6 @@ torch == 2.4.0; platform_system == 'Linux'
 torchvision == 0.19; platform_system == 'Linux'  # for phi3v
 xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
 triton >= 2.2.1; platform_system == 'Linux'
-aphrodite-flash-attn == 2.6.1.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
 
 # Windows dependencies
 winloop; platform_system == 'Windows'