
Add var-seq-len to FA3 fp16 / bf16 fwd (#1072)

* fwd var-seq-len

* fixes

* benchmark

* fixes

---------

Co-authored-by: Tri Dao <tridao@users.noreply.github.com>
Ying Zhang, 8 months ago
commit dfe1a59e4b
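
For orientation, here is a minimal sketch (not part of the diff) of how the new varlen forward entry point is exercised, adapted from hopper/benchmark_attn.py below; the equal-length batch and tensor sizes are purely illustrative.

# Not part of the commit: sketch of the new FA3 varlen forward path,
# mirroring the packing done in hopper/benchmark_attn.py below.
import torch
from flash_attn_interface import flash_attn_varlen_func  # hopper interface, as imported in the benchmark

batch_size, seqlen, nheads, headdim = 2, 8192, 16, 128
q = torch.randn(batch_size, seqlen, nheads, headdim, device='cuda', dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Pack (batch, seqlen, nheads, headdim) into (total_q, nheads, headdim) and describe
# sequence boundaries with cumulative lengths, as the varlen kernel expects.
q_var = q.reshape(-1, nheads, headdim)
k_var = k.reshape(-1, nheads, headdim)
v_var = v.reshape(-1, nheads, headdim)
lens = torch.full([batch_size], seqlen, dtype=torch.int32)
cu_seqlens = torch.cat([torch.tensor([0], dtype=torch.int32),
                        torch.cumsum(lens, dim=0, dtype=torch.int32)]).cuda()

out, softmax_lse = flash_attn_varlen_func(
    q_var, k_var, v_var, cu_seqlens, cu_seqlens, seqlen, seqlen, causal=False
)
# out: (batch_size * seqlen, nheads, headdim); softmax_lse: (nheads, batch_size * seqlen)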

+ 273 - 0
hopper/benchmark_attn.py

@@ -0,0 +1,273 @@
+from functools import partial
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import time
+
+try:
+    import cudnn
+except ImportError:
+    cudnn = None
+
+
+from einops import rearrange, repeat
+
+# from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler
+from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler
+from flash_attn.flash_attn_interface import flash_attn_func
+from flash_attn_interface import flash_attn_func as flash_attn_func_v3, flash_attn_varlen_func as flash_attn_varlen_func_v3
+
+# Need to install triton nightly:
+# pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+
+try:
+    from triton_fused_attention import attention as triton_attention
+except ImportError:
+    triton_attention = None
+
+def flops(batch, nheads, seqlen_q, seqlen_k, headdim, causal=False, mode='fwd'):
+    assert mode in ["fwd", "bwd", "fwd_bwd"]
+    f = 4 * batch * seqlen_q * seqlen_k * nheads * headdim // (2 if causal else 1)
+    return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
+
+
+def convert_to_cudnn_type(torch_type):
+    if torch_type == torch.float16:
+        return cudnn.data_type.HALF
+    elif torch_type == torch.bfloat16:
+        return cudnn.data_type.BFLOAT16
+    elif torch_type == torch.float32:
+        return cudnn.data_type.FLOAT
+    elif torch_type == torch.int32:
+        return cudnn.data_type.INT32
+    elif torch_type == torch.int64:
+        return cudnn.data_type.INT64
+    else:
+        raise ValueError("Unsupported tensor data type.")
+
+
+def cudnn_sdpa_setup(q, k, v, grad, causal=False):
+    b, nheads, seqlen_q, headdim = q.shape
+    _, _, seqlen_k, _ = k.shape
+    assert v.shape == (b, nheads, seqlen_k, headdim)
+    assert cudnn is not None, 'CUDNN is not available'
+    q_gpu, k_gpu, v_gpu = q, k, v
+    o_gpu = torch.empty_like(q_gpu)
+    stats_gpu = torch.empty(b, nheads, seqlen_q, 1, dtype=torch.float32, device=q.device)
+    graph_forward = cudnn.pygraph(
+        io_data_type=convert_to_cudnn_type(q.dtype),
+        intermediate_data_type=cudnn.data_type.FLOAT,
+        compute_data_type=cudnn.data_type.FLOAT,
+    )
+    q_forward = graph_forward.tensor_like(q_gpu.detach())
+    k_forward = graph_forward.tensor_like(k_gpu.detach())
+    v_forward = graph_forward.tensor_like(v_gpu.detach())
+
+    o_forward, stats_forward = graph_forward.sdpa(
+        name="sdpa",
+        q=q_forward,
+        k=k_forward,
+        v=v_forward,
+        is_inference=False,
+        attn_scale=1.0 / math.sqrt(headdim),
+        use_causal_mask=causal,
+    )
+
+    o_forward.set_output(True).set_dim(o_gpu.shape).set_stride(o_gpu.stride())
+    stats_forward.set_output(True).set_data_type(cudnn.data_type.FLOAT)
+
+    graph_forward.validate()
+    graph_forward.build_operation_graph()
+    graph_forward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])
+    graph_forward.check_support()
+    graph_forward.build_plans()
+
+    variant_pack_forward = {
+        q_forward: q_gpu,
+        k_forward: k_gpu,
+        v_forward: v_gpu,
+        o_forward: o_gpu,
+        stats_forward: stats_gpu,
+    }
+
+    dQ_gpu = torch.empty_like(q_gpu)
+    dK_gpu = torch.empty_like(k_gpu)
+    dV_gpu = torch.empty_like(v_gpu)
+    dO_gpu = grad
+
+    graph_backward = cudnn.pygraph(
+        io_data_type=convert_to_cudnn_type(q.dtype),
+        intermediate_data_type=cudnn.data_type.FLOAT,
+        compute_data_type=cudnn.data_type.FLOAT,
+    )
+    
+    q_backward = graph_backward.tensor_like(q_gpu.detach())
+    k_backward = graph_backward.tensor_like(k_gpu.detach())
+    v_backward = graph_backward.tensor_like(v_gpu.detach())
+    o_backward = graph_backward.tensor_like(o_gpu.detach())
+    dO_backward = graph_backward.tensor_like(dO_gpu.detach())
+    stats_backward = graph_backward.tensor_like(stats_gpu.detach())
+    
+    dQ_backward, dK_backward, dV_backward = graph_backward.sdpa_backward(
+        name="sdpa_backward",
+        q=q_backward,
+        k=k_backward,
+        v=v_backward,
+        o=o_backward,
+        dO=dO_backward,
+        stats=stats_backward,
+        attn_scale=1.0 / math.sqrt(headdim),
+        use_causal_mask=causal,
+    )
+    
+    dQ_backward.set_output(True).set_dim(dQ_gpu.size()).set_stride(dQ_gpu.stride())
+    dK_backward.set_output(True).set_dim(dK_gpu.size()).set_stride(dK_gpu.stride())
+    dV_backward.set_output(True).set_dim(dV_gpu.size()).set_stride(dV_gpu.stride())
+    
+    graph_backward.validate()
+    graph_backward.build_operation_graph()
+    graph_backward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])
+    graph_backward.check_support()
+    graph_backward.build_plans()
+
+    variant_pack_backward = {
+        q_backward: q_gpu,
+        k_backward: k_gpu,
+        v_backward: v_gpu,
+        o_backward: o_gpu,
+        dO_backward: dO_gpu,
+        stats_backward: stats_gpu,
+        dQ_backward: dQ_gpu,
+        dK_backward: dK_gpu,
+        dV_backward: dV_gpu,
+    }
+
+    workspace = torch.empty(
+        max(graph_forward.get_workspace_size(), graph_backward.get_workspace_size()), 
+        device="cuda", dtype=torch.uint8
+    )
+
+    def run_fwd(*args, **kwargs):
+        graph_forward.execute(variant_pack_forward, workspace)
+        return o_gpu, stats_gpu
+
+    def run_bwd(*args, **kwargs):
+        graph_backward.execute(variant_pack_backward, workspace)
+        return dQ_gpu, dK_gpu, dV_gpu
+
+    return run_fwd, run_bwd
+
+
+torch.manual_seed(0)
+repeats = 100
+dropout_p = 0.0
+causal = False
+dtype = torch.float16
+device = 'cuda'
+verbose = False
+batch_size = 2
+# seqlen = 2048
+seqlen = 8192
+# seqlen = 4096
+# seqlen = 2047
+dim = 2048
+# headdim = 128
+# headdim = 64
+headdim = 256
+
+# for mode in ['fwd', 'bwd']:
+for mode in ['fwd']:
+    for headdim in [64, 128, 256]:
+    # for headdim in [128]:
+        for seqlen in [1024, 2048, 4096, 8192, 16384, 32768]:
+        # for seqlen in [8192]:
+            nheads = dim // headdim
+            # nheads = 24
+            # headdim = 64
+            # batch_size = 64
+            # seqlen = 512
+            # nheads = 8
+            # headdim = 128
+            nheads_kv = nheads
+    
+            qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
+                            requires_grad=True)
+            q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
+            k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
+            v = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
+            q_t = q.transpose(1, 2).contiguous().detach().requires_grad_()
+            k_t = k.transpose(1, 2).contiguous().detach().requires_grad_()
+            v_t = v.transpose(1, 2).contiguous().detach().requires_grad_()
+            grad = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype)
+            grad_t = grad.transpose(1, 2).contiguous()
+    
+            bench_fn = benchmark_forward if mode == 'fwd' else partial(benchmark_backward, grad=grad)
+
+            for causal in [False, True]:
+            # for causal in [True]:
+                print(f"\n### {headdim = }, {seqlen = }, {causal = } ###")
+                if headdim <= 128 and cudnn is not None:
+                    cudnn_sdpa_fwd, cudnn_sdpa_bwd = cudnn_sdpa_setup(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), grad.transpose(1, 2), causal=causal)
+                f = flops(batch_size, nheads, seqlen, seqlen, headdim, causal=causal, mode=mode)
+                _, m0 = bench_fn(flash_attn_func, q, k, v, dropout_p, causal=causal, repeats=repeats, verbose=verbose, desc='Fav2')
+                if mode == 'bwd':
+                    ref_dv, v.grad = v.grad.clone(), None
+                    ref_dk, k.grad = k.grad.clone(), None
+                    ref_dq, q.grad = q.grad.clone(), None
+                # pytorch_profiler(flash_attn_func, q, k, v, dropout_p, causal=causal, backward=False)
+                if headdim <= 128:
+                    if triton_attention is not None:
+                        if mode == 'fwd':
+                            time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark
+                            _, m3 = benchmark_forward(triton_attention, q_t, k_t, v_t, causal, 1 / math.sqrt(headdim), repeats=repeats, verbose=verbose, desc='Triton')
+                        # TODO: fix Triton numeric errors.
+                        # if mode == 'bwd':
+                        #     dv, v_t.grad = v_t.grad.clone(), None
+                        #     dk, k_t.grad = k_t.grad.clone(), None
+                        #     dq, q_t.grad = q_t.grad.clone(), None
+                        #     torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05)
+                        #     torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05)
+                        #     torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05)
+                    if cudnn is not None:
+                        time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark
+                        if mode == 'fwd':
+                            _, m2 = benchmark_forward(cudnn_sdpa_fwd, repeats=repeats, verbose=verbose, desc='CuDNN')
+                        else:
+                            cudnn_sdpa_fwd()
+                            _, m2 = benchmark_forward(cudnn_sdpa_bwd, repeats=repeats, verbose=verbose, desc='CuDNN')
+                            dq, dk, dv = cudnn_sdpa_bwd()
+                            torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05)
+                            torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05)
+                            torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05)
+                        # pytorch_profiler(cudnn_sdpa, backward=False)
+                if headdim == 128 or mode == 'fwd':
+                    time.sleep(1)
+                    _, m1 = bench_fn(flash_attn_func_v3, q, k, v, causal=causal, repeats=repeats, verbose=verbose, desc='Fav3')
+                    q_var = q.reshape(-1, q.shape[-2], q.shape[-1])
+                    k_var = k.reshape(-1, k.shape[-2], k.shape[-1])
+                    v_var = v.reshape(-1, v.shape[-2], v.shape[-1])
+                    lens = torch.full([q.shape[0]], seqlen, dtype=torch.int32)
+                    cu_seqlens = torch.cat([torch.tensor([0], dtype=torch.int32), torch.cumsum(lens, dim=0, dtype=torch.int32)]).cuda()
+                    time.sleep(1)
+                    _, m1_var = bench_fn(flash_attn_varlen_func_v3, q_var, k_var, v_var, cu_seqlens, cu_seqlens, seqlen, seqlen, causal=causal, repeats=repeats, verbose=verbose, desc='Fav3 var len')
+                    if mode == 'bwd':
+                        dv, v.grad = v.grad.clone(), None
+                        dk, k.grad = k.grad.clone(), None
+                        dq, q.grad = q.grad.clone(), None
+                        torch.testing.assert_close(ref_dv, dv, atol=0.05, rtol=0.05)
+                        torch.testing.assert_close(ref_dk, dk, atol=0.05, rtol=0.05)
+                        torch.testing.assert_close(ref_dq, dq, atol=0.05, rtol=0.05)
+ 
+                # pytorch_profiler(flash_attn_func_v3, q, k, v, causal=causal, backward=False)
+                print(f'Fav2: {m0.mean * 1e3:.3f}ms, {(f / m0.mean * 1e-12):.1f} TFLOPS')
+                if headdim <= 128:
+                    if triton_attention is not None:
+                        print(f'Triton: {m3.mean * 1e3:.3f}ms, {(f / m3.mean * 1e-12):.1f} TFLOPS')
+                    if cudnn is not None:
+                        print(f'CuDNN: {m2.mean * 1e3:.3f}ms, {(f / m2.mean * 1e-12):.1f} TFLOPS')
+                if headdim == 128 or mode == 'fwd':
+                    print(f'Fav3: {m1.mean * 1e3:.3f}ms, {(f / m1.mean * 1e-12):.1f} TFLOPS')
+                    print(f'Fav3 varlen: {m1_var.mean * 1e3:.3f}ms, {(f / m1_var.mean * 1e-12):.1f} TFLOPS')
+    
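
As a sanity check on the TFLOPS numbers the script prints, here is the FLOP count flops() yields for one benchmarked shape (non-causal forward, batch_size = 2, dim = 2048 so nheads = 16 at headdim = 128, seqlen = 8192); the 2 ms latency is only an illustrative figure, not a measurement from this commit.

# Worked example for the flops() helper above (non-causal forward pass).
batch, nheads, seqlen, headdim = 2, 16, 8192, 128    # dim = 2048 -> nheads = 2048 // 128
f = 4 * batch * seqlen * seqlen * nheads * headdim   # ~= 1.10e12 FLOPs
print(f'{f / 2e-3 * 1e-12:.0f} TFLOPS at 2 ms')      # a 2 ms forward would report ~550 TFLOPS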

+ 72 - 68
hopper/epilogue_fwd_sm90_tma.hpp

@@ -17,20 +17,15 @@ namespace flash {
 using namespace cute;
 
 // template <int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename Element_>
-template <typename Ktraits>
+template <typename Ktraits, typename Seqlen_traits>
 struct CollectiveEpilogueFwd {
 
     using Element = typename Ktraits::Element;
     static constexpr int kBlockM = Ktraits::kBlockM;
     static constexpr int kBlockN = Ktraits::kBlockN;
     static constexpr int kHeadDim = Ktraits::kHeadDim;
-    // using Element = Element_;
-    // static constexpr int kBlockM = kBlockM_;
-    // static constexpr int kBlockN = kBlockN_;
-    // static constexpr int kHeadDim = kHeadDim_;
     using TileShape_MNK = Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
 
-    // static constexpr int kNWarps = kNWarps_;
     static constexpr int kNWarps = Ktraits::kNWarps;
     static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp;
     static constexpr bool Is_WS = kNWarps >= 12;
@@ -38,20 +33,6 @@ struct CollectiveEpilogueFwd {
     static constexpr int NumCopyThreads = !Is_WS ? 0 : cutlass::NumThreadsPerWarpGroup;
     static constexpr int NumMmaThreads = kNThreads - NumCopyThreads;
 
-    using GmemTiledCopyOTMA = cute::SM90_TMA_STORE;
-
-    // These are for storing the output tensor without TMA (e.g., for setting output to zero)
-    static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element);
-    static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad");
-    static constexpr int kGmemThreadsPerRow = kHeadDim / kGmemElemsPerLoad;
-    static_assert(NumMmaThreads % kGmemThreadsPerRow == 0, "NumMmaThreads must be a multiple of kGmemThreadsPerRow");
-    using GmemLayoutAtom = Layout<Shape <Int<NumMmaThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
-                                  Stride<Int<kGmemThreadsPerRow>, _1>>;
-    using GmemTiledCopyO = decltype(
-        make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
-                        GmemLayoutAtom{},
-                        Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{}));  // Val layout, 8 or 16 vals per store
-
     using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
         decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
     using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{})));
@@ -59,52 +40,72 @@ struct CollectiveEpilogueFwd {
     using SmemCopyAtomO = Copy_Atom<cute::SM90_U32x4_STSM_N, Element>;
     using SharedStorage = cute::array_aligned<Element, cute::cosize_v<SmemLayoutO>>;
 
-    using ShapeO = cute::Shape<int32_t, int32_t, int32_t, int32_t>;  // (seqlen_q, d, head, batch)
-    using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t>;
-    using StrideLSE = cute::Stride<_1, int64_t, int64_t>;            // (seqlen_q, head, batch)
-
+    using GmemTiledCopyOTMA = cute::SM90_TMA_STORE;
     using TMA_O = decltype(make_tma_copy(
         GmemTiledCopyOTMA{},
-        make_tensor(make_gmem_ptr(static_cast<Element*>(nullptr)), repeat_like(StrideO{}, int32_t(0)), StrideO{}),
+        make_tensor(
+            make_gmem_ptr(static_cast<Element*>(nullptr)), 
+            typename Seqlen_traits::ShapeT{}, 
+            typename Seqlen_traits::StrideT{}
+        ),
         SmemLayoutO{},
         select<0, 2>(TileShape_MNK{}),
         _1{}));  // no mcast for O
 
+    // These are for storing the output tensor without TMA (e.g., for setting the output to zero and for the var-seq-len path)
+    static constexpr int kNumVecElem = ceil_div(128, sizeof_bits_v<Element>);
+    static_assert(kHeadDim % kNumVecElem == 0);
+    static constexpr int kNumThreadsPerRow = kHeadDim / kNumVecElem;
+    static_assert(NumMmaThreads % kNumThreadsPerRow == 0);
+    static constexpr int kNumRows = NumMmaThreads / kNumThreadsPerRow;
+    using TiledCopyOAtom = cute::Copy_Atom<cute::UniversalCopy<cutlass::uint128_t>, Element>;
+    using TiledCopyOThrLayout = decltype(cute::make_layout(
+        cute::make_shape(Int<kNumRows>{}, Int<kNumThreadsPerRow>{}),
+        LayoutRight{}));
+    using TiledCopyOValLayout = decltype(cute::make_layout(
+        cute::make_shape(_1{}, Int<kNumVecElem>{}),
+        LayoutRight{}));
+    using TiledCopyO = decltype(make_tiled_copy(
+        TiledCopyOAtom{},
+        TiledCopyOThrLayout{}, // Thr layout
+        TiledCopyOValLayout{} // Val layout
+    ));
+
     // Host side kernel arguments
     struct Arguments {
         Element* ptr_O;
-        ShapeO const shape_O;
-        StrideO const stride_O;
+        typename Seqlen_traits::LayoutT const layout_O;
         float* ptr_LSE;
-        StrideLSE const stride_LSE;
+        typename Seqlen_traits::LayoutLseT const layout_LSE;
     };
 
     // Device side kernel params
     struct Params {
         Element* ptr_O;
-        ShapeO const shape_O;
-        StrideO const stride_O;
+        typename Seqlen_traits::LayoutT const layout_O;
         float* ptr_LSE;
-        StrideLSE const stride_LSE;
+        typename Seqlen_traits::LayoutLseT const layout_LSE;
         TMA_O tma_store_O;
     };
 
     static Params
     to_underlying_arguments(Arguments const& args) {
-        Tensor mO = make_tensor(make_gmem_ptr(args.ptr_O), args.shape_O, args.stride_O);
+        Tensor mO = make_tensor(make_gmem_ptr(args.ptr_O), args.layout_O);
         TMA_O tma_store_O = make_tma_copy(
             GmemTiledCopyOTMA{},
             mO,
             SmemLayoutO{},
             select<0, 2>(TileShape_MNK{}),
             _1{}); // no mcast for O
-        return {args.ptr_O, args.shape_O, args.stride_O, args.ptr_LSE, args.stride_LSE, tma_store_O};
+        return {args.ptr_O, args.layout_O, args.ptr_LSE, args.layout_LSE, tma_store_O};
     }
 
     /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
     CUTLASS_DEVICE
     static void prefetch_tma_descriptors(Params const& epilogue_params) {
-        cute::prefetch_tma_descriptor(epilogue_params.tma_store_O.get_tma_descriptor());
+        if constexpr (!Seqlen_traits::kUseVarSeqLen) {
+            cute::prefetch_tma_descriptor(epilogue_params.tma_store_O.get_tma_descriptor());
+        }
     }
 
     template <typename SharedStorage, typename FrgTensorO, typename FrgTensorLSE, typename TiledMma>
@@ -115,7 +116,8 @@ struct CollectiveEpilogueFwd {
           SharedStorage& shared_storage,
           TiledMma tiled_mma,
           int thread_idx,
-          cute::tuple<int32_t, int32_t, int32_t> const& block_coord
+          cute::tuple<int32_t, int32_t, int32_t> const& block_coord,
+          const Seqlen_traits& seqlen_traits_q
           ) {
 
         auto [m_block, bidh, bidb] = block_coord;
@@ -134,16 +136,9 @@ struct CollectiveEpilogueFwd {
         cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp,
                                             cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
 
-        Tensor mO = epilogue_params.tma_store_O.get_tma_tensor(epilogue_params.shape_O);
-        Tensor gO = local_tile(mO(_, _, bidh, bidb), select<0, 2>(TileShape_MNK{}), make_coord(m_block, _0{}));  // (M, K)
-        auto block_tma_O = epilogue_params.tma_store_O.get_slice(_0{});
-        Tensor tOgO = block_tma_O.partition_D(gO);  // (TMA, TMA_M, TMA_K)
-        Tensor tOsO = block_tma_O.partition_S(sO); // (TMA, TMA_M, TMA_K)
-
-        auto shape_LSE = select<0, 2, 3>(epilogue_params.shape_O);
-        Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), shape_LSE, epilogue_params.stride_LSE);
-        Tensor gLSE = local_tile(mLSE(_, bidh, bidb), Shape<Int<kBlockM>>{}, make_coord(m_block));
-
+        Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE);
+        Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor(
+            mLSE, Shape<Int<kBlockM>>{}, bidh, bidb)(_, m_block);
         Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{}));
         auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
         Tensor taccOcO = thread_mma.partition_C(caccO);                           // (MMA,MMA_M,MMA_K)
@@ -156,19 +151,23 @@ struct CollectiveEpilogueFwd {
             #pragma unroll
             for (int mi = 0; mi < size(lse); ++mi) {
                 const int row = get<0>(taccOcO_row(mi));
-                if (row < get<0>(shape_LSE) - m_block * kBlockM) { gLSE(row) = lse(mi); }
+                if (row < seqlen_traits_q.actual_seq_len - m_block * kBlockM) { gLSE(row) = lse(mi); }
             }
         }
 
-        if (cutlass::canonical_warp_idx_sync() == kNWarps - 1) {
-            cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp,
-                                              cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-            int const lane_predicate = cute::elect_one_sync();
-            if (lane_predicate) {
-                cute::copy(epilogue_params.tma_store_O, tOsO, tOgO);
-                tma_store_arrive();
-            }
+        int write_warp_idx = kNWarps - 1;
+        if (cutlass::canonical_warp_idx_sync() == write_warp_idx) {
+            cutlass::arch::NamedBarrier::sync(
+                NumMmaThreads + cutlass::NumThreadsPerWarp, 
+                cutlass::arch::ReservedNamedBarriers::EpilogueBarrier
+            );
         }
+        TiledCopyO gmem_tiled_copy_O;
+        flash::write_O<!Seqlen_traits::kUseVarSeqLen, NumCopyThreads>(
+            epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, 
+            epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, 
+            m_block, bidh, bidb, seqlen_traits_q, write_warp_idx
+        );
     }
 
     CUTLASS_DEVICE void
@@ -177,20 +176,25 @@ struct CollectiveEpilogueFwd {
     }
 
     // Write 0 to output and -inf to LSE
+    template<typename SharedStorage>
     CUTLASS_DEVICE void
     store_zero(
-         Params const& epilogue_params,
-         int thread_idx,
-         cute::tuple<int32_t, int32_t, int32_t> const& block_coord
-         ) {
+          Params const& epilogue_params,
+          SharedStorage& shared_storage,
+          int thread_idx,
+          cute::tuple<int32_t, int32_t, int32_t> const& block_coord,
+          const Seqlen_traits& seqlen_traits_q
+          ) {
         auto [m_block, bidh, bidb] = block_coord;
-        Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.shape_O, epilogue_params.stride_O);
-        Tensor gO = local_tile(mO(_, _, bidh, bidb), select<0, 2>(TileShape_MNK{}), make_coord(m_block, _0{}));  // (M, K)
-        auto shape_LSE = select<0, 2, 3>(epilogue_params.shape_O);
-        Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), shape_LSE, epilogue_params.stride_LSE);
-        Tensor gLSE = local_tile(mLSE(_, bidh, bidb), Shape<Int<kBlockM>>{}, make_coord(m_block));
-
-        GmemTiledCopyO gmem_tiled_copy_O;
+        Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.layout_O);
+        Tensor gO = seqlen_traits_q.get_local_tile_tensor(
+            mO, select<0, 2>(TileShape_MNK{}), bidh, bidb
+        )(_, _, m_block);  // (M, K)
+        Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE);
+        Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor(
+            mLSE, Shape<Int<kBlockM>>{}, bidh, bidb)(_, m_block);
+
+        TiledCopyO gmem_tiled_copy_O;
         auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
         Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
         Tensor tOrO = make_fragment_like(tOgO);
@@ -201,13 +205,13 @@ struct CollectiveEpilogueFwd {
         Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
         Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
         #pragma unroll
-        for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(epilogue_params.shape_O); }
+        for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(epilogue_params.layout_O.shape()); }
         // Clear_OOB_K must be false since we don't want to write zeros to gmem
         flash::copy</*Is_even_MN=*/false, /*Is_even_K=*/false, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
-            gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, get<0>(epilogue_params.shape_O) - m_block * kBlockM
+            gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, get<0>(epilogue_params.layout_O.shape()) - m_block * kBlockM
         );
         static_assert(kBlockM <= NumMmaThreads);
-        if (thread_idx < get<0>(shape_LSE) - m_block * kBlockM) { gLSE(thread_idx) = INFINITY; }
+        if (thread_idx < get<0>(epilogue_params.layout_LSE.shape()) - m_block * kBlockM) { gLSE(thread_idx) = INFINITY; }
     }
 
 };
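
A quick worked example (not part of the diff) of the TiledCopyO constants defined above, assuming fp16 elements and the head-dim-128 kernel traits used in flash_fwd_launch_template.h below (kNWarps = 12, i.e. one 128-thread copy warp group and 256 MMA threads):

# Assumed configuration: fp16 element (16 bits), kHeadDim = 128, kNWarps = 12.
element_bits = 16
kHeadDim = 128
NumMmaThreads = 12 * 32 - 128                     # kNThreads minus the copy warp group = 256

kNumVecElem = 128 // element_bits                 # 8 elements per 128-bit (uint128_t) store
kNumThreadsPerRow = kHeadDim // kNumVecElem       # 16 threads cover one 128-element row of O
kNumRows = NumMmaThreads // kNumThreadsPerRow     # 16 rows of O written per copy iteration
print(kNumVecElem, kNumThreadsPerRow, kNumRows)   # 8 16 16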

+ 3 - 1
hopper/flash.h

@@ -57,7 +57,7 @@ struct Flash_fwd_params : public Qkv_params {
     void * __restrict__ softmax_lseaccum_ptr;
 
     // The dimensions.
-    int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim;
+    int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q, total_k;
 
     // The scaling factors for the kernel.
     float scale_softmax;
@@ -128,6 +128,8 @@ struct Flash_fwd_params : public Qkv_params {
     void * __restrict__ alibi_slopes_ptr;
     index_t alibi_slopes_batch_stride;
 
+    bool unpadded_lse; // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q].
+
     int * __restrict__ tile_count_semaphore;
 };
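
To make the unpadded_lse comment above concrete, a hypothetical batch of two sequences of lengths 3 and 5 with 4 heads gives the following LSE shapes (padded vs. varlen); the numbers are illustrative, not from the source.

# Hypothetical shapes illustrating the unpadded_lse flag above.
import torch
nheads = 4
seqlens = [3, 5]                                      # two sequences in the batch
b, max_seqlen_q, total_q = len(seqlens), max(seqlens), sum(seqlens)

lse_padded = torch.empty(b, nheads, max_seqlen_q)     # [b, nheads, seqlen_q]: (2, 4, 5), rows 3..4 of seq 0 unused
lse_unpadded = torch.empty(nheads, total_q)           # [nheads, total_q]: (4, 8), no padding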
 

+ 158 - 1
hopper/flash_api.cpp

@@ -43,7 +43,8 @@ void set_params_fprop(Flash_fwd_params &params,
                       float softmax_scale,
                       int window_size_left,
                       int window_size_right,
-                      bool seqlenq_ngroups_swapped=false) {
+                      bool seqlenq_ngroups_swapped=false,
+                      bool unpadded_lse=false) {
 
     // Reset the parameters
     params = {};
@@ -81,6 +82,11 @@ void set_params_fprop(Flash_fwd_params &params,
     params.cu_seqlens_k = static_cast<int *>(cu_seqlens_k_d);
     params.seqused_k = static_cast<int *>(seqused_k);
 
+    TORCH_CHECK(
+        bool(params.cu_seqlens_q) == bool(params.cu_seqlens_k),
+        "cu_seqlens_q and cu_seqlens_k must be both null or non-null"
+    );
+
     // P = softmax(QK^T)
     params.p_ptr = p_d;
 
@@ -139,6 +145,8 @@ void set_params_fprop(Flash_fwd_params &params,
     #ifdef FLASHATTENTION_DISABLE_UNEVEN_K
         TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32.");
     #endif
+
+    params.unpadded_lse = unpadded_lse;
 }
 
 void set_params_dgrad(Flash_bwd_params &params,
@@ -372,6 +380,154 @@ mha_fwd(at::Tensor &q,         // batch_size x seqlen_q x num_heads x head_size
     return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p};
 }
 
+std::vector<at::Tensor>
+mha_varlen_fwd(at::Tensor &q,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               const at::Tensor &k,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+               const at::Tensor &v,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+               c10::optional<at::Tensor> &out_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               const at::Tensor &cu_seqlens_q,  // b+1
+               const at::Tensor &cu_seqlens_k,  // b+1
+               c10::optional<at::Tensor> &seqused_k, // b. If given, only this many elements of each batch element's keys are used.
+               int max_seqlen_q,
+               const int max_seqlen_k,
+               const float softmax_scale,
+               bool is_causal) {
+
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
+    TORCH_CHECK(is_sm90, "FlashAttention only supports Hopper GPUs or newer.");
+
+    auto q_dtype = q.dtype();
+    TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+                "FlashAttention only support fp16 and bf16 data type");
+    TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+    TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+    TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
+    TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
+
+    CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
+    CHECK_DEVICE(cu_seqlens_q);
+    CHECK_DEVICE(cu_seqlens_k);
+
+    TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+    TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+    TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+    CHECK_CONTIGUOUS(cu_seqlens_q);
+    CHECK_CONTIGUOUS(cu_seqlens_k);
+
+    const auto sizes = q.sizes();
+
+    const int batch_size = cu_seqlens_q.numel() - 1;
+    int num_heads = sizes[1];
+    const int head_size_og = sizes[2];
+    const int num_heads_k = k.size(1);
+
+    int window_size_left = -1;
+    int window_size_right = -1;
+    if (is_causal) { window_size_right = 0; }
+
+    void *cu_seqlens_q_d = cu_seqlens_q.data_ptr();
+
+    const int total_q = q.sizes()[0];
+
+    TORCH_CHECK(batch_size > 0, "batch size must be positive");
+    TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
+    TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
+
+    if (window_size_left >= max_seqlen_k) { window_size_left = -1; }
+    if (window_size_right >= max_seqlen_k) { window_size_right = -1; }
+
+    CHECK_SHAPE(q, total_q, num_heads, head_size_og);
+    const int total_k = k.size(0);
+    CHECK_SHAPE(k, total_k, num_heads_k, head_size_og);
+    CHECK_SHAPE(v, total_k, num_heads_k, head_size_og);
+
+    CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
+    CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
+    if (seqused_k.has_value()){
+        auto seqused_k_ = seqused_k.value();
+        TORCH_CHECK(seqused_k_.dtype() == torch::kInt32, "seqused_k must have dtype int32");
+        TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device");
+        TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous");
+        CHECK_SHAPE(seqused_k_, batch_size);
+    }
+
+    at::Tensor q_padded, k_padded, v_padded;
+    if (head_size_og % 8 != 0) {
+        q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+        k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+        v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+    } else {
+        q_padded = q;
+        k_padded = k;
+        v_padded = v;
+    }
+
+    at::Tensor out;
+    if (out_.has_value()) {
+        out = out_.value();
+        TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
+        CHECK_DEVICE(out);
+        TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
+        CHECK_SHAPE(out, sizes[0], sizes[1], head_size_og);
+        if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); }
+    } else {
+        out = torch::empty_like(q_padded);
+    }
+
+    auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+    const int head_size = round_multiple(head_size_og, 8);
+    const int head_size_rounded = round_multiple(head_size, 32);
+    const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
+    const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
+
+    // Otherwise the kernel will be launched from cuda:0 device
+    // Cast to char to avoid compiler warning about narrowing
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+
+    auto opts = q.options();
+    auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
+
+    Flash_fwd_params params;
+    set_params_fprop(params,
+                     batch_size,
+                     max_seqlen_q, max_seqlen_k,
+                     seqlen_q_rounded, seqlen_k_rounded,
+                     num_heads, num_heads_k,
+                     head_size, head_size_rounded,
+                     q_padded, k_padded, v_padded, out,
+                     cu_seqlens_q_d,
+                     cu_seqlens_k.data_ptr(),
+                     seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr,
+                     /*p_d=*/nullptr,
+                     softmax_lse.data_ptr(),
+                     /*p_dropout=*/0.f,
+                     softmax_scale,
+                     window_size_left,
+                     window_size_right,
+                     /*seqlenq_ngroups_swapped=*/false,
+                     /*unpadded_lse=*/true);
+    params.total_q = total_q;
+    params.total_k = total_k;
+
+    if (max_seqlen_k > 0) {
+        auto stream = at::cuda::getCurrentCUDAStream().stream();
+        run_mha_fwd(params, stream);
+    } else {
+        // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
+        out.zero_();
+        softmax_lse.fill_(std::numeric_limits<float>::infinity());
+    }
+
+    at::Tensor out_padded = out;
+    if (head_size_og % 8 != 0) {
+        out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
+        if (out_.has_value()) { out_.value().copy_(out); }
+    }
+
+    return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse};
+}
+
 void run_mha_bwd(Flash_bwd_params &params, cudaStream_t stream) {
     // FP16_SWITCH(!params.is_bf16, [&] {
     //     HEADDIM_SWITCH(params.d, [&] {
@@ -577,4 +733,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.doc() = "FlashAttention";
     m.def("fwd", &mha_fwd, "Forward pass");
     m.def("bwd", &mha_bwd, "Backward pass");
+    m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)");
 }
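
A hypothetical example (not from the source) of the cu_seqlens convention mha_varlen_fwd expects: sequence i occupies rows cu_seqlens[i]:cu_seqlens[i+1] of the packed q, and total_q is the last entry.

import torch
seqlens_q = torch.tensor([3, 5, 2], dtype=torch.int32)          # per-sequence lengths, b = 3
cu_seqlens_q = torch.cat([torch.tensor([0], dtype=torch.int32),
                          torch.cumsum(seqlens_q, dim=0, dtype=torch.int32)])
print(cu_seqlens_q)              # tensor([ 0,  3,  8, 10], dtype=torch.int32) -- shape (b + 1,)
total_q = int(cu_seqlens_q[-1])  # 10, must equal q.shape[0]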

+ 201 - 0
hopper/flash_attn_interface.py

@@ -57,6 +57,83 @@ def _flash_attn_backward(
     )
     return dq, dk, dv, softmax_d
 
+def _flash_attn_varlen_forward(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    softmax_scale,
+    causal,
+):
+    maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    out, q, k, v, out_padded, softmax_lse = flashattn_hopper_cuda.varlen_fwd(
+        q,
+        k,
+        v,
+        None,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        None,
+        max_seqlen_q,
+        max_seqlen_k,
+        softmax_scale,
+        causal,
+    )
+    # if out.isnan().any() or softmax_lse.isnan().any():
+    #     breakpoint()
+    return out, q, k, v, out_padded, softmax_lse
+
+
+def _flash_attn_varlen_backward(
+    dout,
+    q,
+    k,
+    v,
+    out,
+    softmax_lse,
+    dq,
+    dk,
+    dv,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    softmax_scale,
+    causal,
+):
+    maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x
+    # dq, dk, dv are allocated by us so they should already be contiguous
+    dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
+    (
+        dq,
+        dk,
+        dv,
+        softmax_d,
+    ) = _get_fa_module().varlen_bwd(
+        dout,
+        q,
+        k,
+        v,
+        out,
+        softmax_lse,
+        dq,
+        dk,
+        dv,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        softmax_scale,
+        causal,
+    )
+    # if dq.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any():
+    #     breakpoint()
+    return dq, dk, dv, softmax_d
+
 
 class FlashAttnFunc(torch.autograd.Function):
     @staticmethod
@@ -105,6 +182,71 @@ class FlashAttnFunc(torch.autograd.Function):
         return dq, dk, dv, None, None
 
 
+class FlashAttnVarlenFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        softmax_scale,
+        causal,
+    ):
+        if softmax_scale is None:
+            softmax_scale = q.shape[-1] ** (-0.5)
+        out, q, k, v, out_padded, softmax_lse = _flash_attn_varlen_forward(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            softmax_scale,
+            causal=causal,
+        )
+        ctx.save_for_backward(
+            q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k
+        )
+        ctx.max_seqlen_q = max_seqlen_q
+        ctx.max_seqlen_k = max_seqlen_k
+        ctx.softmax_scale = softmax_scale
+        ctx.causal = causal
+        return out, softmax_lse
+
+    @staticmethod
+    def backward(ctx, dout, *args):
+        # TODO: Uncomment these when var-seq-len is supported in bwd kernel.
+        # q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors
+        # dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
+        # _flash_attn_varlen_backward(
+        #     dout,
+        #     q,
+        #     k,
+        #     v,
+        #     out,
+        #     softmax_lse,
+        #     dq,
+        #     dk,
+        #     dv,
+        #     cu_seqlens_q,
+        #     cu_seqlens_k,
+        #     ctx.max_seqlen_q,
+        #     ctx.max_seqlen_k,
+        #     ctx.softmax_scale,
+        #     ctx.causal,
+        # )
+        # dq = dq[..., : dout.shape[-1]]  # We could have padded the head dimension
+        # dk = dk[..., : dout.shape[-1]]
+        # dv = dv[..., : dout.shape[-1]]
+        # return dq, dk, dv, None, None, None, None, None, None
+        return None, None, None, None, None, None, None, None, None  # one gradient per forward input
+
+
 def flash_attn_func(
     q,
     k,
@@ -167,3 +309,62 @@ def flash_attn_func(
         softmax_scale,
         causal,
     )
+
+
+def flash_attn_varlen_func(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    softmax_scale=None,
+    causal=False,
+):
+    """
+    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
+    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
+    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
+    0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.
+    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
+    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
+        1 1 1 1 0
+        1 1 1 1 1
+    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
+        0 0
+        0 0
+        0 0
+        1 0
+        1 1
+    If the row of the mask is all zero, the output will be zero.
+    Arguments:
+        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
+        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into q.
+        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into kv.
+        max_seqlen_q: int. Maximum query sequence length in the batch.
+        max_seqlen_k: int. Maximum key sequence length in the batch.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+    Return:
+        out: (total_q, nheads, headdim).
+        softmax_lse: (nheads, total_q). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
+    return FlashAttnVarlenFunc.apply(
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        softmax_scale,
+        causal,
+    )
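
A hedged usage sketch of flash_attn_varlen_func with ragged lengths and GQA (2 KV heads shared by 6 query heads), following the docstring above; all sizes are illustrative and not taken from the commit.

import torch
from flash_attn_interface import flash_attn_varlen_func

nheads, nheads_k, headdim = 6, 2, 128
seqlens = [100, 37]                                    # two sequences of different lengths
total = sum(seqlens)                                   # 137 tokens packed along dim 0
cu_seqlens = torch.tensor([0, 100, 137], dtype=torch.int32, device='cuda')

q = torch.randn(total, nheads, headdim, device='cuda', dtype=torch.bfloat16)
k = torch.randn(total, nheads_k, headdim, device='cuda', dtype=torch.bfloat16)
v = torch.randn_like(k)

out, lse = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens,
                                  max(seqlens), max(seqlens), causal=True)
print(out.shape, lse.shape)    # torch.Size([137, 6, 128]) torch.Size([6, 137])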

+ 27 - 12
hopper/flash_fwd_kernel.h

@@ -24,11 +24,12 @@ namespace flash {
 
 using namespace cute;
 
-template <typename Ktraits, bool Is_causal, typename TileScheduler>
+template <typename Ktraits, bool Is_causal, typename TileScheduler, typename Seqlen_traits>
 __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1)
-    compute_attn_ws(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd<Ktraits, Is_causal>::Params const mainloop_params,
-                    CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd<Ktraits>::Params const epilogue_params,
-                    CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params
+    compute_attn_ws(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd<Ktraits, Is_causal, Seqlen_traits>::Params const mainloop_params,
+                    CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd<Ktraits, Seqlen_traits>::Params const epilogue_params,
+                    CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params,
+                    Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k
                     ) {
 
     using Element = typename Ktraits::Element;
@@ -46,8 +47,8 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp,
     // static constexpr int kBlockN = Ktraits::kBlockN;
     // constexpr int kHeadDim = Ktraits::kHeadDim;
 
-    using CollectiveMainloop = CollectiveMainloopFwd<Ktraits, Is_causal>;
-    using CollectiveEpilogue = CollectiveEpilogueFwd<Ktraits>;
+    using CollectiveMainloop = CollectiveMainloopFwd<Ktraits, Is_causal, Seqlen_traits>;
+    using CollectiveEpilogue = CollectiveEpilogueFwd<Ktraits, Seqlen_traits>;
 
     using MainloopPipeline = typename Ktraits::MainloopPipeline;
     using PipelineParams = typename MainloopPipeline::Params;
@@ -115,14 +116,21 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp,
                 auto block_coord = work_tile_info.get_block_coord(scheduler_params);
                 auto [m_block, bidh, bidb] = block_coord;
 
-                int n_block_max = collective_mainloop.get_n_block_max(mainloop_params, m_block);
+                seqlen_traits_q.init(bidb);
+                seqlen_traits_k.init(bidb);
+                if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) {
+                    continue;
+                }
+                int n_block_max = collective_mainloop.get_n_block_max(
+                    mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k);
                 if (Is_causal && n_block_max <= 0) {
                     scheduler.prefetch_next_work(scheduler_params, work_tile_info);
                     scheduler.broadcast_next_work(work_tile_info);
                     continue;
                 }
                 collective_mainloop.load(mainloop_params, pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v,
-                                         shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx);
+                                         shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx,
+                                         seqlen_traits_q, seqlen_traits_k);
                 ++work_idx;
             }
             collective_mainloop.load_tail(pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v);
@@ -154,17 +162,24 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp,
             auto block_coord = work_tile_info.get_block_coord(scheduler_params);
             auto [m_block, bidh, bidb] = block_coord;
 
-            int n_block_max = collective_mainloop.get_n_block_max(mainloop_params, m_block);
+            seqlen_traits_q.init(bidb);
+            seqlen_traits_k.init(bidb);
+            if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) {
+                continue;
+            }
+            int n_block_max = collective_mainloop.get_n_block_max(
+                mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k);
             if (Is_causal && n_block_max <= 0) {  // We exit early and write 0 to gO and -inf to gLSE.
-                collective_epilogue.store_zero(epilogue_params, threadIdx.x - NumCopyThreads, block_coord);
+                collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q);
                 continue;
             }
 
             collective_mainloop.mma(mainloop_params, pipeline_k, pipeline_v, smem_pipe_read_k, smem_pipe_read_v,
-                                    tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads, work_idx, m_block, shared_storage);
+                                    tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads, work_idx, m_block, shared_storage,
+                                    seqlen_traits_q, seqlen_traits_k);
                                     // tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads + (work_idx >> 30), work_idx, shared_storage);
             collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1,
-                                      threadIdx.x - NumCopyThreads, block_coord);
+                                      threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q);
 
             ++work_idx;
         }

+ 61 - 24
hopper/flash_fwd_launch_template.h

@@ -14,41 +14,61 @@
 #include "tile_scheduler.hpp"
 #include "flash_fwd_kernel.h"
 #include "kernel_traits.h"
+#include "seq_len.h"
 #include "utils.h"
 
 
-template<typename Kernel_traits, bool Is_causal>
+template<typename Kernel_traits, bool Is_causal, typename Seqlen_traits>
 void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
     using Element = typename Kernel_traits::Element;
     using TileShape_MNK = typename Kernel_traits::TileShape_MNK;
     using ClusterShape = typename Kernel_traits::ClusterShape_MNK;
 
     // print(typename Kernel_traits::SmemLayoutVt{}); printf("\n"); print(typename Kernel_traits::SmemLayoutVt_tmp{});
-    using CollectiveMainloop = flash::CollectiveMainloopFwd<Kernel_traits, Is_causal>;
-    using CollectiveEpilogue = flash::CollectiveEpilogueFwd<Kernel_traits>;
-    using Scheduler = std::conditional_t<!Is_causal,
-        flash::StaticPersistentTileScheduler,
-        flash::DynamicPersistentTileScheduler<Kernel_traits::kNThreads - cutlass::NumThreadsPerWarpGroup>>;
-        // flash::SingleTileScheduler>;
+    using CollectiveMainloop = flash::CollectiveMainloopFwd<Kernel_traits, Is_causal, Seqlen_traits>;
+    using CollectiveEpilogue = flash::CollectiveEpilogueFwd<Kernel_traits, Seqlen_traits>;
+    using Scheduler = std::conditional_t<
+        Seqlen_traits::kUseVarSeqLen, 
+        flash::SingleTileScheduler,
+        std::conditional_t<!Is_causal,
+            flash::StaticPersistentTileScheduler,
+            flash::DynamicPersistentTileScheduler<Kernel_traits::kNThreads - cutlass::NumThreadsPerWarpGroup>
+    >>;
+    // using Scheduler = flash::SingleTileScheduler;
+    Seqlen_traits seqlen_traits_q(
+        params.total_q, params.seqlen_q, params.cu_seqlens_q);
+    Seqlen_traits seqlen_traits_k(
+        params.total_k, params.seqlen_k, params.cu_seqlens_k, params.seqused_k);
     typename CollectiveMainloop::Params mainloop_params =
         CollectiveMainloop::to_underlying_arguments({
             static_cast<Element const*>(params.q_ptr),
-            {params.seqlen_q, params.d, params.h, params.b},  // shape_Q
-            {params.q_row_stride, _1{}, params.q_head_stride, params.q_batch_stride},  // stride_Q
+            seqlen_traits_q.get_gmem_layout(
+                params.seqlen_q, params.d, params.h, params.b, 
+                params.q_row_stride, params.q_head_stride, params.q_batch_stride
+            ),  // layout_Q
             static_cast<Element const*>(params.k_ptr),
-            {params.seqlen_k, params.d, params.h_k, params.b},  // shape_K
-            {params.k_row_stride, _1{}, params.k_head_stride, params.k_batch_stride},  // stride_K
+            seqlen_traits_k.get_gmem_layout(
+                params.seqlen_k, params.d, params.h_k, params.b, 
+                params.k_row_stride, params.k_head_stride, params.k_batch_stride
+            ),  // layout_K
             static_cast<Element const*>(params.v_ptr),
-            {params.v_row_stride, _1{}, params.v_head_stride, params.v_batch_stride},  // stride_V
+            seqlen_traits_k.get_gmem_layout(
+                params.seqlen_k, params.d, params.h_k, params.b, 
+                params.v_row_stride, params.v_head_stride, params.v_batch_stride
+            ),  // layout_V
             params.scale_softmax_log2
         });
     typename CollectiveEpilogue::Params epilogue_params =
         CollectiveEpilogue::to_underlying_arguments({
             static_cast<Element*>(params.o_ptr),
-            {params.seqlen_q, params.d, params.h, params.b},  // shape_O
-            {params.o_row_stride, _1{}, params.o_head_stride, params.o_batch_stride},  // stride_O
+            seqlen_traits_q.get_gmem_layout(
+                params.seqlen_q, params.d, params.h, params.b,
+                params.o_row_stride, params.o_head_stride, params.o_batch_stride
+            ),  // layout_O
             static_cast<float*>(params.softmax_lse_ptr),
-            {_1{}, params.seqlen_q, params.h * params.seqlen_q},  // stride_LSE
+            seqlen_traits_q.get_lse_gmem_layout(
+                params.seqlen_q, params.h, params.b
+            )  // layout_LSE
         });
 
     int num_blocks_m = cutlass::ceil_div(params.seqlen_q, Kernel_traits::kBlockM);
@@ -58,7 +78,7 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
 
     // Get the ptr to kernel function.
     void *kernel;
-    kernel = (void *)flash::compute_attn_ws<Kernel_traits, Is_causal, Scheduler>;
+    kernel = (void *)flash::compute_attn_ws<Kernel_traits, Is_causal, Scheduler, Seqlen_traits>;
     int smem_size = sizeof(typename Kernel_traits::SharedStorage);
     // int smem_size_q = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_q));
     // int smem_size_k = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_k));
@@ -81,7 +101,9 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
     dim3 block_dims(ctaSize);
     dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{}));
     cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream};
-    cutlass::launch_kernel_on_cluster(launch_params, kernel, mainloop_params, epilogue_params, scheduler_params);
+    cutlass::launch_kernel_on_cluster(
+        launch_params, kernel, mainloop_params, epilogue_params, 
+        scheduler_params, seqlen_traits_q, seqlen_traits_k);
     CHECK_CUDA_KERNEL_LAUNCH();
 }
 
@@ -89,7 +111,12 @@ template<typename T>
 void run_mha_fwd_hdim64(Flash_fwd_params &params, cudaStream_t stream) {
     constexpr static int Headdim = 64;
     BOOL_SWITCH(params.is_causal, Is_causal, [&] {
-        run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 192, 128, 16, 2, false, 1, T>, Is_causal>(params, stream);
+        SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] {
+            run_flash_fwd<
+                Flash_fwd_kernel_traits<Headdim, 192, 128, 16, 2, false, 1, T>, 
+                Is_causal, Seqlen_traits
+            >(params, stream);
+        });
     });
 }
 
@@ -97,9 +124,14 @@ template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) {
     constexpr static int Headdim = 128;
     BOOL_SWITCH(params.is_causal, Is_causal, [&] {
-        // Only use Cluster if number of tiles along seqlen_q is even
-        BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0, UseCluster, [&] {
-            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, Is_causal ? 128 : 176, 12, 2, false, !Is_causal && UseCluster ? 2 : 1, T>, Is_causal>(params, stream);
+        SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] {
+            // Only use Cluster if number of tiles along seqlen_q is even and not Is_causal
+            BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] {
+                run_flash_fwd<
+                    Flash_fwd_kernel_traits<Headdim, 128, Is_causal ? 128 : 176, 12, 2, false, UseCluster ? 2 : 1, T>, 
+                    Is_causal, Seqlen_traits
+                >(params, stream);
+            });
         });
     });
 }
@@ -108,9 +140,14 @@ template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params &params, cudaStream_t stream) {
     constexpr static int Headdim = 256;
     BOOL_SWITCH(params.is_causal, Is_causal, [&] {
-        // Only use Cluster if number of tiles along seqlen_q is even
-        BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0, UseCluster, [&] {
-            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 80, 12, 2, false, !Is_causal && UseCluster ? 2 : 1, T>, Is_causal>(params, stream);
+        SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] {
+            // Only use Cluster if number of tiles along seqlen_q is even and not Is_causal
+            BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] {
+                run_flash_fwd<
+                    Flash_fwd_kernel_traits<Headdim, 128, 80, 12, 2, false, UseCluster ? 2 : 1, T>, 
+                    Is_causal, Seqlen_traits
+                >(params, stream);
+            });
         });
     });
 }
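
For readability, a Python restatement (an assumption-laden sketch, not code from the repo) of the scheduler choice made in run_flash_fwd above:

def pick_scheduler(use_var_seq_len: bool, is_causal: bool) -> str:
    # Mirrors the std::conditional_t chain in run_flash_fwd: varlen batches get the
    # single-tile scheduler; otherwise causal uses the dynamic persistent scheduler
    # and non-causal the static persistent one.
    if use_var_seq_len:
        return "flash::SingleTileScheduler"
    return ("flash::DynamicPersistentTileScheduler"
            if is_causal else "flash::StaticPersistentTileScheduler")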

+ 47 - 33
hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp

@@ -21,7 +21,7 @@ namespace flash {
 
 using namespace cute;
 
-template <typename Ktraits, bool Is_causal>
+template <typename Ktraits, bool Is_causal, typename Seqlen_traits>
 struct CollectiveMainloopFwd {
 
     using Element = typename Ktraits::Element;
@@ -64,19 +64,24 @@ struct CollectiveMainloopFwd {
     //     decltype(tile_to_shape(SmemLayoutAtomVTMA{},
     //                            make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<kStages>{})));
 
-    using ShapeQKV = cute::Shape<int32_t, int32_t, int32_t, int32_t>;  // (seqlen, d, head, batch)
-    using StrideQKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
-
     using TMA_Q = decltype(make_tma_copy(
         GmemTiledCopyQ{},
-        make_tensor(make_gmem_ptr(static_cast<Element const*>(nullptr)), repeat_like(StrideQKV{}, int32_t(0)), StrideQKV{}),
+        make_tensor(
+            make_gmem_ptr(static_cast<Element const*>(nullptr)), 
+            repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), 
+            typename Seqlen_traits::StrideT{}
+        ),
         SmemLayoutQ{},
         select<0, 2>(TileShape_MNK{}),
         _1{}));  // no mcast for Q
 
     using TMA_KV = decltype(make_tma_copy(
         GmemTiledCopyKV{},
-        make_tensor(make_gmem_ptr(static_cast<Element const*>(nullptr)), repeat_like(StrideQKV{}, int32_t(0)), StrideQKV{}),
+        make_tensor(
+            make_gmem_ptr(static_cast<Element const*>(nullptr)), 
+            repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), 
+            typename Seqlen_traits::StrideT{}
+        ),
         take<0, 2>(SmemLayoutK{}),
         select<1, 2>(TileShape_MNK{}),
         size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
@@ -95,20 +100,19 @@ struct CollectiveMainloopFwd {
     // Host side kernel arguments
     struct Arguments {
         Element const* ptr_Q;
-        ShapeQKV const shape_Q;
-        StrideQKV const stride_Q;
+        typename Seqlen_traits::LayoutT layout_Q;
         Element const* ptr_K;
-        ShapeQKV const shape_K;
-        StrideQKV const stride_K;
+        typename Seqlen_traits::LayoutT layout_K;
         Element const* ptr_V;
-        StrideQKV const stride_V;
+        typename Seqlen_traits::LayoutT layout_V;
         float const softmax_scale_log2;
     };
 
     // Device side kernel params
     struct Params {
-        ShapeQKV const shape_Q;
-        ShapeQKV const shape_K;
+        typename Seqlen_traits::LayoutT layout_Q;
+        typename Seqlen_traits::LayoutT layout_K;
+        typename Seqlen_traits::LayoutT layout_V;
         cutlass::FastDivmod qhead_per_khead_divmod;
         TMA_Q tma_load_Q;
         TMA_KV tma_load_K, tma_load_V;
@@ -118,29 +122,29 @@ struct CollectiveMainloopFwd {
 
     static Params
     to_underlying_arguments(Arguments const& args) {
-        Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
+        Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.layout_Q);
         TMA_Q tma_load_Q = make_tma_copy(
             GmemTiledCopyQ{},
             mQ,
             SmemLayoutQ{},
             select<0, 2>(TileShape_MNK{}),
             _1{}); // no mcast for Q
-        Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
+        Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.layout_K);
         TMA_KV tma_load_K = make_tma_copy(
             GmemTiledCopyKV{},
             mK,
             SmemLayoutK{}(_, _, _0{}),
             select<1, 2>(TileShape_MNK{}),
             size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
-        Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.shape_K, args.stride_V);
+        Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.layout_V);
         TMA_KV tma_load_V = make_tma_copy(
             GmemTiledCopyKV{},
             mV,
             SmemLayoutV{}(_, _, _0{}),
             select<1, 2>(TileShape_MNK{}),
             size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
-        return {args.shape_Q, args.shape_K,
-                cutlass::FastDivmod(cute::ceil_div(get<2>(args.shape_Q), get<2>(args.shape_K))),
+        return {args.layout_Q, args.layout_K, args.layout_V,
+                cutlass::FastDivmod(cute::ceil_div(get<2>(args.layout_Q.shape()), get<2>(args.layout_K.shape()))),
                 tma_load_Q, tma_load_K, tma_load_V,
                 args.softmax_scale_log2};
     }
@@ -154,11 +158,15 @@ struct CollectiveMainloopFwd {
     }
 
     CUTLASS_DEVICE
-    int get_n_block_max(Params const& mainloop_params, int m_block) {
+    int get_n_block_max(
+          Params const& mainloop_params, int m_block, 
+          const Seqlen_traits& seqlen_traits_q,
+          const Seqlen_traits& seqlen_traits_k
+        ) {
         static constexpr int kBlockM = get<0>(TileShape_MNK{});
         static constexpr int kBlockN = get<1>(TileShape_MNK{});
-        int const seqlen_q = get<0>(mainloop_params.shape_Q);
-        int const seqlen_k = get<0>(mainloop_params.shape_K);
+        int const seqlen_q = seqlen_traits_q.actual_seq_len;
+        int const seqlen_k = seqlen_traits_k.actual_seq_len;
         int n_block_max = cute::ceil_div(seqlen_k, kBlockN);
         if constexpr (Is_causal) {
             n_block_max = std::min(n_block_max,
@@ -179,16 +187,18 @@ struct CollectiveMainloopFwd {
          typename Scheduler::Params const& scheduler_params,
          typename Scheduler::WorkTileInfo& work_tile_info,
          cute::tuple<int32_t, int32_t, int32_t> block_coord,
-         int work_idx
+         int work_idx,
+         const Seqlen_traits& seqlen_traits_q,
+         const Seqlen_traits& seqlen_traits_k
          ) {
 
         Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{});
         Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{});
         Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{});
 
-        Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.shape_Q);
-        Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.shape_K);
-        Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.shape_K);
+        Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.layout_Q.shape());
+        Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape());
+        Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape());
 
         auto [m_block, bidh, bidb] = block_coord;
         int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh);
@@ -197,9 +207,12 @@ struct CollectiveMainloopFwd {
         uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
         constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
         uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-        Tensor gQ = local_tile(mQ(_, _, bidh, bidb), select<0, 2>(TileShape_MNK{}), make_coord(m_block, _0{}));  // (M, K)
-        Tensor gK = local_tile(mK(_, _, bidh_kv, bidb), select<1, 2>(TileShape_MNK{}), make_coord(_, _0{}));  // (N, K, _)
-        Tensor gV = local_tile(mV(_, _, bidh_kv, bidb), select<1, 2>(TileShape_MNK{}), make_coord(_, _0{}));  // (N, K, _)
+        Tensor gQ = seqlen_traits_q.get_local_tile_tensor(
+            mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block);  // (M, K)
+        Tensor gK = seqlen_traits_k.get_local_tile_tensor(
+            mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb);  // (N, K, _)
+        Tensor gV = seqlen_traits_k.get_local_tile_tensor(
+            mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb);  // (N, K, _)
 
         Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{}));
         Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{}));
@@ -218,7 +231,7 @@ struct CollectiveMainloopFwd {
             }
         }
 
-        int n_block_max = get_n_block_max(mainloop_params, m_block);
+        int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k);
         int n_block = n_block_max - 1;
 
         int lane_predicate = cute::elect_one_sync();
@@ -331,7 +344,9 @@ struct CollectiveMainloopFwd {
         int thread_idx,
         int work_idx,
         int m_block,
-        SharedStorage& shared_storage
+        SharedStorage& shared_storage,
+        const Seqlen_traits& seqlen_traits_q,
+        const Seqlen_traits& seqlen_traits_k
         ) {
         static_assert(is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
 
@@ -360,8 +375,8 @@ struct CollectiveMainloopFwd {
         };
 
         tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero;
-        int const seqlen_q = get<0>(mainloop_params.shape_Q);
-        int const seqlen_k = get<0>(mainloop_params.shape_K);
+        int const seqlen_q = seqlen_traits_q.actual_seq_len;
+        int const seqlen_k = seqlen_traits_k.actual_seq_len;
         int n_block = n_block_count - 1;
 
         cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(shared_storage.barrier_Q.try_wait(work_idx % 2));
@@ -483,4 +498,3 @@ struct CollectiveMainloopFwd {
 };
 
 } // namespace flash
-
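
The Arguments struct above now carries full cute layouts instead of separate shape/stride pairs, so the fixed- and var-seq-len cases differ only in which SeqLenTraits built the layout. As a rough host-side sketch of how a launcher could populate it (not part of this commit; the Flash_fwd_params field names and the seqlen_traits_q / seqlen_traits_k objects are assumptions):

// Hedged sketch: filling the layout-based Arguments on the host.
typename CollectiveMainloop::Arguments mainloop_args {
    static_cast<Element const*>(params.q_ptr),
    seqlen_traits_q.get_gmem_layout(
        params.seqlen_q, params.d, params.h, params.b,
        params.q_row_stride, params.q_head_stride, params.q_batch_stride),
    static_cast<Element const*>(params.k_ptr),
    seqlen_traits_k.get_gmem_layout(
        params.seqlen_k, params.d, params.h_k, params.b,
        params.k_row_stride, params.k_head_stride, params.k_batch_stride),
    static_cast<Element const*>(params.v_ptr),
    seqlen_traits_k.get_gmem_layout(
        params.seqlen_k, params.d, params.h_k, params.b,
        params.v_row_stride, params.v_head_stride, params.v_batch_stride),
    params.scale_softmax_log2
};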

+ 168 - 0
hopper/seq_len.h

@@ -0,0 +1,168 @@
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/cutlass.h>
+#include <cute/layout.hpp>
+
+namespace flash {
+
+static constexpr int kMaxTileSize = 128;
+
+template <bool UseVarSeqLen> class SeqLenTraits {
+public:
+  // Total number of queries / keys. Unpadded.
+  int sum_s = 0;
+  // seq len offsets.
+  int *cu_seq_len = nullptr;
+  // actual seq len array.
+  int *seq_used = nullptr;
+  // seq len of the current batch.
+  int actual_seq_len = -1;
+
+  // Whether this is for fixed-seq-len or var-seq-len.
+  static constexpr bool kUseVarSeqLen = UseVarSeqLen;
+
+  using ShapeT = std::conditional_t<
+      UseVarSeqLen, 
+      cute::Shape<int32_t, int32_t, int32_t>, 
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>
+  >;
+  using StrideT = std::conditional_t<
+      UseVarSeqLen, 
+      cute::Shape<int64_t, _1, int64_t>, 
+      cute::Shape<int64_t, _1, int64_t, int64_t>
+  >;
+  using LayoutT = cute::Layout<ShapeT, StrideT>;
+
+  using ShapeLseT = std::conditional_t<
+      UseVarSeqLen, 
+      cute::Shape<int32_t, int32_t>, 
+      cute::Shape<int32_t, int32_t, int32_t>
+  >;
+  using StrideLseT = std::conditional_t<
+      UseVarSeqLen, 
+      cute::Shape<int64_t, _1>, 
+      cute::Shape<int64_t, int64_t, _1>
+  >;
+  using LayoutLseT = cute::Layout<ShapeLseT, StrideLseT>;
+
+  CUTLASS_HOST SeqLenTraits() {}
+
+  CUTLASS_HOST SeqLenTraits(
+      int sum_s, int max_seq_len, int *cu_seq_len = nullptr, int *seq_used = nullptr): 
+      sum_s(sum_s), cu_seq_len(cu_seq_len), seq_used(seq_used), actual_seq_len(max_seq_len) {}
+
+  // Returns the layout of a tensor in MKHB format in global memory.
+  // padded: only useful for var-seq-len for dq_accum and softmax_d.
+  CUTLASS_HOST_DEVICE auto get_gmem_layout(
+      int m, int k, int h, int b, 
+      int64_t m_stride, int64_t h_stride, int64_t b_stride,
+      bool padded = false) const {
+    static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen.");
+    return make_layout(make_shape(m, k, h, b),
+                       make_stride(m_stride, cute::_1{}, h_stride, b_stride));
+  }
+
+  // Returns the layout of the LSE tensor in BHM format in global memory.
+  // padded: only useful for var-seq-len for dq_accum and softmax_d.
+  CUTLASS_HOST_DEVICE auto get_lse_gmem_layout(
+      int m, int h, int b, bool padded = false) const {
+    static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen.");
+    return make_layout(make_shape(b, h, m),
+                       make_stride(int64_t(h * m), int64_t(m), cute::_1()));
+  }
+
+  CUTLASS_DEVICE void init(int bidb) {}
+
+  template <typename MTensor, typename Shape>
+  CUTLASS_DEVICE auto get_local_tile_tensor(
+      const MTensor &m_tensor, const Shape &tile_shape, 
+      int bidh, int bidb, bool padded = false) const {
+    auto g_tensor = local_tile(
+      m_tensor(_, _, bidh, bidb), tile_shape, make_coord(_, _0{}));
+    return g_tensor;
+  }
+
+  template <typename MTensor, typename Shape>
+  CUTLASS_DEVICE auto get_lse_local_tile_tensor(
+      const MTensor &m_tensor, const Shape &tile_shape, 
+      int bidh, int bidb, bool padded = false) const {
+    auto g_tensor = local_tile(m_tensor(bidb, bidh, _), tile_shape, make_coord(_));
+    return g_tensor;
+  }
+};
+
+using FixedSeqLenTraits = SeqLenTraits<false>;
+
+using VarSeqLenTraits = SeqLenTraits<true>;
+
+// Returns the layout of a var-seq-len tensor in global memory: the batch mode is
+// folded into the first (token) mode, whose extent is sum_s.
+// padded: only useful for var-seq-len for dq_accum and softmax_d.
+// When padded is True, use B_M + kMaxTileSize * B as the total B_M.
+template <>
+CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_gmem_layout(
+    int m, int k, int h, int b, 
+    int64_t m_stride, int64_t h_stride, int64_t b_stride,
+    bool padded) const {
+  return make_layout(
+    make_shape(sum_s + (padded ? kMaxTileSize * b : 0), k, h), 
+    make_stride(m_stride, cute::_1{}, h_stride));
+}
+
+// padded: only useful for var-seq-len for dq_accum and softmax_d.
+// When padded is True, use B_M + kMaxTileSize * B as the total B_M.
+template <>
+CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_lse_gmem_layout(
+    int m, int h, int b, bool padded) const {
+  return make_layout(
+    make_shape(h, sum_s + (padded ? kMaxTileSize * b : 0)), 
+    make_stride(int64_t(sum_s + (padded ? kMaxTileSize * b : 0)), cute::_1()));
+}
+
+template <>
+CUTLASS_DEVICE void VarSeqLenTraits::init(int bidb) {
+  actual_seq_len = 
+      seq_used ? seq_used[bidb] : (cu_seq_len[bidb + 1] - cu_seq_len[bidb]);
+}
+
+template <>
+template <typename MTensor, typename Shape>
+CUTLASS_DEVICE auto VarSeqLenTraits::get_local_tile_tensor(
+    const MTensor &m_tensor, const Shape &tile_shape,
+    int bidh, int bidb, bool padded) const {
+  auto g_offset = local_tile(
+      m_tensor(_, _, bidh), 
+      cute::make_shape(1, get<1>(tile_shape)), 
+      make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0), _0{}));
+  auto g_sequence = make_tensor(
+      g_offset.data(), 
+      make_layout(
+        cute::make_shape(actual_seq_len, get<1>(tile_shape)), 
+        g_offset.stride()
+      ));
+  auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_, _0{}));
+  return g_tensor;
+}
+
+template <>
+template <typename MTensor, typename Shape>
+CUTLASS_DEVICE auto VarSeqLenTraits::get_lse_local_tile_tensor(
+    const MTensor &m_tensor, const Shape &tile_shape,
+    int bidh, int bidb, bool padded) const {
+  auto g_offset = local_tile(
+      m_tensor(bidh, _), cute::make_shape(_1{}), 
+      make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0)));
+  auto g_sequence = make_tensor(
+      g_offset.data(), 
+      make_layout(cute::make_shape(actual_seq_len), cute::make_shape(_1{})));
+  auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_));
+  return g_tensor;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace flash
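
A usage sketch for the traits above (the host-side names total_q, max_seqlen_q, cu_seqlens_q and the stride values are illustrative, not from this commit): one traits object is built per Q and per K/V, and on the device each block calls init(bidb) before tiling so that actual_seq_len reflects its own batch element.

// Hedged sketch, host side:
flash::VarSeqLenTraits seqlen_traits_q(
    /*sum_s=*/total_q, /*max_seq_len=*/max_seqlen_q, /*cu_seq_len=*/cu_seqlens_q);
flash::VarSeqLenTraits seqlen_traits_k(
    /*sum_s=*/total_k, /*max_seq_len=*/max_seqlen_k, /*cu_seq_len=*/cu_seqlens_k);

// Packed Q is addressed as (tokens, head_dim, heads); the m and b_stride
// arguments are ignored by the var-seq-len specialization, which uses sum_s.
auto layout_Q = seqlen_traits_q.get_gmem_layout(
    max_seqlen_q, head_dim, num_heads, batch_size,
    q_row_stride, q_head_stride, /*b_stride=*/0);

// Device side, once per (block, batch element):
//   seqlen_traits_q.init(bidb);
//   // actual_seq_len == cu_seqlens_q[bidb + 1] - cu_seqlens_q[bidb]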

+ 6 - 10
hopper/static_switch.h

@@ -66,18 +66,14 @@
     }                                                                          \
   }()
 
-#define SEQLEN_SWITCH(USE_VAR_SEQ_LEN, SEQ_LEN_OUT_OF_BOUND_CHECK, ...)        \
+#define SEQLEN_SWITCH(USE_VAR_SEQ_LEN, NAME, ...)                              \
   [&] {                                                                        \
-    if (!USE_VAR_SEQ_LEN) {                                                    \
-      if (SEQ_LEN_OUT_OF_BOUND_CHECK) {                                        \
-        using kSeqLenTraitsType = FixedSeqLenTraits<true>;                     \
-        return __VA_ARGS__();                                                  \
-      } else {                                                                 \
-        using kSeqLenTraitsType = FixedSeqLenTraits<false>;                    \
-        return __VA_ARGS__();                                                  \
-      }                                                                        \
+    bool useSeqLen = USE_VAR_SEQ_LEN;                                          \
+    if (useSeqLen) {                                                           \
+      using NAME = flash::VarSeqLenTraits;                                     \
+      return __VA_ARGS__();                                                    \
     } else {                                                                   \
-      using kSeqLenTraitsType = VarSeqLenTraits;                               \
+      using NAME = flash::FixedSeqLenTraits;                                   \
       return __VA_ARGS__();                                                    \
     }                                                                          \
   }() 
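
Since the first macro argument is evaluated as a boolean, passing params.cu_seqlens_q (as the launch templates above do) selects the var-seq-len traits exactly when a cu_seqlens pointer was supplied. A minimal call-site sketch (Kernel_traits and Is_causal stand in for whatever the enclosing switches provide):

SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] {
    // Inside the lambda, Seqlen_traits aliases flash::VarSeqLenTraits when
    // cu_seqlens_q != nullptr and flash::FixedSeqLenTraits otherwise.
    run_flash_fwd<Kernel_traits, Is_causal, Seqlen_traits>(params, stream);
});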

+ 170 - 113
hopper/test_flash_attn.py

@@ -5,40 +5,12 @@ import torch
 import torch.nn.functional as F
 
 from einops import rearrange, repeat
-from flash_attn_interface import flash_attn_func
+from flash_attn_interface import flash_attn_func, flash_attn_varlen_func
+from tests.test_util import generate_random_padding_mask, generate_qkv, construct_local_mask, attention_ref
 
 ABS_TOL = 5e-3
 REL_TOL = 1e-1
 
-def construct_local_mask(
-    seqlen_q,
-    seqlen_k,
-    window_size=(-1, -1),  # -1 means infinite window size
-    query_padding_mask=None,
-    key_padding_mask=None,
-    device=None,
-):
-    row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
-    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
-    sk = (
-        seqlen_k
-        if key_padding_mask is None
-        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
-    )
-    sq = (
-        seqlen_q
-        if query_padding_mask is None
-        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
-    )
-    if window_size[0] < 0:
-        return col_idx > row_idx + sk - sq + window_size[1]
-    else:
-        sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
-        return torch.logical_or(
-            col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
-            col_idx < row_idx + sk - sq - window_size[0],
-        )
-
 def print_diffs(out, out_ref):
     out_1d = out.flatten()
     out_ref_1d = out_ref.flatten()
@@ -51,86 +23,6 @@ def print_diffs(out, out_ref):
             print(f"==== diff ==== {idx}, test: {e_o}, ref: {e_o_ref}")
 
 
-def attention_ref(
-    q,
-    k,
-    v,
-    query_padding_mask=None,
-    key_padding_mask=None,
-    attn_bias=None,
-    dropout_p=0.0,
-    dropout_mask=None,
-    causal=False,
-    upcast=True,
-    reorder_ops=False,
-):
-    """
-    Arguments:
-        q: (batch_size, seqlen_q, nheads, head_dim)
-        k: (batch_size, seqlen_k, nheads, head_dim)
-        v: (batch_size, seqlen_k, nheads, head_dim)
-        query_padding_mask: (batch_size, seqlen_q)
-        key_padding_mask: (batch_size, seqlen_k)
-        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
-        dropout_p: float
-        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
-        causal: whether to apply causal masking
-        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
-            output back to fp16/bf16.
-        reorder_ops: whether to change the order of operations (scaling k instead of scaling k, etc.)
-            without changing the math. This is to estimate the numerical error from operation
-            reordering.
-    Output:
-        output: (batch_size, seqlen_q, nheads, head_dim)
-        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
-    """
-    dtype_og = q.dtype
-    if upcast:
-        q, k, v = q.float(), k.float(), v.float()
-    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
-    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
-    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
-    d = q.shape[-1]
-    if not reorder_ops:
-        scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
-    else:
-        scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
-    if key_padding_mask is not None:
-        scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
-    if causal:
-        local_mask = construct_local_mask(
-            seqlen_q,
-            seqlen_k,
-            (-1, 0),
-            None,
-            None,
-            q.device,
-        )
-        scores.masked_fill_(local_mask, float("-inf"))
-    if attn_bias is not None:
-        scores = scores + attn_bias
-    attention = torch.softmax(scores, dim=-1).to(v.dtype)
-    # We want to mask here so that the attention matrix doesn't have any NaNs
-    # Otherwise we'll get NaN in dV
-    if query_padding_mask is not None:
-        attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
-    # Some rows might be completely masked out so we fill them with zero instead of NaN
-    if causal:
-        attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0)
-    dropout_scaling = 1.0 / (1 - dropout_p)
-    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
-    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
-    if dropout_mask is not None:
-        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
-    else:
-        attention_drop = attention
-    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
-    if query_padding_mask is not None:
-        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
-    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
-
-
-
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 # @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@@ -142,10 +34,11 @@ def attention_ref(
 # @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
 # @pytest.mark.parametrize('d', [56, 80])
 @pytest.mark.parametrize("d", [64, 128, 256])
-# @pytest.mark.parametrize("d", [256])
+# @pytest.mark.parametrize("d", [128])
 @pytest.mark.parametrize(
     "seqlen_q,seqlen_k",
     [
+        (257, 1),
         (64, 128),
         (128, 128),
         (256, 256),
@@ -175,8 +68,9 @@ def test_flash_attn_output(
     batch_size = 9
     nheads = 6
     nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
-    # batch_size = 1
-    # nheads = 1
+    # nheads_kv = 2
+    # batch_size = 9
+    # nheads = 6
     q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
     k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True)
     v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True)
@@ -244,9 +138,172 @@ def test_flash_attn_output(
 
     # Check that FlashAttention's numerical error is at most twice the numerical error
     # of a Pytorch implementation.
+    # breakpoint()
     assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
 
     # if d <= 128:
     #     assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item()
     #     assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item()
     #     assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item()
+
+
+@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("causal", [False, True])
+@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
+# @pytest.mark.parametrize('causal', [True])
+# @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
+# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
+# @pytest.mark.parametrize('d', [128])
+@pytest.mark.parametrize("d", [64, 128, 256])
+@pytest.mark.parametrize(
+    "seqlen_q,seqlen_k",
+    [
+        (1, 1),
+        (1, 3),
+        (2, 1),
+        (511, 1),
+        (3, 513),
+        (64, 128),
+        (113, 203),
+        (128, 128),
+        (128, 217),
+        (113, 211),
+        (108, 256),
+        (256, 512),
+        (384, 256),
+        (512, 256),
+        (640, 128),
+        (1024, 1024),
+        (1023, 1024),
+        (1024, 1023),
+        (2048, 2048),
+    ],
+)
+# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)])
+def test_flash_attn_varlen_output(
+    seqlen_q, seqlen_k, d, causal, mha_type, dtype
+):
+    if (
+        max(seqlen_q, seqlen_k) >= 2048
+        and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30
+    ):
+        pytest.skip()  # Reference implementation OOM
+    device = "cuda"
+    # set seed
+    torch.random.manual_seed(0)
+    # batch_size = 1
+    # nheads = 1
+    batch_size = 9
+    nheads = 6
+    nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
+ 
+    q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
+    k = torch.randn(
+        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
+    )
+    v = torch.randn(
+        batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype, requires_grad=True
+    )
+
+    query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random")
+    key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random")
+    # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full')
+
+    (
+        q_unpad,
+        k_unpad,
+        v_unpad,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        q,
+        k,
+        v,
+        output_pad_fn,
+        dq_pad_fn,
+        dk_pad_fn,
+    ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False)
+    # print("cu_seqlens_q: ", cu_seqlens_q)
+    # print("cu_seqlens_k: ", cu_seqlens_k)
+    # print("q_unpad, shape: ", q_unpad.shape)
+    # print("k_unpad, shape: ", k_unpad.shape)
+    # print("v_unpad, shape: ", v_unpad.shape)
+    out_unpad, sm_lse = flash_attn_varlen_func(
+        q_unpad,
+        k_unpad,
+        v_unpad,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        causal=causal,
+    )
+    out = output_pad_fn(out_unpad)
+    dropout_mask = None
+
+    out_ref, attn_ref = attention_ref(
+        q,
+        k,
+        v,
+        query_padding_mask,
+        key_padding_mask,
+        causal=causal,
+    )
+    out_pt, attn_pt = attention_ref(
+        q,
+        k,
+        v,
+        query_padding_mask,
+        key_padding_mask,
+        causal=causal,
+        upcast=False,
+        reorder_ops=True,
+    )
+
+    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
+    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
+    print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
+    print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
+
+    # g = torch.randn_like(out)
+    # if d <= 128:
+    #     (
+    #         dq_unpad,
+    #         dk_unpad,
+    #         dv_unpad,
+    #     ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g)
+    #     dk = dk_pad_fn(dk_unpad)
+    #     dv = dk_pad_fn(dv_unpad)
+    #     (
+    #         dq_ref,
+    #         dk_ref,
+    #         dv_ref,
+    #     ) = torch.autograd.grad(out_ref, (q, k, v), g)
+    #     (
+    #         dq_pt,
+    #         dk_pt,
+    #         dv_pt,
+    #     ) = torch.autograd.grad(out_pt, (q, k, v), g)
+    #     dq = dq_pad_fn(dq_unpad)
+    #     print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
+    #     print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
+    #     print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
+    #     print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
+    #     print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
+    #     print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
+    #     print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
+    #     print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
+    #     print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
+    #     print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
+    #     print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
+    #     print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
+
+    # Check that FlashAttention's numerical error is at most twice the numerical error
+    # of a Pytorch implementation.
+    assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
+
+    # if d <= 128:
+    #     assert (dq - dq_ref).abs().max().item() < 1e-4 or (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item()
+    #     assert (dk - dk_ref).abs().max().item() < 1e-4 or (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item()
+    #     assert (dv - dv_ref).abs().max().item() < 1e-4 or (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item()

+ 90 - 0
hopper/utils.h

@@ -15,6 +15,7 @@
 #endif
 
 #include <cute/tensor.hpp>
+#include <cute/atom/copy_atom.hpp>
 
 #include <cutlass/array.h>
 #include <cutlass/cutlass.h>
@@ -228,4 +229,93 @@ __forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor<Engine0, Layou
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template <int NumCopyThreads, typename ElemO, typename TMACopyO, typename LayoutO, 
+          typename TileShapeO, typename SMemO, typename SeqLenTraits>
+__forceinline__ __device__ void write_tma(
+        ElemO* O, const TMACopyO& tma_store_O,
+        const LayoutO& layout_O, const TileShapeO& tile_shape_O,
+        const SMemO& sO, int m_block, int bidh, int bidb,
+        const SeqLenTraits& seqlen_traits_o, int write_warp_idx) {
+    Tensor mO = tma_store_O.get_tma_tensor(layout_O.shape());
+    Tensor gO = seqlen_traits_o.get_local_tile_tensor(
+        mO, tile_shape_O, bidh, bidb
+    )(_, _, m_block);  // (M, K)
+    auto block_tma_O = tma_store_O.get_slice(_0{});
+    Tensor tOgO = block_tma_O.partition_D(gO);  // (TMA, TMA_M, TMA_K)
+    Tensor tOsO = block_tma_O.partition_S(sO);  // (TMA, TMA_M, TMA_K)
+
+    int const lane_predicate = cute::elect_one_sync();
+    int const warp_idx = cutlass::canonical_warp_idx_sync();
+    if (warp_idx == write_warp_idx && lane_predicate) {
+        cute::copy(tma_store_O, tOsO, tOgO);
+        tma_store_arrive();
+    }
+    // Note: no wait here.
+    // tma_store_wait<0>();
+}
+
+template <int NumCopyThreads, typename ElemO, typename TiledCopyO, typename LayoutO, 
+          typename TileShapeO, typename SMemO, typename SeqLenTraits>
+__forceinline__ __device__ void write_tiled(
+        ElemO* O, const TiledCopyO& tiled_copy_O,
+        const LayoutO& layout_O, const TileShapeO& tile_shape_O,
+        const SMemO& sO, int m_block, int bidh, int bidb,
+        const SeqLenTraits& seqlen_traits_o) {
+    Tensor mO = make_tensor(make_gmem_ptr(O), layout_O);
+    Tensor gO = seqlen_traits_o.get_local_tile_tensor(
+        mO, tile_shape_O, bidh, bidb
+    )(_, _, m_block);  // (M, K)
+
+    ThrCopy thr_copy_O = tiled_copy_O.get_slice(threadIdx.x - NumCopyThreads);
+    Tensor tOgO = thr_copy_O.partition_D(gO); // (CPY,CPY_M,CPY_K,k)
+    Tensor tOsO = thr_copy_O.partition_S(sO); // (CPY,CPY_M,CPY_K)
+
+    // Prepare for TiledCopy.
+    // Grouping is needed because cute::copy_if() does group_modes<1, R> for src and dst.
+    // After grouping, the first dim is number of elements to read together.
+    Tensor tOsOFlatten = cute::flatten(tOsO);
+    Tensor tOsOGroup = cute::group_modes<1, rank(tOsOFlatten)>(tOsOFlatten);
+    Tensor tOgOFlatten = cute::flatten(tOgO);
+    Tensor tOgOGroup = cute::group_modes<1, rank(tOgOFlatten)>(tOgOFlatten);
+
+    // Get thread coords to global index mapping.
+    Tensor gOCounting = cute::make_identity_tensor(gO.shape());
+    Tensor tSgOCounting = thr_copy_O.partition_D(gOCounting);
+    Tensor tSgOCountingFlatten = cute::flatten(tSgOCounting);
+    Tensor tSgOCountingGrouped =
+        cute::group_modes<1, rank(tSgOCountingFlatten)>(tSgOCountingFlatten);
+
+    // Write out to GMEM.
+    const int kNumMsPerTile = get<0>(tile_shape_O);
+    int cta_m = std::min(
+        seqlen_traits_o.actual_seq_len - m_block * kNumMsPerTile, kNumMsPerTile
+    );
+    if (cta_m == kNumMsPerTile) {
+        copy(tiled_copy_O, tOsOGroup, tOgOGroup);
+    } else {
+        auto predicate_fn = [&](auto coords) {
+            auto s_coords = tSgOCountingGrouped(_0{}, coords);
+            return elem_less(get<0>(s_coords), cta_m);
+        };
+        copy_if(tiled_copy_O, predicate_fn, tOsOGroup, tOgOGroup);
+    }
+}
+
+template <bool IsTMACopy, int NumCopyThreads, typename ElemO, 
+          typename TMACopyO, typename TiledCopyO, typename LayoutO, 
+          typename TileShapeO, typename SMemO, typename SeqLenTraits>
+__forceinline__ __device__ void write_O(
+        ElemO* O, const TMACopyO& tma_copy_O, const TiledCopyO& tiled_copy_O,
+        const LayoutO& layout_O, const TileShapeO& tile_shape_O,
+        const SMemO& sO, int m_block, int bidh, int bidb,
+        const SeqLenTraits& seqlen_traits_o, int write_warp_idx) {
+    if constexpr (IsTMACopy) {
+        write_tma<NumCopyThreads>(O, tma_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o, write_warp_idx);
+    } else {
+        write_tiled<NumCopyThreads>(O, tiled_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace flash
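
A small self-contained example of the boundary math that decides between cute::copy and copy_if in write_tiled above (the tile size and sequence length are hypothetical):

#include <algorithm>

// With 128-row output tiles and a ragged sequence of 200 rows, the first tile
// is full and takes the plain copy; the second keeps only 72 valid rows and
// takes the predicated copy_if path, masking out rows 72..127.
int cta_m_for(int m_block) {
    constexpr int kNumMsPerTile = 128;   // get<0>(tile_shape_O)
    const int actual_seq_len = 200;      // seqlen_traits_o.actual_seq_len
    return std::min(actual_seq_len - m_block * kNumMsPerTile, kNumMsPerTile);
}
// cta_m_for(0) == 128  -> full tile, cute::copy
// cta_m_for(1) == 72   -> copy_if with predicate elem_less(row, 72)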

+ 254 - 0
tests/test_util.py

@@ -0,0 +1,254 @@
+import math
+
+import torch
+from einops import rearrange, repeat
+from flash_attn.bert_padding import pad_input, unpad_input
+
+
+def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"):
+    assert mode in ["full", "random", "third"]
+    if mode == "full":
+        lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32)
+    elif mode == "random":
+        lengths = torch.randint(
+            max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device
+        )
+    elif mode == "third":
+        lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device)
+    padding_mask = (
+        repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths
+    )
+    return padding_mask
+
+
+def generate_qkv(
+    q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, d)
+        k: (batch_size, seqlen_k, nheads_k, d)
+        v: (batch_size, seqlen_k, nheads_k, d)
+        query_padding_mask: (batch_size, seqlen_q), bool
+        key_padding_mask: (batch_size, seqlen_k), bool
+    """
+    assert not (kvpacked and qkvpacked)
+    batch_size, seqlen_q, nheads, d = q.shape
+    _, seqlen_k, nheads_k, _ = k.shape
+    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
+    assert v.shape == (batch_size, seqlen_k, nheads_k, d)
+
+    if query_padding_mask is not None:
+        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask)
+        output_pad_fn = lambda output_unpad: pad_input(
+            output_unpad, indices_q, batch_size, seqlen_q
+        )
+    else:
+        q_unpad = rearrange(q, "b s h d -> (b s) h d")
+        cu_seqlens_q = torch.arange(
+            0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device
+        )
+        max_seqlen_q = seqlen_q
+        output_pad_fn = lambda output_unpad: rearrange(
+            output_unpad, "(b s) h d -> b s h d", b=batch_size
+        )
+
+    if key_padding_mask is not None:
+        k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask)
+        v_unpad, _, _, _ = unpad_input(v, key_padding_mask)
+    else:
+        k_unpad = rearrange(k, "b s h d -> (b s) h d")
+        v_unpad = rearrange(v, "b s h d -> (b s) h d")
+        cu_seqlens_k = torch.arange(
+            0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device
+        )
+        max_seqlen_k = seqlen_k
+
+    if qkvpacked:
+        assert (query_padding_mask == key_padding_mask).all()
+        assert nheads == nheads_k
+        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
+        qkv = torch.stack([q, k, v], dim=2)
+        if query_padding_mask is not None:
+            dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q)
+        else:
+            dqkv_pad_fn = lambda dqkv_unpad: rearrange(
+                dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
+            )
+        return (
+            qkv_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            max_seqlen_q,
+            qkv.detach().requires_grad_(),
+            output_pad_fn,
+            dqkv_pad_fn,
+        )
+    elif kvpacked:
+        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
+        kv = torch.stack([k, v], dim=2)
+        dq_pad_fn = output_pad_fn
+        if key_padding_mask is not None:
+            dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k)
+        else:
+            dkv_pad_fn = lambda dkv_unpad: rearrange(
+                dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
+            )
+        return (
+            q_unpad.detach().requires_grad_(),
+            kv_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q.detach().requires_grad_(),
+            kv.detach().requires_grad_(),
+            output_pad_fn,
+            dq_pad_fn,
+            dkv_pad_fn,
+        )
+    else:
+        dq_pad_fn = output_pad_fn
+        if key_padding_mask is not None:
+            dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k)
+        else:
+            dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size)
+        return (
+            q_unpad.detach().requires_grad_(),
+            k_unpad.detach().requires_grad_(),
+            v_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q.detach().requires_grad_(),
+            k.detach().requires_grad_(),
+            v.detach().requires_grad_(),
+            output_pad_fn,
+            dq_pad_fn,
+            dk_pad_fn,
+        )
+
+
+def construct_local_mask(
+    seqlen_q,
+    seqlen_k,
+    window_size=(-1, -1),  # -1 means infinite window size
+    query_padding_mask=None,
+    key_padding_mask=None,
+    device=None,
+    key_leftpad=None,
+):
+    row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
+    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
+    if key_leftpad is not None:
+        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
+        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
+        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
+    sk = (
+        seqlen_k
+        if key_padding_mask is None
+        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    sq = (
+        seqlen_q
+        if query_padding_mask is None
+        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    if window_size[0] < 0:
+        return col_idx > row_idx + sk - sq + window_size[1]
+    else:
+        sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
+        return torch.logical_or(
+            col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
+            col_idx < row_idx + sk - sq - window_size[0],
+        )
+
+
+def attention_ref(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    attn_bias=None,
+    dropout_p=0.0,
+    dropout_mask=None,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite window size
+    softcap=0.0,
+    upcast=True,
+    reorder_ops=False,
+    key_leftpad=None,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, head_dim)
+        k: (batch_size, seqlen_k, nheads_k, head_dim)
+        v: (batch_size, seqlen_k, nheads_k, head_dim)
+        query_padding_mask: (batch_size, seqlen_q)
+        key_padding_mask: (batch_size, seqlen_k)
+        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
+        dropout_p: float
+        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
+        causal: whether to apply causal masking
+        window_size: (int, int), left and right window size
+        softcap: float; if > 0, apply tanh soft-capping to the attention scores
+        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
+            output back to fp16/bf16.
+        reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.)
+            without changing the math. This is to estimate the numerical error from operation
+            reordering.
+    Output:
+        output: (batch_size, seqlen_q, nheads, head_dim)
+        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
+    """
+    if causal:
+        window_size = (window_size[0], 0)
+    dtype_og = q.dtype
+    if upcast:
+        q, k, v = q.float(), k.float(), v.float()
+    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
+    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
+    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
+    d = q.shape[-1]
+    if not reorder_ops:
+        scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
+    else:
+        scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
+    if softcap > 0:
+        scores /= softcap
+        scores = scores.tanh()
+        scores *= softcap
+    if key_padding_mask is not None:
+        scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
+    if window_size[0] >= 0 or window_size[1] >= 0:
+        local_mask = construct_local_mask(
+            seqlen_q,
+            seqlen_k,
+            window_size,
+            query_padding_mask,
+            key_padding_mask,
+            q.device,
+            key_leftpad=key_leftpad,
+        )
+        scores.masked_fill_(local_mask, float("-inf"))
+    if attn_bias is not None:
+        scores = scores + attn_bias
+    attention = torch.softmax(scores, dim=-1).to(v.dtype)
+    # Some rows might be completely masked out so we fill them with zero instead of NaN
+    if window_size[0] >= 0 or window_size[1] >= 0:
+        attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0)
+    # We want to mask here so that the attention matrix doesn't have any NaNs
+    # Otherwise we'll get NaN in dV
+    if query_padding_mask is not None:
+        attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
+    dropout_scaling = 1.0 / (1 - dropout_p)
+    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
+    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
+    if dropout_mask is not None:
+        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
+    else:
+        attention_drop = attention
+    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
+    if query_padding_mask is not None:
+        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
+    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)