Ying Zhang 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
..
__init__.py 7f67966cc7 FA3 initial code release vor 6 Monaten
benchmark_attn.py 3669b25206 bwd benchmark + small fixes (#1129) vor 5 Monaten
benchmark_flash_attention_fp8.py c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) vor 4 Monaten
block_info.h 7f67966cc7 FA3 initial code release vor 6 Monaten
epilogue_bwd_sm90_tma.hpp 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
epilogue_fwd_sm90_tma.hpp a3a257c71d Fix out-of-bound writes for var-seq-len zero-length KVs vor 5 Monaten
flash.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
flash_api.cpp 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
flash_attn_interface.py 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
flash_bwd_hdim128_bf16_sm90.cu bafe253042 [FA3] Bwd vor 5 Monaten
flash_bwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release vor 6 Monaten
flash_bwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release vor 6 Monaten
flash_bwd_hdim64_bf16_sm90.cu bafe253042 [FA3] Bwd vor 5 Monaten
flash_bwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release vor 6 Monaten
flash_bwd_hdim96_bf16_sm90.cu bafe253042 [FA3] Bwd vor 5 Monaten
flash_bwd_hdim96_fp16_sm90.cu bafe253042 [FA3] Bwd vor 5 Monaten
flash_bwd_kernel.h 3f6ff1c1c5 Remove struct : cute::aligned_struct to avoid error with gcc 12 vor 5 Monaten
flash_bwd_launch_template.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
flash_bwd_postprocess_kernel.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
flash_bwd_preprocess_kernel.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
flash_fwd_hdim128_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward vor 6 Monaten
flash_fwd_hdim128_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) vor 5 Monaten
flash_fwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release vor 6 Monaten
flash_fwd_hdim256_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward vor 6 Monaten
flash_fwd_hdim256_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) vor 5 Monaten
flash_fwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release vor 6 Monaten
flash_fwd_hdim64_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward vor 6 Monaten
flash_fwd_hdim64_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) vor 5 Monaten
flash_fwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release vor 6 Monaten
flash_fwd_kernel.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) vor 4 Monaten
flash_fwd_launch_template.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
kernel_traits.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) vor 4 Monaten
mainloop_bwd_sm90_tma_gmma_ws.hpp 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
mainloop_fwd_sm90_tma_gmma_ws.hpp c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) vor 4 Monaten
named_barrier.hpp bafe253042 [FA3] Bwd vor 5 Monaten
seq_len.h dfe1a59e4b Add var-seq-len to FA3 fp16 / bf16 fwd (#1072) vor 6 Monaten
setup.py bafe253042 [FA3] Bwd vor 5 Monaten
softmax.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) vor 4 Monaten
static_switch.h dfe1a59e4b Add var-seq-len to FA3 fp16 / bf16 fwd (#1072) vor 6 Monaten
test_flash_attn.py 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. vor 4 Monaten
tile_scheduler.hpp 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) vor 5 Monaten
tile_scheduler_bwd.hpp bafe253042 [FA3] Bwd vor 5 Monaten
utils.h bafe253042 [FA3] Bwd vor 5 Monaten