Ying Zhang 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
..
__init__.py 7f67966cc7 FA3 initial code release 6 ماه پیش
benchmark_attn.py 3669b25206 bwd benchmark + small fixes (#1129) 5 ماه پیش
benchmark_flash_attention_fp8.py c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) 4 ماه پیش
block_info.h 7f67966cc7 FA3 initial code release 6 ماه پیش
epilogue_bwd_sm90_tma.hpp 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
epilogue_fwd_sm90_tma.hpp a3a257c71d Fix out-of-bound writes for var-seq-len zero-length KVs 5 ماه پیش
flash.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
flash_api.cpp 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
flash_attn_interface.py 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
flash_bwd_hdim128_bf16_sm90.cu bafe253042 [FA3] Bwd 5 ماه پیش
flash_bwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release 6 ماه پیش
flash_bwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release 6 ماه پیش
flash_bwd_hdim64_bf16_sm90.cu bafe253042 [FA3] Bwd 5 ماه پیش
flash_bwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release 6 ماه پیش
flash_bwd_hdim96_bf16_sm90.cu bafe253042 [FA3] Bwd 5 ماه پیش
flash_bwd_hdim96_fp16_sm90.cu bafe253042 [FA3] Bwd 5 ماه پیش
flash_bwd_kernel.h 3f6ff1c1c5 Remove struct : cute::aligned_struct to avoid error with gcc 12 5 ماه پیش
flash_bwd_launch_template.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
flash_bwd_postprocess_kernel.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
flash_bwd_preprocess_kernel.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
flash_fwd_hdim128_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward 6 ماه پیش
flash_fwd_hdim128_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 5 ماه پیش
flash_fwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release 6 ماه پیش
flash_fwd_hdim256_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward 6 ماه پیش
flash_fwd_hdim256_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 5 ماه پیش
flash_fwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release 6 ماه پیش
flash_fwd_hdim64_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward 6 ماه پیش
flash_fwd_hdim64_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 5 ماه پیش
flash_fwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release 6 ماه پیش
flash_fwd_kernel.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) 4 ماه پیش
flash_fwd_launch_template.h 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
kernel_traits.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) 4 ماه پیش
mainloop_bwd_sm90_tma_gmma_ws.hpp 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
mainloop_fwd_sm90_tma_gmma_ws.hpp c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) 4 ماه پیش
named_barrier.hpp bafe253042 [FA3] Bwd 5 ماه پیش
seq_len.h dfe1a59e4b Add var-seq-len to FA3 fp16 / bf16 fwd (#1072) 6 ماه پیش
setup.py bafe253042 [FA3] Bwd 5 ماه پیش
softmax.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) 4 ماه پیش
static_switch.h dfe1a59e4b Add var-seq-len to FA3 fp16 / bf16 fwd (#1072) 6 ماه پیش
test_flash_attn.py 496fdc4f6c Add seqused_q in fwd / bwd and seqused_k in bwd. 4 ماه پیش
tile_scheduler.hpp 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 5 ماه پیش
tile_scheduler_bwd.hpp bafe253042 [FA3] Bwd 5 ماه پیش
utils.h bafe253042 [FA3] Bwd 5 ماه پیش