Jay Shah 9b6cba16c1 remove some debug code il y a 2 mois
..
__init__.py 7f67966cc7 FA3 initial code release il y a 5 mois
benchmark_attn.py 3669b25206 bwd benchmark + small fixes (#1129) il y a 4 mois
benchmark_flash_attention_fp8.py 5e3864f2ee change default output type of fp8 kernel to bf16 il y a 2 mois
benchmark_split_kv.py fff4b5c09b add split kv benchmark script il y a 2 mois
combine.h aa45d75f64 dont write out zero for split kernel, only lse=-inf il y a 2 mois
epilogue_bwd_sm90_tma.hpp db80387343 Add seqused_q in fwd / bwd and seqused_k in bwd. il y a 3 mois
epilogue_fwd_sm90_tma.hpp b3d60fa3a5 prune more dead code il y a 2 mois
flash.h c06cc0ba9f change cu_seqlens_k to seqused_k for kv cache api il y a 2 mois
flash_api.cpp 9b6cba16c1 remove some debug code il y a 2 mois
flash_attn_interface.py 50cb90aea6 comment out unimplemented kwargs from flash_attn_with_kvcache il y a 2 mois
flash_bwd_hdim128_bf16_sm90.cu bafe253042 [FA3] Bwd il y a 4 mois
flash_bwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release il y a 5 mois
flash_bwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release il y a 5 mois
flash_bwd_hdim64_bf16_sm90.cu bafe253042 [FA3] Bwd il y a 4 mois
flash_bwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release il y a 5 mois
flash_bwd_hdim96_bf16_sm90.cu bafe253042 [FA3] Bwd il y a 4 mois
flash_bwd_hdim96_fp16_sm90.cu bafe253042 [FA3] Bwd il y a 4 mois
flash_bwd_kernel.h 1c9717d699 address comments il y a 2 mois
flash_bwd_launch_template.h 1c9717d699 address comments il y a 2 mois
flash_bwd_postprocess_kernel.h db80387343 Add seqused_q in fwd / bwd and seqused_k in bwd. il y a 3 mois
flash_bwd_preprocess_kernel.h db80387343 Add seqused_q in fwd / bwd and seqused_k in bwd. il y a 3 mois
flash_fwd_hdim128_bf16_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_bf16_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_bf16_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_bf16_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_bf16_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward il y a 5 mois
flash_fwd_hdim128_e4m3_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_e4m3_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_e4m3_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_e4m3_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_e4m3_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) il y a 4 mois
flash_fwd_hdim128_fp16_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_fp16_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_fp16_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_fp16_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_fp16_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release il y a 5 mois
flash_fwd_hdim256_bf16_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_bf16_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_bf16_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_bf16_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_bf16_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward il y a 5 mois
flash_fwd_hdim256_e4m3_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_e4m3_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_e4m3_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_e4m3_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_e4m3_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) il y a 4 mois
flash_fwd_hdim256_fp16_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_fp16_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_fp16_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_fp16_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_fp16_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release il y a 5 mois
flash_fwd_hdim64_bf16_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_bf16_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_bf16_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_bf16_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_bf16_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward il y a 5 mois
flash_fwd_hdim64_e4m3_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_e4m3_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_e4m3_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_e4m3_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_e4m3_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) il y a 4 mois
flash_fwd_hdim64_fp16_gqa16_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_fp16_gqa2_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_fp16_gqa32_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_fp16_gqa4_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_fp16_gqa8_sm90.cu bb230b8c54 separate gqa compilation il y a 2 mois
flash_fwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release il y a 5 mois
flash_fwd_kernel.h b3d60fa3a5 prune more dead code il y a 2 mois
flash_fwd_launch_template.h b3d60fa3a5 prune more dead code il y a 2 mois
kernel_traits.h 7c1473e0e5 remove Is_batch_dynamic from seqlen traits and handle fp8 perf regression using smem boolean il y a 2 mois
mainloop_bwd_sm90_tma_gmma_ws.hpp 1c9717d699 address comments il y a 2 mois
mainloop_fwd_sm90_tma_gmma_ws.hpp b3d60fa3a5 prune more dead code il y a 2 mois
named_barrier.hpp bafe253042 [FA3] Bwd il y a 4 mois
seq_len.h c06cc0ba9f change cu_seqlens_k to seqused_k for kv cache api il y a 2 mois
setup.py 03200a753f removed old gqa cu files and unified methods il y a 2 mois
softmax.h c92ca63268 FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173) il y a 3 mois
static_switch.h bb230b8c54 separate gqa compilation il y a 2 mois
test_attn_kvcache.py a7cce59d25 adjust tolerances in test script for kv cache il y a 2 mois
test_flash_attn.py bb230b8c54 separate gqa compilation il y a 2 mois
test_kvcache.py 9c97808349 changes for correct lse write out for splits=1 and splits > 1 case. il y a 2 mois
tile_scheduler.hpp 0a1a0c22b6 refactor for split kv il y a 2 mois
tile_scheduler_bwd.hpp bafe253042 [FA3] Bwd il y a 4 mois
utils.h eb9c0ee22a add rmem -> gmem for fp8 il y a 2 mois