1
0
sclarkson 1feb711f46 Fix compilation with clang on ARM64 (#1285) 1 долоо хоног өмнө
..
__init__.py 7f67966cc7 FA3 initial code release 5 сар өмнө
benchmark_attn.py 3669b25206 bwd benchmark + small fixes (#1129) 4 сар өмнө
benchmark_flash_attention_fp8.py a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
benchmark_split_kv.py a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
combine.h 478ee666cc Make namespace comment consistent (#1305) 1 сар өмнө
copy_paged_sm90_tma.hpp 284e2c6e5b Make FA3 paged attention ready for upgrade to Cutlass 3.6 (#1331) 1 сар өмнө
copy_paged_sm90_tma_cutlass35.hpp 0823cf7b5d Fix FA3 Varlen Performance regression (#1361) 1 долоо хоног өмнө
copy_paged_sm90_tma_cutlass36.hpp 0823cf7b5d Fix FA3 Varlen Performance regression (#1361) 1 долоо хоног өмнө
epilogue_bwd_sm90_tma.hpp db80387343 Add seqused_q in fwd / bwd and seqused_k in bwd. 2 сар өмнө
epilogue_fwd_sm90_tma.hpp a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash.h b443207c1f Paged Attention support for FA3 (#1268) 1 сар өмнө
flash_api.cpp 1feb711f46 Fix compilation with clang on ARM64 (#1285) 1 долоо хоног өмнө
flash_attn_interface.py 284e2c6e5b Make FA3 paged attention ready for upgrade to Cutlass 3.6 (#1331) 1 сар өмнө
flash_bwd_hdim128_bf16_sm90.cu bafe253042 [FA3] Bwd 4 сар өмнө
flash_bwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release 5 сар өмнө
flash_bwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release 5 сар өмнө
flash_bwd_hdim64_bf16_sm90.cu bafe253042 [FA3] Bwd 4 сар өмнө
flash_bwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release 5 сар өмнө
flash_bwd_hdim96_bf16_sm90.cu bafe253042 [FA3] Bwd 4 сар өмнө
flash_bwd_hdim96_fp16_sm90.cu bafe253042 [FA3] Bwd 4 сар өмнө
flash_bwd_kernel.h 1c9717d699 address comments 2 сар өмнө
flash_bwd_launch_template.h 1c9717d699 address comments 2 сар өмнө
flash_bwd_postprocess_kernel.h db80387343 Add seqused_q in fwd / bwd and seqused_k in bwd. 2 сар өмнө
flash_bwd_preprocess_kernel.h db80387343 Add seqused_q in fwd / bwd and seqused_k in bwd. 2 сар өмнө
flash_fwd_hdim128_bf16_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_bf16_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_bf16_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_bf16_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_bf16_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward 5 сар өмнө
flash_fwd_hdim128_e4m3_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_e4m3_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_e4m3_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_e4m3_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_e4m3_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 4 сар өмнө
flash_fwd_hdim128_fp16_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_fp16_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_fp16_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_fp16_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_fp16_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim128_fp16_sm90.cu 7f67966cc7 FA3 initial code release 5 сар өмнө
flash_fwd_hdim256_bf16_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_bf16_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_bf16_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_bf16_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_bf16_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward 5 сар өмнө
flash_fwd_hdim256_e4m3_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_e4m3_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_e4m3_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_e4m3_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_e4m3_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 4 сар өмнө
flash_fwd_hdim256_fp16_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_fp16_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_fp16_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_fp16_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_fp16_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim256_fp16_sm90.cu 7f67966cc7 FA3 initial code release 5 сар өмнө
flash_fwd_hdim64_bf16_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_bf16_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_bf16_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_bf16_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_bf16_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_bf16_sm90.cu 74b0761ff7 [FA3] BF16 forward 5 сар өмнө
flash_fwd_hdim64_e4m3_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_e4m3_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_e4m3_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_e4m3_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_e4m3_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_e4m3_sm90.cu 5018ac6ac5 Fp8 kernel with "in-kernel" transpose of V in producer (#1100) 4 сар өмнө
flash_fwd_hdim64_fp16_gqa16_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_fp16_gqa2_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_fp16_gqa32_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_fp16_gqa4_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_fp16_gqa8_sm90.cu a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_hdim64_fp16_sm90.cu 7f67966cc7 FA3 initial code release 5 сар өмнө
flash_fwd_kernel.h a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
flash_fwd_launch_template.h b443207c1f Paged Attention support for FA3 (#1268) 1 сар өмнө
kernel_traits.h a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
mainloop_bwd_sm90_tma_gmma_ws.hpp 1c9717d699 address comments 2 сар өмнө
mainloop_fwd_sm90_tma_gmma_ws.hpp 0823cf7b5d Fix FA3 Varlen Performance regression (#1361) 1 долоо хоног өмнө
named_barrier.hpp 478ee666cc Make namespace comment consistent (#1305) 1 сар өмнө
seq_len.h b443207c1f Paged Attention support for FA3 (#1268) 1 сар өмнө
setup.py a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
softmax.h 478ee666cc Make namespace comment consistent (#1305) 1 сар өмнө
static_switch.h b443207c1f Paged Attention support for FA3 (#1268) 1 сар өмнө
test_attn_kvcache.py a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
test_flash_attn.py b443207c1f Paged Attention support for FA3 (#1268) 1 сар өмнө
test_kvcache.py a5a75274bc FA3 kvcache + split kv + gqa parallelization (#1236) 2 сар өмнө
tile_scheduler.hpp 478ee666cc Make namespace comment consistent (#1305) 1 сар өмнө
tile_scheduler_bwd.hpp 478ee666cc Make namespace comment consistent (#1305) 1 сар өмнө
utils.h 478ee666cc Make namespace comment consistent (#1305) 1 сар өмнө