.. |
__init__.py
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
benchmark_attn.py
|
3669b25206
bwd benchmark + small fixes (#1129)
|
4 mēneši atpakaļ |
benchmark_flash_attention_fp8.py
|
5e3864f2ee
change default output type of fp8 kernel to bf16
|
2 mēneši atpakaļ |
benchmark_split_kv.py
|
fff4b5c09b
add split kv benchmark script
|
2 mēneši atpakaļ |
combine.h
|
aa45d75f64
dont write out zero for split kernel, only lse=-inf
|
2 mēneši atpakaļ |
epilogue_bwd_sm90_tma.hpp
|
db80387343
Add seqused_q in fwd / bwd and seqused_k in bwd.
|
3 mēneši atpakaļ |
epilogue_fwd_sm90_tma.hpp
|
b3d60fa3a5
prune more dead code
|
2 mēneši atpakaļ |
flash.h
|
c06cc0ba9f
change cu_seqlens_k to seqused_k for kv cache api
|
2 mēneši atpakaļ |
flash_api.cpp
|
9b6cba16c1
remove some debug code
|
2 mēneši atpakaļ |
flash_attn_interface.py
|
50cb90aea6
comment out unimplemented kwargs from flash_attn_with_kvcache
|
2 mēneši atpakaļ |
flash_bwd_hdim128_bf16_sm90.cu
|
bafe253042
[FA3] Bwd
|
4 mēneši atpakaļ |
flash_bwd_hdim128_fp16_sm90.cu
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
flash_bwd_hdim256_fp16_sm90.cu
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
flash_bwd_hdim64_bf16_sm90.cu
|
bafe253042
[FA3] Bwd
|
4 mēneši atpakaļ |
flash_bwd_hdim64_fp16_sm90.cu
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
flash_bwd_hdim96_bf16_sm90.cu
|
bafe253042
[FA3] Bwd
|
4 mēneši atpakaļ |
flash_bwd_hdim96_fp16_sm90.cu
|
bafe253042
[FA3] Bwd
|
4 mēneši atpakaļ |
flash_bwd_kernel.h
|
1c9717d699
address comments
|
2 mēneši atpakaļ |
flash_bwd_launch_template.h
|
1c9717d699
address comments
|
2 mēneši atpakaļ |
flash_bwd_postprocess_kernel.h
|
db80387343
Add seqused_q in fwd / bwd and seqused_k in bwd.
|
3 mēneši atpakaļ |
flash_bwd_preprocess_kernel.h
|
db80387343
Add seqused_q in fwd / bwd and seqused_k in bwd.
|
3 mēneši atpakaļ |
flash_fwd_hdim128_bf16_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_bf16_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_bf16_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_bf16_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_bf16_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_bf16_sm90.cu
|
74b0761ff7
[FA3] BF16 forward
|
5 mēneši atpakaļ |
flash_fwd_hdim128_e4m3_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_e4m3_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_e4m3_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_e4m3_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_e4m3_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_e4m3_sm90.cu
|
5018ac6ac5
Fp8 kernel with "in-kernel" transpose of V in producer (#1100)
|
4 mēneši atpakaļ |
flash_fwd_hdim128_fp16_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_fp16_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_fp16_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_fp16_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_fp16_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim128_fp16_sm90.cu
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
flash_fwd_hdim256_bf16_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_bf16_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_bf16_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_bf16_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_bf16_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_bf16_sm90.cu
|
74b0761ff7
[FA3] BF16 forward
|
5 mēneši atpakaļ |
flash_fwd_hdim256_e4m3_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_e4m3_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_e4m3_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_e4m3_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_e4m3_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_e4m3_sm90.cu
|
5018ac6ac5
Fp8 kernel with "in-kernel" transpose of V in producer (#1100)
|
4 mēneši atpakaļ |
flash_fwd_hdim256_fp16_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_fp16_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_fp16_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_fp16_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_fp16_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim256_fp16_sm90.cu
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
flash_fwd_hdim64_bf16_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_bf16_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_bf16_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_bf16_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_bf16_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_bf16_sm90.cu
|
74b0761ff7
[FA3] BF16 forward
|
5 mēneši atpakaļ |
flash_fwd_hdim64_e4m3_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_e4m3_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_e4m3_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_e4m3_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_e4m3_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_e4m3_sm90.cu
|
5018ac6ac5
Fp8 kernel with "in-kernel" transpose of V in producer (#1100)
|
4 mēneši atpakaļ |
flash_fwd_hdim64_fp16_gqa16_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_fp16_gqa2_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_fp16_gqa32_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_fp16_gqa4_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_fp16_gqa8_sm90.cu
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
flash_fwd_hdim64_fp16_sm90.cu
|
7f67966cc7
FA3 initial code release
|
5 mēneši atpakaļ |
flash_fwd_kernel.h
|
b3d60fa3a5
prune more dead code
|
2 mēneši atpakaļ |
flash_fwd_launch_template.h
|
b3d60fa3a5
prune more dead code
|
2 mēneši atpakaļ |
kernel_traits.h
|
7c1473e0e5
remove Is_batch_dynamic from seqlen traits and handle fp8 perf regression using smem boolean
|
2 mēneši atpakaļ |
mainloop_bwd_sm90_tma_gmma_ws.hpp
|
1c9717d699
address comments
|
2 mēneši atpakaļ |
mainloop_fwd_sm90_tma_gmma_ws.hpp
|
b3d60fa3a5
prune more dead code
|
2 mēneši atpakaļ |
named_barrier.hpp
|
bafe253042
[FA3] Bwd
|
4 mēneši atpakaļ |
seq_len.h
|
c06cc0ba9f
change cu_seqlens_k to seqused_k for kv cache api
|
2 mēneši atpakaļ |
setup.py
|
03200a753f
removed old gqa cu files and unified methods
|
2 mēneši atpakaļ |
softmax.h
|
c92ca63268
FA3 FP8 qkv descales + restore max offset for h128 causal + added sync for producer WG (#1173)
|
3 mēneši atpakaļ |
static_switch.h
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
test_attn_kvcache.py
|
a7cce59d25
adjust tolerances in test script for kv cache
|
2 mēneši atpakaļ |
test_flash_attn.py
|
bb230b8c54
separate gqa compilation
|
2 mēneši atpakaļ |
test_kvcache.py
|
9c97808349
changes for correct lse write out for splits=1 and splits > 1 case.
|
2 mēneši atpakaļ |
tile_scheduler.hpp
|
0a1a0c22b6
refactor for split kv
|
2 mēneši atpakaļ |
tile_scheduler_bwd.hpp
|
bafe253042
[FA3] Bwd
|
4 mēneši atpakaļ |
utils.h
|
eb9c0ee22a
add rmem -> gmem for fp8
|
2 mēneši atpakaļ |