.. |
instantiations
|
8668823635
Split hdimdiff into a separate translation unit
|
3 settimane fa |
__init__.py
|
7f67966cc7
FA3 initial code release
|
7 mesi fa |
benchmark_attn.py
|
5458c78e6d
Remove sink token
|
4 giorni fa |
benchmark_flash_attention_fp8.py
|
efbf19cd15
Fix incorrect torch dtype (#1399)
|
1 mese fa |
benchmark_mla_decode.py
|
08f4c802c4
Add FLOPS to MLA decode benchmark
|
5 giorni fa |
benchmark_split_kv.py
|
ea3ecea97a
Add tp_degree to benchmark_split_kv
|
2 settimane fa |
block.h
|
6752d62aa4
Add dynamic splits
|
1 settimana fa |
copy_sm90_bulk_reduce.hpp
|
7a802796e1
Big refactor and update
|
1 mese fa |
epilogue_bwd.hpp
|
6752d62aa4
Add dynamic splits
|
1 settimana fa |
epilogue_fwd.hpp
|
6752d62aa4
Add dynamic splits
|
1 settimana fa |
flash.h
|
5458c78e6d
Remove sink token
|
4 giorni fa |
flash_api.cpp
|
45c48afb2b
Add option for WG1 to use RS MMA but WG2 using SS MMA
|
2 giorni fa |
flash_attn_interface.py
|
5458c78e6d
Remove sink token
|
4 giorni fa |
flash_bwd_kernel_sm80.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
3 settimane fa |
flash_bwd_kernel_sm90.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
3 settimane fa |
flash_bwd_launch_template.h
|
5458c78e6d
Remove sink token
|
4 giorni fa |
flash_bwd_postprocess_kernel.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
flash_bwd_preprocess_kernel.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
flash_fwd_combine.cu
|
5378bc3204
Tile fwd_combine kernel along headdim, don't need kBlockM > 128
|
3 settimane fa |
flash_fwd_combine_kernel.h
|
6752d62aa4
Add dynamic splits
|
1 settimana fa |
flash_fwd_combine_launch_template.h
|
6752d62aa4
Add dynamic splits
|
1 settimana fa |
flash_fwd_kernel_sm80.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
3 settimane fa |
flash_fwd_kernel_sm90.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
3 settimane fa |
flash_fwd_launch_template.h
|
5458c78e6d
Remove sink token
|
4 giorni fa |
flash_prepare_scheduler.cu
|
20b84d6363
Don't use IntraWGOverlap for hdim 64,512
|
4 giorni fa |
generate_kernels.py
|
8668823635
Split hdimdiff into a separate translation unit
|
3 settimane fa |
heuristics.h
|
6752d62aa4
Add dynamic splits
|
1 settimana fa |
mainloop_bwd_sm80.hpp
|
5458c78e6d
Remove sink token
|
4 giorni fa |
mainloop_bwd_sm90_tma_gmma_ws.hpp
|
5458c78e6d
Remove sink token
|
4 giorni fa |
mainloop_fwd_sm80.hpp
|
5458c78e6d
Remove sink token
|
4 giorni fa |
mainloop_fwd_sm90_tma_gmma_ws.hpp
|
4f0640d534
Move writing P to smem as separate function
|
1 giorno fa |
mask.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
named_barrier.hpp
|
f0f25239bd
Implement the case of LargeHeadDimV
|
1 mese fa |
pack_gqa.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
padding.py
|
7a802796e1
Big refactor and update
|
1 mese fa |
paged_kv.h
|
e3b2400a31
Fix loading paged V when kHeadDimV != kHeadDim
|
1 mese fa |
rotary.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
seqlen.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
setup.py
|
3edf7e0daa
Add kwargs to _write_ninja_file for compatibility with new torch
|
1 giorno fa |
sm90_pipeline_no_cluster.hpp
|
68bf390920
Update Cutlass to fix mem fence
|
1 mese fa |
softmax.h
|
7a802796e1
Big refactor and update
|
1 mese fa |
static_switch.h
|
180ff782dd
Template for Sm86
|
1 mese fa |
test_attn_kvcache.py
|
a5a75274bc
FA3 kvcache + split kv + gqa parallelization (#1236)
|
4 mesi fa |
test_flash_attn.py
|
5458c78e6d
Remove sink token
|
4 giorni fa |
test_kvcache.py
|
a5a75274bc
FA3 kvcache + split kv + gqa parallelization (#1236)
|
4 mesi fa |
test_util.py
|
fa445ff6c2
Fix FP8 test
|
3 settimane fa |
tile_scheduler.hpp
|
20b84d6363
Don't use IntraWGOverlap for hdim 64,512
|
4 giorni fa |
tile_size.h
|
20b84d6363
Don't use IntraWGOverlap for hdim 64,512
|
4 giorni fa |
utils.h
|
45c48afb2b
Add option for WG1 to use RS MMA but WG2 using SS MMA
|
2 giorni fa |