.. |
instantiations
|
8668823635
Split hdimdiff into a separate translation unit
|
vor 3 Wochen |
__init__.py
|
7f67966cc7
FA3 initial code release
|
vor 7 Monaten |
benchmark_attn.py
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
benchmark_flash_attention_fp8.py
|
efbf19cd15
Fix incorrect torch dtype (#1399)
|
vor 1 Monat |
benchmark_mla_decode.py
|
08f4c802c4
Add FLOPS to MLA decode benchmark
|
vor 5 Tagen |
benchmark_split_kv.py
|
ea3ecea97a
Add tp_degree to benchmark_split_kv
|
vor 2 Wochen |
block.h
|
6752d62aa4
Add dynamic splits
|
vor 1 Woche |
copy_sm90_bulk_reduce.hpp
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
epilogue_bwd.hpp
|
6752d62aa4
Add dynamic splits
|
vor 1 Woche |
epilogue_fwd.hpp
|
6752d62aa4
Add dynamic splits
|
vor 1 Woche |
flash.h
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
flash_api.cpp
|
45c48afb2b
Add option for WG1 to use RS MMA but WG2 using SS MMA
|
vor 2 Tagen |
flash_attn_interface.py
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
flash_bwd_kernel_sm80.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
vor 3 Wochen |
flash_bwd_kernel_sm90.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
vor 3 Wochen |
flash_bwd_launch_template.h
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
flash_bwd_postprocess_kernel.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
flash_bwd_preprocess_kernel.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
flash_fwd_combine.cu
|
5378bc3204
Tile fwd_combine kernel along headdim, don't need kBlockM > 128
|
vor 3 Wochen |
flash_fwd_combine_kernel.h
|
6752d62aa4
Add dynamic splits
|
vor 1 Woche |
flash_fwd_combine_launch_template.h
|
6752d62aa4
Add dynamic splits
|
vor 1 Woche |
flash_fwd_kernel_sm80.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
vor 3 Wochen |
flash_fwd_kernel_sm90.h
|
15cf7ee435
Rename collective_mainloop -> mainloop, move tile_scheduler variable
|
vor 3 Wochen |
flash_fwd_launch_template.h
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
flash_prepare_scheduler.cu
|
20b84d6363
Don't use IntraWGOverlap for hdim 64,512
|
vor 4 Tagen |
generate_kernels.py
|
8668823635
Split hdimdiff into a separate translation unit
|
vor 3 Wochen |
heuristics.h
|
6752d62aa4
Add dynamic splits
|
vor 1 Woche |
mainloop_bwd_sm80.hpp
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
mainloop_bwd_sm90_tma_gmma_ws.hpp
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
mainloop_fwd_sm80.hpp
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
mainloop_fwd_sm90_tma_gmma_ws.hpp
|
4f0640d534
Move writing P to smem as separate function
|
vor 1 Tag |
mask.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
named_barrier.hpp
|
f0f25239bd
Implement the case of LargeHeadDimV
|
vor 1 Monat |
pack_gqa.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
padding.py
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
paged_kv.h
|
e3b2400a31
Fix loading paged V when kHeadDimV != kHeadDim
|
vor 1 Monat |
rotary.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
seqlen.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
setup.py
|
3edf7e0daa
Add kwargs to _write_ninja_file for compatibility with new torch
|
vor 1 Tag |
sm90_pipeline_no_cluster.hpp
|
68bf390920
Update Cutlass to fix mem fence
|
vor 1 Monat |
softmax.h
|
7a802796e1
Big refactor and update
|
vor 1 Monat |
static_switch.h
|
180ff782dd
Template for Sm86
|
vor 1 Monat |
test_attn_kvcache.py
|
a5a75274bc
FA3 kvcache + split kv + gqa parallelization (#1236)
|
vor 4 Monaten |
test_flash_attn.py
|
5458c78e6d
Remove sink token
|
vor 4 Tagen |
test_kvcache.py
|
a5a75274bc
FA3 kvcache + split kv + gqa parallelization (#1236)
|
vor 4 Monaten |
test_util.py
|
fa445ff6c2
Fix FP8 test
|
vor 3 Wochen |
tile_scheduler.hpp
|
20b84d6363
Don't use IntraWGOverlap for hdim 64,512
|
vor 4 Tagen |
tile_size.h
|
20b84d6363
Don't use IntraWGOverlap for hdim 64,512
|
vor 4 Tagen |
utils.h
|
45c48afb2b
Add option for WG1 to use RS MMA but WG2 using SS MMA
|
vor 2 Tagen |