.. |
alibi.h
|
9486635c92
Fix typos of comments about shape. (#837)
|
6 months ago |
block_info.h
|
40e534a7f6
Implement cache_leftpad
|
6 months ago |
dropout.h
|
66a127aef8
Refactor masking in fwd pass into 1 object
|
1 year ago |
flash.h
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim128_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim128_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim128_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim128_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim160_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim160_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim160_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim160_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim192_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim192_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim192_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim192_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim224_bf16_sm80.cu
|
ea8a25ca38
Remove configure in bwd kernel launch
|
1 year ago |
flash_bwd_hdim224_fp16_sm80.cu
|
ea8a25ca38
Remove configure in bwd kernel launch
|
1 year ago |
flash_bwd_hdim256_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim256_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim256_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim256_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim32_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim32_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim32_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim32_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim64_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim64_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim64_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim64_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim96_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim96_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim96_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_hdim96_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_kernel.h
|
5ca83a9c71
Clean up softcapping bwd a bit
|
6 months ago |
flash_bwd_launch_template.h
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_bwd_preprocess_kernel.h
|
f816dee63c
Support unpadded LSE layout (#970)
|
6 months ago |
flash_fwd_hdim128_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim128_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim128_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim128_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim160_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim160_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim160_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim160_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim192_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim192_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim192_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim192_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim224_bf16_causal_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim224_bf16_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim224_fp16_causal_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim224_fp16_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim256_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim256_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim256_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim256_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim32_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim32_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim32_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim32_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim64_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim64_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim64_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim64_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim96_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim96_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim96_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_hdim96_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_kernel.h
|
5f1ae4a34b
backwards for softcapping (#1033)
|
6 months ago |
flash_fwd_launch_template.h
|
751c762c9c
Don't specialize for hdim 224 to speed up compilation
|
6 months ago |
flash_fwd_split_hdim128_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim128_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim128_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim128_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim160_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim160_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim160_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim160_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim192_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim192_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim192_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim192_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim224_bf16_causal_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim224_bf16_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim224_fp16_causal_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim224_fp16_sm80.cu
|
908511b2b6
Split into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim256_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim256_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim256_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim256_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim32_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim32_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim32_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim32_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim64_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim64_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim64_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim64_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim96_bf16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim96_bf16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim96_fp16_causal_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
flash_fwd_split_hdim96_fp16_sm80.cu
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
generate_kernels.py
|
65f723bb9a
Split bwd into more .cu files to speed up compilation
|
6 months ago |
kernel_traits.h
|
d732be1e67
Update to Cutlass 3.5
|
7 months ago |
mask.h
|
9486635c92
Fix typos of comments about shape. (#837)
|
6 months ago |
philox.cuh
|
ed4959b2eb
Change inline to __forceinline__, use __grid_constant__ param
|
1 year ago |
rotary.h
|
d732be1e67
Update to Cutlass 3.5
|
7 months ago |
softmax.h
|
23e8fa5a26
Add the option for the macro and note (#893)
|
9 months ago |
static_switch.h
|
751c762c9c
Don't specialize for hdim 224 to speed up compilation
|
6 months ago |
utils.h
|
5f1ae4a34b
backwards for softcapping (#1033)
|
6 months ago |