generate_kernels.py

# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602
# This file is run to generate the kernel instantiations for the flash_attn kernels
# They are written to several files in order to speed up compilation

import argparse
import itertools
from collections import namedtuple
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

KERNEL_BATCH = namedtuple("Kernel", ["template", "filename"])

DTYPE_MAP = {
    "fp16": "cutlass::half_t",
    "bf16": "cutlass::bfloat16_t",
    "e4m3": "cutlass::float_e4m3_t",
}
DTYPE_MAP_BWD = {
    "fp16": "cutlass::half_t",
    "bf16": "cutlass::bfloat16_t",
}

SM = [90]  # SM architectures to generate kernels for (currently only SM90 / Hopper)
HEAD_DIMENSIONS = [64, 96, 128, 192, 256]
PAGEDKV = [False, True]
SPLIT = [False, True]
SOFTCAP = [False, True]
PACKGQA = [False, True]
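
# Note: every combination of the lists above becomes its own .cu file. Each generated file is
# additionally wrapped in a FLASHATTENTION_DISABLE_HDIM{N} guard (see the templates below), so a
# build can still compile out individual head dimensions without regenerating anything.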

KERNEL_IMPL_TEMPLATE_FWD = """#include "flash_fwd_launch_template.h"
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<{DTYPE}, {HEAD_DIM}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
"""

KERNEL_IMPL_TEMPLATE_BWD = """#include "flash_bwd_launch_template.h"
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<{DTYPE}, {HEAD_DIM}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<{DTYPE}>(params, stream);
}}
#endif
"""

@dataclass
class Kernel:
    sm: int
    dtype: str
    head_dim: int
    split: bool
    paged_kv: bool
    softcap: bool
    packgqa: bool
    direction: str

    @property
    def template(self) -> str:
        if self.direction == "fwd":
            return KERNEL_IMPL_TEMPLATE_FWD.format(
                DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim,
                SPLIT=str(self.split).lower(), PAGEDKV=str(self.paged_kv).lower(),
                SOFTCAP=str(self.softcap).lower(), PACKGQA=str(self.packgqa).lower()
            )
        elif self.direction == "bwd":
            return KERNEL_IMPL_TEMPLATE_BWD.format(
                DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim
            )

    @property
    def filename(self) -> str:
        return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}{'_paged' if self.paged_kv else ''}{'_split' if self.split else ''}{'_softcap' if self.softcap else ''}{'_packgqa' if self.packgqa else ''}_sm{self.sm}.cu"

def get_all_kernels() -> List[Kernel]:
    for dtype, head_dim, split, paged_kv, softcap, packgqa, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, SPLIT, PAGEDKV, SOFTCAP, PACKGQA, SM):
        yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, split=split, paged_kv=paged_kv, softcap=softcap, packgqa=packgqa, direction="fwd")
    for dtype, head_dim, sm in itertools.product(DTYPE_MAP_BWD.keys(), HEAD_DIMENSIONS, SM):
        yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, split=False, paged_kv=False, softcap=False, packgqa=False, direction="bwd")
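
# With the defaults above this yields 3 dtypes * 5 head dims * 2^4 feature flags = 240 forward
# kernels plus 2 * 5 = 10 backward kernels, i.e. 250 generated .cu files before the "hdimall"
# batch files produced below.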

def batch_hdim(kernels_all) -> List[KERNEL_BATCH]:
    for dtype, split, paged_kv, softcap, packgqa, sm in itertools.product(DTYPE_MAP.keys(), SPLIT, PAGEDKV, SOFTCAP, PACKGQA, SM):
        kernels = [k for k in kernels_all if k.direction == "fwd" and k.dtype == dtype and k.split == split and k.paged_kv == paged_kv and k.softcap == softcap and k.packgqa == packgqa]
        assert len(kernels) > 0
        filename = f"flash_fwd_hdimall_{dtype}{'_paged' if paged_kv else ''}{'_split' if split else ''}{'_softcap' if softcap else ''}{'_packgqa' if packgqa else ''}_sm{sm}.cu"
        template = "\n".join([f"#include \"{k.filename}\"" for k in kernels])
        yield KERNEL_BATCH(template, filename)
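
# For example (illustrative), the batch file "flash_fwd_hdimall_bf16_split_sm90.cu" just contains one
# '#include "flash_fwd_hdim{64,96,128,192,256}_bf16_split_sm90.cu"' line per head dimension, presumably
# letting a build compile one translation unit per feature combination instead of one per head dim.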

def write_kernel(kernel: Kernel, autogen_dir: Path) -> None:
    prelude = """// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
// Splitting the different template instantiations to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"\n
"""
    (autogen_dir / kernel.filename).write_text(prelude + kernel.template)

def main(output_dir: Optional[str]) -> None:
    output_dir = Path(output_dir) if output_dir is not None else Path(__file__).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    kernels_all = list(get_all_kernels())
    for kernel in kernels_all:
        write_kernel(kernel, output_dir)
    # KERNEL_BATCH entries expose the same .template/.filename attributes, so write_kernel handles them too
    for kernel in batch_hdim(kernels_all):
        write_kernel(kernel, output_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="generate_kernels",
        description="Generate the flash_attention kernels template instantiations",
    )
    # Set an optional output directory
    parser.add_argument(
        "-o",
        "--output_dir",
        default="instantiations",
        required=False,
        help="Where to generate the kernels; defaults to the 'instantiations' directory",
    )
    args = parser.parse_args()
    main(args.output_dir)
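
# Example usage (a sketch; output paths depend on where the script is invoked from):
#   python generate_kernels.py                  # writes into ./instantiations (the argparse default)
#   python generate_kernels.py -o some/out/dir  # or into an explicit directory
# Each run rewrites every per-kernel .cu file plus the flash_fwd_hdimall_*.cu batch files.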