1
0

mha_fwd.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. /******************************************************************************
  2. * Copyright (c) 2024, Tri Dao.
  3. ******************************************************************************/
  4. #include "flash_common.hpp"
  5. #include "fmha_fwd.hpp"
  6. #include "mask.hpp"
  7. fmha_fwd_traits get_ck_fmha_fwd_traits(const mask_info &mask,
  8. std::string dtype,
  9. int head_size,
  10. bool has_dropout,
  11. bool has_lse,
  12. bool enable_alibi)
  13. {
  14. return fmha_fwd_traits{head_size,
  15. head_size,
  16. dtype,
  17. false, // is_group_mode
  18. true, // is_v_rowmajor
  19. mask.type,
  20. enable_alibi ? bias_enum::alibi : bias_enum::no_bias,
  21. has_lse,
  22. has_dropout,
  23. false}; // do_fp8_static_quant
  24. }
  25. fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse,
  26. bool has_dropout_randval,
  27. const mask_info &mask,
  28. // sizes
  29. const int b,
  30. const int seqlen_q,
  31. const int seqlen_k,
  32. const int h,
  33. const int h_k,
  34. const int d,
  35. // device pointers
  36. const at::Tensor q,
  37. const at::Tensor k,
  38. const at::Tensor v,
  39. c10::optional<at::Tensor> &alibi_slopes_,
  40. at::Tensor out,
  41. at::Tensor softmax_lse,
  42. at::Tensor dropout_randval,
  43. float softmax_scale,
  44. float p_dropout,
  45. uint64_t drop_seed,
  46. uint64_t drop_offset)
  47. {
  48. // q: (batch_size, seqlen_q, nheads, d)
  49. // k: (batch_size, seqlen_k, nheads_k, d)
  50. // v: (batch_size, seqlen_k, nheads_k, d)
  51. // o: (batch_size, seqlen_q, nheads, d)
  52. // alibi_slopes:(batch_size, nheads) or (nhead)
  53. // lse: (batch_size, nheads, seqlen_q)
  54. // randval: (batch_size, nheads, seqlen_q, seqlen_k)
  55. ck_tile::index_t stride_q = q.stride(1);
  56. ck_tile::index_t stride_k = k.stride(1);
  57. ck_tile::index_t stride_v = v.stride(1);
  58. ck_tile::index_t stride_o = out.stride(1);
  59. ck_tile::index_t stride_randval = has_dropout_randval ? dropout_randval.stride(2) : 0;
  60. ck_tile::index_t nhead_stride_q = q.stride(2);
  61. ck_tile::index_t nhead_stride_k = k.stride(2);
  62. ck_tile::index_t nhead_stride_v = v.stride(2);
  63. ck_tile::index_t nhead_stride_o = out.stride(2);
  64. ck_tile::index_t nhead_stride_lse = has_lse ? softmax_lse.stride(1) : 0;
  65. ck_tile::index_t nhead_stride_randval = has_dropout_randval ? dropout_randval.stride(1) : 0;
  66. ck_tile::index_t batch_stride_q = q.stride(0);
  67. ck_tile::index_t batch_stride_k = k.stride(0);
  68. ck_tile::index_t batch_stride_v = v.stride(0);
  69. ck_tile::index_t batch_stride_o = out.stride(0);
  70. ck_tile::index_t batch_stride_lse = has_lse ? softmax_lse.stride(0) : 0;
  71. ck_tile::index_t batch_stride_randval = has_dropout_randval ? dropout_randval.stride(0) : 0;
  72. void *alibi_slopes_ptr = nullptr;
  73. ck_tile::index_t stride_alibi_slopes = 0;
  74. if (alibi_slopes_.has_value()) {
  75. auto alibi_slopes = alibi_slopes_.value();
  76. CHECK_DEVICE(alibi_slopes);
  77. TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
  78. TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h}));
  79. alibi_slopes_ptr = alibi_slopes.data_ptr();
  80. stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
  81. }
  82. return fmha_fwd_args{q.data_ptr(),
  83. k.data_ptr(),
  84. v.data_ptr(),
  85. alibi_slopes_ptr, // bias
  86. has_dropout_randval ? dropout_randval.data_ptr() : nullptr,
  87. has_lse ? softmax_lse.data_ptr() : nullptr,
  88. out.data_ptr(),
  89. nullptr, // seqstart_q
  90. nullptr, // seqstart_k
  91. nullptr,
  92. seqlen_q,
  93. seqlen_k,
  94. b,
  95. seqlen_q, // max_seqlen_q
  96. d, // hdim_q
  97. d, // hdim_v
  98. h, // nhead
  99. h_k, // nhead_k
  100. softmax_scale, // scale_s
  101. 1, // scale_p
  102. 1, // scale_o
  103. stride_q,
  104. stride_k,
  105. stride_v,
  106. stride_alibi_slopes,
  107. stride_randval,
  108. stride_o,
  109. nhead_stride_q,
  110. nhead_stride_k,
  111. nhead_stride_v,
  112. 0, // nhead_stride_bias, FA without bias
  113. nhead_stride_randval,
  114. nhead_stride_lse,
  115. nhead_stride_o,
  116. batch_stride_q,
  117. batch_stride_k,
  118. batch_stride_v,
  119. 0, // batch_stride_bias, FA without bias
  120. batch_stride_randval,
  121. batch_stride_lse,
  122. batch_stride_o,
  123. mask.left,
  124. mask.right,
  125. static_cast<ck_tile::index_t>(mask.type),
  126. p_dropout,
  127. has_dropout_randval,
  128. {drop_seed, drop_offset}};
  129. }
  130. std::vector<at::Tensor>
  131. mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
  132. const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
  133. const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
  134. c10::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
  135. c10::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
  136. const float p_dropout,
  137. const float softmax_scale,
  138. bool is_causal,
  139. int window_size_left,
  140. int window_size_right,
  141. const float /*softcap*/,
  142. const bool return_dropout_randval,
  143. c10::optional<at::Generator> gen_)
  144. {
  145. auto q_dtype = q.dtype();
  146. TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
  147. "FlashAttention only support fp16 and bf16 data type");
  148. TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
  149. TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
  150. std::string q_dtype_str = q_dtype == torch::kFloat16 ? "fp16" : "bf16";
  151. CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
  152. TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
  153. TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
  154. TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
  155. const auto sizes = q.sizes();
  156. const int batch_size = sizes[0];
  157. int seqlen_q = sizes[1];
  158. int num_heads = sizes[2];
  159. const int head_size = sizes[3];
  160. const int seqlen_k = k.size(1);
  161. const int num_heads_k = k.size(2);
  162. TORCH_CHECK(batch_size > 0, "batch size must be positive");
  163. TORCH_CHECK(head_size <= 256, "CK only supports head dimension at most 256");
  164. TORCH_CHECK(head_size % 8 == 0, "query, key, value, and out_ must have a head_size that is a multiple of 8");
  165. TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
  166. if (window_size_left >= seqlen_k) { window_size_left = -1; }
  167. if (window_size_right >= seqlen_k) { window_size_right = -1; }
  168. // causal=true is the same as causal=false in this case
  169. if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }
  170. mask_info mask;
  171. if (is_causal) {
  172. // Causal is the special case where window_size_right == 0 and window_size_left < 0.
  173. window_size_right = 0;
  174. std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0";
  175. mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // casual
  176. }
  177. else if (window_size_left == -1 && window_size_right == -1) {
  178. mask = mask_info::decode("0", seqlen_q, seqlen_k); // no mask
  179. }
  180. else {
  181. // Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
  182. std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right);
  183. mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // local
  184. }
  185. // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case
  186. // H/t Daniel Haziza
  187. const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size % 8 == 0 && !alibi_slopes_.has_value();
  188. const int ngroups = num_heads / num_heads_k;
  189. if (seqlenq_ngroups_swapped) {
  190. q = q.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2);
  191. seqlen_q = ngroups;
  192. num_heads = num_heads_k;
  193. }
  194. CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size);
  195. CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
  196. CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
  197. at::Tensor out;
  198. if (out_.has_value()) {
  199. out = out_.value();
  200. TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
  201. CHECK_DEVICE(out);
  202. TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
  203. CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], head_size);
  204. if (seqlenq_ngroups_swapped) {
  205. out = out.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2);
  206. }
  207. }
  208. else {
  209. out = torch::empty_like(q);
  210. }
  211. // Otherwise the kernel will be launched from cuda:0 device
  212. // Cast to char to avoid compiler warning about narrowing
  213. at::cuda::CUDAGuard device_guard{(char)q.get_device()};
  214. auto opts = q.options();
  215. bool has_lse = true;
  216. bool has_dropout = p_dropout > 0.0f;
  217. at::Tensor softmax_lse;
  218. // TODO - check gradient, only training require lse
  219. softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(torch::kFloat32));
  220. at::Tensor p;
  221. if (return_dropout_randval) {
  222. TORCH_CHECK(has_dropout, "return_dropout_randval require p_dropout > 0");
  223. p = torch::empty({batch_size, num_heads, seqlen_q, seqlen_k}, opts.dtype(torch::kUInt8));
  224. }
  225. else {
  226. p = torch::empty({ 0 }, opts);
  227. }
  228. uint64_t drop_seed = 1, drop_offset = 0;
  229. int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size();
  230. auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
  231. auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
  232. if (p_dropout > 0.0) {
  233. auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
  234. gen_, at::cuda::detail::getDefaultCUDAGenerator());
  235. // See Note [Acquire lock when using random generators]
  236. std::lock_guard<std::mutex> lock(gen->mutex_);
  237. auto philox_args = gen->philox_cuda_state(counter_offset);
  238. std::tie(drop_seed, drop_offset) = flash::unpack(philox_args);
  239. }
  240. rng_state[0] = *(reinterpret_cast<int64_t*>(&drop_seed));
  241. rng_state[1] = *(reinterpret_cast<int64_t*>(&drop_offset));
  242. if (seqlen_k > 0) {
  243. auto stream = at::cuda::getCurrentHIPStream().stream();
  244. ck_tile::stream_config stream_config{stream};
  245. auto traits =
  246. get_ck_fmha_fwd_traits(
  247. mask,
  248. q_dtype_str,
  249. head_size,
  250. has_dropout,
  251. has_lse,
  252. alibi_slopes_.has_value());
  253. auto args =
  254. get_ck_fmha_fwd_args(
  255. has_lse,
  256. return_dropout_randval,
  257. mask,
  258. batch_size,
  259. seqlen_q,
  260. seqlen_k,
  261. num_heads,
  262. num_heads_k,
  263. head_size,
  264. q,
  265. k,
  266. v,
  267. alibi_slopes_,
  268. out,
  269. softmax_lse,
  270. p,
  271. softmax_scale,
  272. p_dropout,
  273. drop_seed,
  274. drop_offset);
  275. float t = fmha_fwd(traits, args, stream_config);
  276. TORCH_CHECK(t >= 0, "invalid argument for fmha_fwd");
  277. }
  278. else {
  279. // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
  280. out.zero_();
  281. softmax_lse.fill_(std::numeric_limits<float>::infinity());
  282. }
  283. if (seqlenq_ngroups_swapped) {
  284. out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
  285. q = q.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
  286. softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
  287. }
  288. return {out, softmax_lse, p, rng_state};
  289. }