tile_scheduler.hpp

/******************************************************************************
 * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
 ******************************************************************************/

#pragma once

#include "cutlass/fast_math.h"
#include "cutlass/arch/barrier.h"

#include "named_barrier.hpp"

namespace flash {
///////////////////////////////////////////////////////////////////////////////

// Non-persistent scheduler: the grid is sized so that each thread block owns
// exactly one (m_block, head, batch) tile. With Varlen, tiles that start past
// the end of a batch's actual sequence length are marked invalid.
template<bool Varlen=false, int kBlock=128>
class SingleTileScheduler {

public:

    using SharedStorage = int;

    // Host side kernel arguments
    struct Arguments {
        int const num_blocks, num_head, num_batch;
        int* const tile_count_semaphore = nullptr;
        int* const cu_seqlens = nullptr;
        int* const seqused = nullptr;
    };

    // Device side kernel params
    struct Params {
        int const num_blocks, num_head, num_batch;
        int* const cu_seqlens;
        int* const seqused;
    };

    static Params
    to_underlying_arguments(Arguments const& args) {
        return {args.num_blocks, args.num_head, args.num_batch,
                !Varlen ? nullptr : args.cu_seqlens, !Varlen ? nullptr : args.seqused};
    }

    static dim3
    get_grid_shape(Params const& params, int num_sm) {
        return {uint32_t(params.num_blocks), uint32_t(params.num_head), uint32_t(params.num_batch)};
    }

    struct WorkTileInfo {
        int block_idx = 0;
        int bidh = 0;
        int bidb = 0;
        bool is_valid_tile = false;

        CUTLASS_DEVICE
        bool
        is_valid(Params const& params) const {
            return is_valid_tile;
        }

        CUTLASS_DEVICE
        cute::tuple<int32_t, int32_t, int32_t>
        get_block_coord(Params const& params) const {
            return {block_idx, bidh, bidb};
        }
    };

    CUTLASS_DEVICE
    SingleTileScheduler(SharedStorage* const smem_scheduler) { }

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_initial_work(Params const& params) const {
        WorkTileInfo work_info {int(blockIdx.x), int(blockIdx.y), int(blockIdx.z), true};
        if constexpr (Varlen) {
            // A tile is valid only if its starting row lies within this batch's sequence.
            work_info.is_valid_tile = work_info.block_idx * kBlock < (params.seqused
                ? params.seqused[work_info.bidb]
                : params.cu_seqlens[work_info.bidb + 1] - params.cu_seqlens[work_info.bidb]);
        }
        return work_info;
    }

    CUTLASS_DEVICE
    void
    init_consumer() const {}

    CUTLASS_DEVICE
    void
    prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {}

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_next_work(Params const& params, WorkTileInfo const& current_work) const {
        // Each block handles exactly one tile, so there is never a next tile.
        return {-1, -1, -1, false};
    }

};
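// Usage sketch (illustrative only, not part of this header): how a kernel
// might drive any of the schedulers in this file through their common work
// loop. `Scheduler` and `process_tile` are hypothetical placeholders.
//
//   __shared__ typename Scheduler::SharedStorage smem_scheduler;
//   Scheduler scheduler(&smem_scheduler);
//   for (auto work = scheduler.template get_initial_work</*IsProducerWarp=*/false>(params);
//        work.is_valid(params);
//        work = scheduler.template get_next_work</*IsProducerWarp=*/false>(params, work)) {
//       auto [m_block, bidh, bidb] = work.get_block_coord(params);
//       process_tile(m_block, bidh, bidb);  // hypothetical per-tile work
//   }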
///////////////////////////////////////////////////////////////////////////////

// Persistent scheduler: one thread block is launched per SM, and each block
// walks the flattened (m_block, head, batch) tile space in strides of
// gridDim.x. Tile coordinates are recovered with two fast divmods.
class StaticPersistentTileScheduler {

public:

    using SharedStorage = int;

    // Host side kernel arguments
    struct Arguments {
        int const num_blocks, num_head, num_batch;
        int* const tile_count_semaphore = nullptr;
        int* const cu_seqlens = nullptr;
        int* const seqused = nullptr;
    };

    // Device side kernel params
    struct Params {
        int total_blocks;
        cutlass::FastDivmod m_block_divmod, head_divmod;
    };

    static Params
    to_underlying_arguments(Arguments const& args) {
        return {args.num_blocks * args.num_head * args.num_batch,
                cutlass::FastDivmod(args.num_blocks), cutlass::FastDivmod(args.num_head)};
    }

    static dim3
    get_grid_shape(Params const& params, int num_sm) {
        return {uint32_t(num_sm)};
    }

    struct WorkTileInfo {
        int tile_idx;

        CUTLASS_DEVICE
        bool
        is_valid(Params const& params) const {
            return tile_idx < params.total_blocks;
        }

        CUTLASS_DEVICE
        cute::tuple<int32_t, int32_t, int32_t>
        get_block_coord(Params const& params) const {
            int block, bidh, bidb;
            bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(block, tile_idx));
            return {block, bidh, bidb};
        }
    };

    CUTLASS_DEVICE
    StaticPersistentTileScheduler(SharedStorage* const smem_scheduler) {}

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_initial_work(Params const& params) const {
        return {int(blockIdx.x)};
    }

    CUTLASS_DEVICE
    void
    init_consumer() const {}

    CUTLASS_DEVICE
    void
    prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {}

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_next_work(Params const& params, WorkTileInfo const& current_work) const {
        return {current_work.tile_idx + int(gridDim.x)};
    }

};
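// Worked example of the divmod decomposition above (illustrative numbers):
// with num_blocks = 4 and num_head = 2, tile_idx = 11 decomposes as
//   m_block_divmod: 11 = 2 * 4 + 3  -> block = 3, quotient 2
//   head_divmod:     2 = 1 * 2 + 0  -> bidh  = 0, bidb = 1
// so tile 11 is (block = 3, bidh = 0, bidb = 1); tiles vary fastest in
// m_block, then head, then batch.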
// Persistent scheduler that hands out tiles dynamically: the producer warp
// grabs the next tile index from a global atomic counter (tile_count_semaphore)
// and publishes it to the consumer warpgroups through shared memory, guarded
// by the TileCountSmemEmpty / TileCountSmemFull named barriers.
template<int NumMmaThreads=2 * cutlass::NumThreadsPerWarpGroup, int NumProducerThreads=cutlass::NumThreadsPerWarp>
class DynamicPersistentTileScheduler {

public:

    using SharedStorage = int;

protected:

    SharedStorage* const tile_count_smem;

public:

    // Host side kernel arguments
    struct Arguments {
        int const num_blocks, num_head, num_batch;
        int* const tile_count_semaphore;
        int* const cu_seqlens = nullptr;
        int* const seqused = nullptr;
    };

    // Device side kernel params
    struct Params {
        int const total_blocks;
        cutlass::FastDivmod const m_block_divmod, head_divmod;
        int* const tile_count_semaphore;
    };

    static Params
    to_underlying_arguments(Arguments const& args) {
        return {args.num_blocks * args.num_head * args.num_batch,
                cutlass::FastDivmod(args.num_blocks), cutlass::FastDivmod(args.num_head),
                args.tile_count_semaphore};
    }

    static dim3
    get_grid_shape(Params const& params, int num_sm) {
        return {uint32_t(num_sm)};
    }

    struct WorkTileInfo {
        int tile_idx;

        CUTLASS_DEVICE
        bool
        is_valid(Params const& params) const {
            return tile_idx < params.total_blocks;
        }

        CUTLASS_DEVICE
        cute::tuple<int32_t, int32_t, int32_t>
        get_block_coord(Params const& params) const {
            int block, bidh, bidb;
            bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(block, tile_idx));
            return {block, bidh, bidb};
        }
    };

    CUTLASS_DEVICE
    DynamicPersistentTileScheduler(SharedStorage* const smem_scheduler) : tile_count_smem(smem_scheduler) {}

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_initial_work(Params const& params) const {
        return {int(blockIdx.x)};
    }

    CUTLASS_DEVICE
    void
    init_consumer() const {
        // Signal that the tile-count smem slot starts out empty.
        cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
    }

    CUTLASS_DEVICE
    void
    prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {
        if (threadIdx.x % NumProducerThreads == 0) {
            current_work.tile_idx = atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
        }
    }

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_next_work(Params const& params, WorkTileInfo const& current_work) const {
        if constexpr (IsProducerWarp) {
            // Thread 0 already has the right tile_idx, just need to broadcast to the rest of warp 0.
            int new_tile_idx = __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
            cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
            if (threadIdx.x % NumProducerThreads == 0) {
                *tile_count_smem = current_work.tile_idx;
            }
            cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
            return {new_tile_idx};
        } else {
            cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
            int tile_idx = *tile_count_smem;
            cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
            return {tile_idx};
        }
    }

};
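// The handoff in get_next_work is an empty/full ping-pong. A minimal timeline
// of one round, assuming one producer warp and NumMmaThreads consumer threads:
//
//   producer: sync TileCountSmemEmpty  -> write tile_idx to smem
//             arrive TileCountSmemFull
//   consumer: sync TileCountSmemFull   -> read tile_idx from smem
//             arrive TileCountSmemEmpty (slot may now be overwritten)
//
// init_consumer's initial arrive on TileCountSmemEmpty is what lets the
// producer's very first sync on that barrier complete.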
// Dynamic persistent scheduler for variable-length (varlen) batches: the
// number of m-blocks differs per batch, so a linear tile index can no longer
// be decomposed with fixed divmods. Instead, a warp cooperatively scans the
// per-batch block counts to locate the (block, head, batch) that a tile index
// falls in.
template<int kBlock, int NumMmaThreads=2 * cutlass::NumThreadsPerWarpGroup, int NumProducerThreads=cutlass::NumThreadsPerWarp>
class VarlenDynamicPersistentTileScheduler {

public:

    using SharedStorage = int4;

protected:

    SharedStorage* const work_info_smem;

public:

    // Host side kernel arguments
    struct Arguments {
        int const num_blocks, num_head, num_batch;
        int* const tile_count_semaphore;
        int* const cu_seqlens;
        int* const seqused;
    };

    // Device side kernel params
    struct Params {
        int num_head, num_batch;
        int* const tile_count_semaphore;
        int* const cu_seqlens;
        int* const seqused;
    };

    static Params
    to_underlying_arguments(Arguments const& args) {
        return {args.num_head, args.num_batch,
                args.tile_count_semaphore, args.cu_seqlens, args.seqused};
    }

    static dim3
    get_grid_shape(Params const& params, int num_sm) {
        return {uint32_t(num_sm)};
    }

    struct WorkTileInfo {
        int tile_idx, block, bidh, bidb;

        CUTLASS_DEVICE
        bool
        is_valid(Params const& params) const {
            return bidb < params.num_batch;
        }

        CUTLASS_DEVICE
        cute::tuple<int32_t, int32_t, int32_t>
        get_block_coord(Params const& params) const {
            return {block, bidh, bidb};
        }
    };
    CUTLASS_DEVICE
    VarlenDynamicPersistentTileScheduler(SharedStorage* const smem_scheduler) : work_info_smem(smem_scheduler) {}

    CUTLASS_DEVICE
    WorkTileInfo
    tile_idx_to_work_tile(Params const& params, int next_tile_idx, WorkTileInfo const& current_work) const {
        // Warp-wide inclusive prefix sum over `val`.
        auto prefix_sum = [](int val) {
            int lane = threadIdx.x % cutlass::NumThreadsPerWarp;
            CUTLASS_PRAGMA_UNROLL
            for (int i = 1; i < cutlass::NumThreadsPerWarp; i <<= 1) {
                int32_t partial_sum = __shfl_up_sync(0xffffffff, val, i);
                if (lane >= i) { val += partial_sum; }
            }
            return val;
        };
        // Each lane computes the number of m-blocks for batch (bidb + lane).
        auto get_num_m_blocks = [&](int bidb) {
            int lane = threadIdx.x % cutlass::NumThreadsPerWarp;
            int seqlen;
            if (params.seqused) {
                seqlen = lane + bidb < params.num_batch ? params.seqused[lane + bidb] : 0;
            } else {
                int cur_cu_seqlen = lane + bidb <= params.num_batch ? params.cu_seqlens[lane + bidb] : 0;
                int next_cu_seqlen = __shfl_down_sync(0xffffffff, cur_cu_seqlen, 1);
                seqlen = next_cu_seqlen - cur_cu_seqlen;
            }
            // The last lane is excluded so the __shfl_down_sync above stays in range.
            return lane + bidb < params.num_batch && lane < cutlass::NumThreadsPerWarp - 1
                ? cute::ceil_div(seqlen, kBlock) : 0;
        };

        int num_m_blocks = get_num_m_blocks(current_work.bidb);  // Different for each lane
        // Cumulative number of blocks for the next 31 batches
        int num_m_blocks_cumulative = prefix_sum(num_m_blocks);
        // Total number of blocks for the next 31 batches
        int m_blocks_in_group = __shfl_sync(0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
        int group_end_tile = current_work.tile_idx - current_work.block - current_work.bidh * __shfl_sync(0xffffffff, num_m_blocks, 0 /*lane*/) + m_blocks_in_group * params.num_head;  // Same for all lanes
        int bidb = current_work.bidb;
        // Advance 31 batches at a time until the group containing next_tile_idx is found.
        while (group_end_tile <= next_tile_idx) {
            bidb += cutlass::NumThreadsPerWarp - 1;
            if (bidb >= params.num_batch) {
                // Ran past the last batch: return a tile that is_valid() will reject.
                return {next_tile_idx, 0, 0, params.num_batch};
            }
            num_m_blocks = get_num_m_blocks(bidb);
            num_m_blocks_cumulative = prefix_sum(num_m_blocks);
            m_blocks_in_group = __shfl_sync(0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
            group_end_tile += m_blocks_in_group * params.num_head;
        }
        int group_start_tile = group_end_tile - m_blocks_in_group * params.num_head;
        // The next batch to process is the first one in the group whose ending tile
        // position exceeds next_tile_idx; __popc counts the batches before it.
        int batch_idx_in_group = __popc(__ballot_sync(0xffffffff, group_start_tile + num_m_blocks_cumulative * params.num_head <= next_tile_idx));
        bidb += batch_idx_in_group;
        num_m_blocks = __shfl_sync(0xffffffff, num_m_blocks, batch_idx_in_group);
        int mh_block = next_tile_idx - group_start_tile - (batch_idx_in_group == 0 ? 0 : __shfl_sync(0xffffffff, num_m_blocks_cumulative, batch_idx_in_group - 1)) * params.num_head;
        int bidh = mh_block / num_m_blocks;
        int block = mh_block - bidh * num_m_blocks;
        return {next_tile_idx, block, bidh, bidb};
    }
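    // Worked example of tile_idx_to_work_tile (illustrative numbers): with
    // kBlock = 128, num_head = 2, seqlens {200, 50, 300}, and the scan starting
    // at batch 0 (so group_start_tile = 0), the per-batch m-block counts are
    // {2, 1, 3}, the warp prefix sum is {2, 3, 6}, and the group spans
    // 6 * 2 = 12 tiles. For next_tile_idx = 7, the per-batch end positions
    // {4, 6, 12} give a ballot with two lanes satisfying end <= 7, so
    // batch_idx_in_group = 2, bidb = 2, and mh_block = 7 - 3 * 2 = 1, i.e.
    // the result is (block = 1, bidh = 0, bidb = 2).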
    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_initial_work(Params const& params) const {
        if constexpr (IsProducerWarp) {
            WorkTileInfo work_info = tile_idx_to_work_tile(params, int(blockIdx.x), {0, 0, 0, 0});
            if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
                *work_info_smem = make_int4(work_info.tile_idx, work_info.block, work_info.bidh, work_info.bidb);
            }
            cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
            return work_info;
        } else {
            return get_next_work<false>(params, {0, 0, 0, 0});
        }
    }

    CUTLASS_DEVICE
    void
    init_consumer() const {
        // Don't arrive at the TileCountSmemEmpty barrier here, because get_initial_work will do that
    }

    CUTLASS_DEVICE
    void
    prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {
        if (threadIdx.x % NumProducerThreads == 0) {
            current_work.tile_idx = atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
        }
    }

    template<bool IsProducerWarp=false>
    CUTLASS_DEVICE
    WorkTileInfo
    get_next_work(Params const& params, WorkTileInfo const& current_work) const {
        if constexpr (IsProducerWarp) {
            // thread 0 has the next tile_idx, just need to broadcast to the rest of warp 0
            int new_tile_idx = __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
            WorkTileInfo work_info = {__shfl_sync(0xffffffff, current_work.tile_idx, 1 /*lane*/), current_work.block, current_work.bidh, current_work.bidb};
            work_info = tile_idx_to_work_tile(params, new_tile_idx, work_info);
            cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
            if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
                *work_info_smem = make_int4(work_info.tile_idx, work_info.block, work_info.bidh, work_info.bidb);
            }
            cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
            return work_info;
        } else {
            cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
            int4 work_info = *work_info_smem;
            cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast<int>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
            return WorkTileInfo{work_info.x, work_info.y, work_info.z, work_info.w};
        }
    }

};
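// Putting it together (illustrative only): a warp-specialized kernel would
// typically overlap the atomic fetch of the next tile with work on the
// current one, roughly:
//
//   // producer warp:
//   auto work = scheduler.template get_initial_work</*IsProducerWarp=*/true>(params);
//   while (work.is_valid(params)) {
//       scheduler.prefetch_next_work(params, work);  // atomicAdd for the next tile index
//       /* ... issue loads for the current tile ... */
//       work = scheduler.template get_next_work</*IsProducerWarp=*/true>(params, work);
//   }
//   // consumer warpgroups: same loop shape with IsProducerWarp=false,
//   // after calling scheduler.init_consumer() once.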

} // flash