q_gemm_kernel.cuh
/*
 * Adapted from https://github.com/turboderp/exllamav2
 * Copyright (c) 2024 turboderp
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "compat.cuh"

namespace aphrodite {
namespace exl2 {
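// Tile configuration: each thread block covers EXL2_BLOCK_KN_SIZE rows of K and
// EXL2_BLOCK_KN_SIZE * 4 columns of N, and processes up to EXL2_BLOCK_M_SIZE_MAX
// rows of the activation matrix. Quantization groups span at least 32 rows of K,
// so at most EXL2_MAX_GROUPS_IN_BLOCK groups can begin inside one K block.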
#define MAX_Q_GEMM_WEIGHTS 4
#define EXL2_BLOCK_KN_SIZE 64
#define EXL2_BLOCK_M_SIZE_MAX 8
#define EXL2_MAX_GROUPS_IN_BLOCK (EXL2_BLOCK_KN_SIZE / 32)
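// Dot-product helpers. Each dot22_N variant multiplies N dequantized weight values
// (N/2 half2 pairs in dq) with N activations at a_ptr, applies the group scale and
// accumulates into g_result. The base versions keep a two-lane half2 accumulator,
// the _f versions reduce to a single FP32 value, and the _h versions reduce to FP16
// (dot22_8_h accumulates in FP32 internally to avoid overflowing the half range).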
__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
}

__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
}

__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 16; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
}

__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float g_result, const float qs_f)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
    return fma(result_f, qs_f, g_result);
}

__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
    return fma(result_f, qs_f, g_result);
}

__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 16; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
    return fma(result_f, qs_f, g_result);
}

__forceinline__ __device__ half dot22_8_h(half2(&dq)[4], const half* a_ptr, const half g_result, const half qs_h)
{
    // Use FP32 accumulator to avoid potential overflow since unscaled weights are in the range -128..127
    float result = {};
    #pragma unroll
    for (int i = 0; i < 4; i++)
    {
        half2 w01 = dq[i];
        float w0 = __low2float(w01);
        float w1 = __high2float(w01);
        float x0 = __half2float(*a_ptr++);
        float x1 = __half2float(*a_ptr++);
        result = fma(w0, x0, result);
        result = fma(w1, x1, result);
    }
    float qs = __half2float(qs_h);
    result *= qs;
    half result_h = __float2half_rn(result);
    return __hadd(result_h, g_result);
}

__forceinline__ __device__ half dot22_16_h(half2(&dq)[8], const half* a_ptr, const half g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    half result_h = __hadd(__low2half(result), __high2half(result));
    return __hfma(result_h, qs_h, g_result);
}

__forceinline__ __device__ half dot22_32_h(half2(&dq)[16], const half* a_ptr, const half g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 16; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    half result_h = __hadd(__low2half(result), __high2half(result));
    return __hfma(result_h, qs_h, g_result);
}
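// Function-pointer type matching gemm_half_q_half_kernel<m_count>. Parameter order:
// (a, b_q_weight, b_q_scale, b_q_scale_max, c, size_m, size_n, size_k, groups,
//  b_q_group_map, b_q_perm, rows_8, rows_6, rows_5, rows_4, rows_3, rows_2, clear)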
typedef void (*fp_gemm_half_q_half_kernel)
(
    const half*,
    const uint32_t*,
    const uint32_t*,
    const half*,
    half*,
    const int,
    const int,
    const int,
    const int,
    const uint16_t*,
    const uint16_t*,
    const int,
    const int,
    const int,
    const int,
    const int,
    const int,
    const bool
);
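// Computes C += A @ dequantize(B) for one (m, n, k) tile. A holds FP16 activations;
// B is EXL2-quantized with K-rows stored in descending bit width (8, 6, 5, 4, 3, 2 bits),
// and rows_8..rows_2 give the cumulative K-row boundary of each bit-width region.
// blockIdx.z partitions K, so partial sums are accumulated into C with atomics.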
template <int m_count>
__global__ void gemm_half_q_half_kernel
(
    const half* __restrict__ a,
    const uint32_t* __restrict__ b_q_weight,
    const uint32_t* __restrict__ b_q_scale,
    const half* __restrict__ b_q_scale_max,
    half* __restrict__ c,
    const int size_m,
    const int size_n,
    const int size_k,
    const int groups,
    const uint16_t* __restrict__ b_q_group_map,
    const uint16_t* __restrict__ b_q_perm,
    const int rows_8,
    const int rows_6,
    const int rows_5,
    const int rows_4,
    const int rows_3,
    const int rows_2,
    const bool clear
)
{
    MatrixView_half a_(a, size_m, size_k);
    MatrixView_half_rw c_(c, size_m, size_n);
    MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n);

    int t = threadIdx.x;

    // Block
    int offset_n = blockIdx.x * EXL2_BLOCK_KN_SIZE * 4;
    int offset_m = blockIdx.y * m_count;
    int offset_k = blockIdx.z * EXL2_BLOCK_KN_SIZE;

    int end_n = min(offset_n + EXL2_BLOCK_KN_SIZE * 4, size_n);
    int end_m = min(offset_m + m_count, size_m);
    int end_k = min(offset_k + EXL2_BLOCK_KN_SIZE, size_k);
    int n = offset_n + t * 4;

    // Read weights
    half_uint16 weights[MAX_Q_GEMM_WEIGHTS];

    // Preload block_a
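    // Each thread stages one K element of A per activation row into shared memory,
    // applying the permutation b_q_perm so activations line up with the permuted
    // weight rows.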
    __shared__ half block_a[m_count][EXL2_BLOCK_KN_SIZE];

    if (offset_k + t < end_k)
    {
        for (int m = 0; m < m_count; ++m)
        {
            const half* a_ptr = a_.item_ptr(offset_m + m, 0);
            half* block_a_ptr = block_a[m];

            half a0 = a_ptr[b_q_perm[offset_k + t]];
//          half a0 = a_ptr[offset_k + t];
            block_a_ptr[t] = a0;
        }
    }

    // Clear
    if (n >= size_n) return;

    if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0)
    {
        for (int m = 0; m < m_count; m++)
            *((uint64_t*) c_.item_ptr(offset_m + m, n)) = 0;
    }

    __syncthreads();

    // Find initial group
    //int group = offset_k / groupsize;
    int group = b_q_group_map[offset_k * 2];

//     if (offset_m == 0 && t == 0)
//         DBGI2(offset_k, group);

    // Preload scales
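    // Scales are stored as 4-bit indices; the effective scale for a group is
    // (q + 1)^2 * b_q_scale_max[group], computed here for the four output columns
    // handled by this thread.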
    half scales[EXL2_MAX_GROUPS_IN_BLOCK][4];

    //int groups_in_block = DIVIDE((end_k - offset_k), groupsize);

    int temp_k = offset_k;
    for (int g = 0; temp_k < end_k; g++)
    {
        int qscales[4];
        b_q_scale_.item4(qscales, group + g, n);
        qscales[0]++;
        qscales[1]++;
        qscales[2]++;
        qscales[3]++;
        half maxscale = b_q_scale_max[group + g];
        scales[g][0] = __hmul(__int2half_rn(qscales[0] * qscales[0]), maxscale);
        scales[g][1] = __hmul(__int2half_rn(qscales[1] * qscales[1]), maxscale);
        scales[g][2] = __hmul(__int2half_rn(qscales[2] * qscales[2]), maxscale);
        scales[g][3] = __hmul(__int2half_rn(qscales[3] * qscales[3]), maxscale);
        temp_k += b_q_group_map[temp_k * 2 + 1];
    }

    // a, b offset
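    // Starting row of b_q_weight in packed uint32 rows: every 32 K-rows of b-bit
    // weights occupy b uint32 rows per column, so the rows preceding this block are
    // converted region by region.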
    int pre_rows_8 = min(rows_8, offset_k);
    int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0;
    int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0;
    int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0;
    int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0;
    int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0;
    int qk = 0;
    qk += pre_rows_8 / 32 * 8;
    qk += pre_rows_6 / 32 * 6;
    qk += pre_rows_5 / 32 * 5;
    qk += pre_rows_4 / 32 * 4;
    qk += pre_rows_3 / 32 * 3;
    qk += pre_rows_2 / 32 * 2;

    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
    const half* a_ptr = &block_a[0][0];
    int a_stride = EXL2_BLOCK_KN_SIZE;

    // Initial group
    int scales_idx = 0;
    half qs_h0 = scales[scales_idx][0];
    half qs_h1 = scales[scales_idx][1];
    half qs_h2 = scales[scales_idx][2];
    half qs_h3 = scales[scales_idx][3];
    int nextgroup = offset_k + b_q_group_map[offset_k * 2 + 1];

    // Column result
    half block_c[m_count][4] = {};

    // Dequantize groups
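    // Walk the K block through the bit-width regions in storage order (8, 6, 5, 4, 3,
    // 2 bits), dequantizing 32 K-rows per iteration (16 for the 2-bit region) and
    // accumulating per-column dot products for each of the m_count activation rows.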
    int k = offset_k;

    while (k < rows_8 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 4; j++)
        {
            int4 load_int4[2];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][4];
            dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n);
            dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n);
            dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n);
            dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 8;
        }
        k += 32;
    }
    while (k < rows_6 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 2; j++)
        {
            int4 load_int4[3];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][8];
            dequant_6bit_16(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n);
            dequant_6bit_16(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n);
            dequant_6bit_16(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n);
            dequant_6bit_16(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 16;
        }
        k += 32;
    }
    while (k < rows_5 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 1; j++)
        {
            int4 load_int4[5];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[3] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[4] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][16];
            dequant_5bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, load_int4[3].x, load_int4[4].x, dq[0], size_n);
            dequant_5bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, load_int4[3].y, load_int4[4].y, dq[1], size_n);
            dequant_5bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, load_int4[3].z, load_int4[4].z, dq[2], size_n);
            dequant_5bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, load_int4[3].w, load_int4[4].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 32;
        }
        k += 32;
    }
    while (k < rows_4 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 4; j++)
        {
            int4 load_int4[1];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][4];
            dequant_4bit_8(load_int4[0].x, dq[0], size_n);
            dequant_4bit_8(load_int4[0].y, dq[1], size_n);
            dequant_4bit_8(load_int4[0].z, dq[2], size_n);
            dequant_4bit_8(load_int4[0].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 8;
        }
        k += 32;
    }
    while (k < rows_3 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 1; j++)
        {
            int4 load_int4[3];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][16];
            dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n);
            dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n);
            dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n);
            dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 32;
        }
        k += 32;
    }
    while (k < rows_2 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 1; j++)
        {
            int4 load_int4[1];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][8];
            dequant_2bit_16(load_int4[0].x, dq[0], size_n);
            dequant_2bit_16(load_int4[0].y, dq[1], size_n);
            dequant_2bit_16(load_int4[0].z, dq[2], size_n);
            dequant_2bit_16(load_int4[0].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 16;
        }
        k += 16;
    }
    // Accumulate column sums in c
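    // Blocks with different blockIdx.z cover disjoint K ranges of the same output
    // tile, so partial results are combined with atomic adds (the clear pass above
    // zeroes C when blockIdx.z == 0).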
    for (int m = 0; m < m_count; m++)
    {
        half2* out = (half2*)c_.item_ptr(offset_m + m, n);
        half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]);
        half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]);
        atomicAdd(out    , result01);
        atomicAdd(out + 1, result23);
//      *out = result01;
//      *(out + 1) = result23;
    }
}
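// Compile-time dispatch: instantiate the kernel for each supported m_count
// (1..EXL2_BLOCK_M_SIZE_MAX) and select the matching instantiation at runtime.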
struct map_m_count_exl2 {
    static constexpr fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(const int m_count)
    {
        #if EXL2_BLOCK_M_SIZE_MAX >= 1
        if (m_count == 1) return gemm_half_q_half_kernel<1>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 2
        if (m_count == 2) return gemm_half_q_half_kernel<2>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 3
        if (m_count == 3) return gemm_half_q_half_kernel<3>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 4
        if (m_count == 4) return gemm_half_q_half_kernel<4>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 5
        if (m_count == 5) return gemm_half_q_half_kernel<5>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 6
        if (m_count == 6) return gemm_half_q_half_kernel<6>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 7
        if (m_count == 7) return gemm_half_q_half_kernel<7>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 8
        if (m_count == 8) return gemm_half_q_half_kernel<8>;
        #endif
        return NULL;
    }
};
fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(const int m_count)
{
    return map_m_count_exl2::pick_gemm_half_q_half_kernel(m_count);
}
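// Illustrative launch sketch (not part of this header): the host side is expected to
// pick a kernel for the current m_count and launch it with one thread per K-row of
// the block, tiling N, M and K across the grid as the kernel's offset computations
// imply. `stream` and the DIVIDE ceil-division helper are assumptions about the
// surrounding host code, not definitions made here.
//
//   dim3 blockDim(EXL2_BLOCK_KN_SIZE, 1, 1);
//   dim3 gridDim(DIVIDE(size_n, EXL2_BLOCK_KN_SIZE * 4),   // blockIdx.x -> N tiles
//                DIVIDE(size_m, m_count),                  // blockIdx.y -> M tiles
//                DIVIDE(size_k, EXL2_BLOCK_KN_SIZE));      // blockIdx.z -> K tiles
//   fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(m_count);
//   kernel<<<gridDim, blockDim, 0, stream>>>(a, b_q_weight, b_q_scale, b_q_scale_max,
//                                            c, size_m, size_n, size_k, groups,
//                                            b_q_group_map, b_q_perm, rows_8, rows_6,
//                                            rows_5, rows_4, rows_3, rows_2, clear);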
}  // namespace exl2
}  // namespace aphrodite