kernelLauncher.cu

/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common.h"
#include "utility.h"

namespace tensorrt_llm
{
namespace kernels
{
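// Compile-time entry point for a single batched GEMV variant. Every combination of
// quantization type, weight layout tag, activation functor, Zero/Bias flags, and tile shape
// (N_PER_BLOCK, BATCH, BLOCK_SIZE) maps to one instantiation of run(). Only the declaration
// lives here; the definitions are expected to come from separate translation units that
// specialize this template.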
template <WeightOnlyQuantType QType, typename WeightOnlyFlag, template <typename T> class ActOp, bool Zero, bool Bias,
    int N_PER_BLOCK, int BATCH, int BLOCK_SIZE>
struct WeightOnlyBatchedGemvKernelLauncher
{
    static void run(const WeightOnlyParams& params, cudaStream_t stream);
};
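// Convert the runtime zero-point and bias pointers into the compile-time Zero/Bias template flags.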
template <WeightOnlyQuantType QType, typename WeightOnlyFlag, template <typename T> class ActOp, int N_PER_BLOCK,
    int BATCH, int BLOCK_SIZE>
void select_zero_bias(const WeightOnlyParams& params, cudaStream_t stream)
{
    if (params.zeros && params.bias)
    {
        WeightOnlyBatchedGemvKernelLauncher<QType, WeightOnlyFlag, ActOp, true, true, N_PER_BLOCK, BATCH,
            BLOCK_SIZE>::run(params, stream);
    }
    else if (params.zeros && !params.bias)
    {
        WeightOnlyBatchedGemvKernelLauncher<QType, WeightOnlyFlag, ActOp, true, false, N_PER_BLOCK, BATCH,
            BLOCK_SIZE>::run(params, stream);
    }
    else if (!params.zeros && params.bias)
    {
        WeightOnlyBatchedGemvKernelLauncher<QType, WeightOnlyFlag, ActOp, false, true, N_PER_BLOCK, BATCH,
            BLOCK_SIZE>::run(params, stream);
    }
    else
    {
        WeightOnlyBatchedGemvKernelLauncher<QType, WeightOnlyFlag, ActOp, false, false, N_PER_BLOCK, BATCH,
            BLOCK_SIZE>::run(params, stream);
    }
}
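// Convert the runtime activation function type into the compile-time ActOp functor.
// Only Identity is reachable today; the Gelu/Relu branches are kept but compiled out.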
template <WeightOnlyQuantType QType, typename WeightOnlyFlag, int N_PER_BLOCK, int BATCH, int BLOCK_SIZE>
void select_activation(const WeightOnlyParams& params, cudaStream_t stream)
{
    switch (params.act_func_type)
    {
    // Currently, activation function is not called in the plugin
#if 0
    case WeightOnlyActivationFunctionType::Gelu:
    {
        select_zero_bias<QType, WeightOnlyFlag, GeluActivation, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
        break;
    }
    case WeightOnlyActivationFunctionType::Relu:
    {
        select_zero_bias<QType, WeightOnlyFlag, ReluActivation, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
        break;
    }
#endif
    case WeightOnlyActivationFunctionType::Identity:
    {
        select_zero_bias<QType, WeightOnlyFlag, IdentityActivation, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
        break;
    }
    default:
    {
        throw std::runtime_error("Use unsupported activation");
        break;
    }
    }
}
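// Convert the runtime quantization type (int4 or int8 weights) into the compile-time QType flag.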
template <typename WeightOnlyFlag, int N_PER_BLOCK, int BATCH, int BLOCK_SIZE>
void select_quant_type(const WeightOnlyParams& params, cudaStream_t stream)
{
    if (params.quant_type == WeightOnlyQuantType::Int4b)
    {
        select_activation<WeightOnlyQuantType::Int4b, WeightOnlyFlag, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
    }
    else if (params.quant_type == WeightOnlyQuantType::Int8b)
    {
        select_activation<WeightOnlyQuantType::Int8b, WeightOnlyFlag, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
    }
    else
    {
        throw std::runtime_error("Unknown QuantType");
    }
}
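// Group-wise path: the quantization group size is encoded at compile time through the
// WeightOnlyGroupWise<N> tag, so only group sizes 64 and 128 are accepted here.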
template <int N_PER_BLOCK, int BATCH, int BLOCK_SIZE>
void select_groupwise_weight_only(const WeightOnlyParams& params, cudaStream_t stream)
{
    if (params.weight_only_type == WeightOnlyType::GroupWise && params.group_size == 64)
    {
        select_quant_type<WeightOnlyGroupWise<64>, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
    }
    else if (params.weight_only_type == WeightOnlyType::GroupWise && params.group_size == 128)
    {
        select_quant_type<WeightOnlyGroupWise<128>, N_PER_BLOCK, BATCH, BLOCK_SIZE>(params, stream);
    }
    else
    {
        throw std::runtime_error("Only support groupwise weight only for gs=64/128");
    }
}
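// Public entry point. Validates the supported configurations (identity activation only;
// per-channel kernels take neither zeros nor bias), then picks a tile shape
// (N_PER_BLOCK, BATCH, BLOCK_SIZE) from the batch size m before descending through the
// selector chain above. The per-m tile shapes were presumably chosen by tuning and differ
// between the int4 and int8 per-channel kernels.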
void weight_only_batched_gemv_launcher(const WeightOnlyParams& params, cudaStream_t stream)
{
    assert(params.act_func_type == WeightOnlyActivationFunctionType::Identity);
    assert(params.weight_only_type == WeightOnlyType::GroupWise
        || (params.weight_only_type == WeightOnlyType::PerChannel && params.bias == nullptr
            && params.zeros == nullptr));
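    // Per-channel path: zeros and bias are guaranteed null here (see the assert above), so the
    // Zero/Bias template flags are hard-coded to false and only the quant type and batch size m
    // pick the kernel specialization.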
    if (params.weight_only_type == WeightOnlyType::PerChannel)
    {
        if (params.quant_type == WeightOnlyQuantType::Int4b)
        {
            switch (params.m)
            {
            case 1:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int4b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 1, 1, 192>::run(params, stream);
                break;
            }
            case 2:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int4b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 2, 2, 128>::run(params, stream);
                break;
            }
            case 3:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int4b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 2, 3, 256>::run(params, stream);
                break;
            }
            case 4:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int4b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 4, 4, 256>::run(params, stream);
                break;
            }
            default:
            {
                throw std::runtime_error("Weight only cuda kernel only supported bs <= 4");
                break;
            }
            }
        }
        else if (params.quant_type == WeightOnlyQuantType::Int8b)
        {
            switch (params.m)
            {
            case 1:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int8b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 2, 1, 256>::run(params, stream);
                break;
            }
            case 2:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int8b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 2, 2, 256>::run(params, stream);
                break;
            }
            case 3:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int8b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 2, 3, 256>::run(params, stream);
                break;
            }
            case 4:
            {
                WeightOnlyBatchedGemvKernelLauncher<WeightOnlyQuantType::Int8b, WeightOnlyPerChannel,
                    IdentityActivation, false, false, 2, 4, 256>::run(params, stream);
                break;
            }
            default:
            {
                throw std::runtime_error("Weight only cuda kernel only supported bs <= 4");
                break;
            }
            }
        }
    }
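    // Group-wise path: only the tile shape is chosen from m here; quant type, group size, and
    // zero/bias flags are resolved by the selector chain above.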
    else if (params.weight_only_type == WeightOnlyType::GroupWise)
    {
        switch (params.m)
        {
        case 1:
        {
            select_groupwise_weight_only<2, 1, 256>(params, stream);
            break;
        }
        case 2:
        {
            select_groupwise_weight_only<2, 2, 256>(params, stream);
            break;
        }
        case 3:
        {
            select_groupwise_weight_only<2, 3, 128>(params, stream);
            break;
        }
        case 4:
        {
            select_groupwise_weight_only<2, 4, 128>(params, stream);
            break;
        }
        default:
        {
            throw std::runtime_error("Weight only cuda kernel only supported bs <= 4");
            break;
        }
        }
    }
}
} // namespace kernels
} // namespace tensorrt_llm
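
// Usage sketch (not part of the original file): a plausible call into this launcher for a
// group-wise int4 case. It sets only the fields that this file actually reads; the remaining
// WeightOnlyParams members (weight/scale/activation/output pointers and problem sizes) come
// from the definition in common.h, and whether value-initialization with {} is valid for the
// struct is assumed here rather than known.
//
//   tensorrt_llm::kernels::WeightOnlyParams params{};
//   params.weight_only_type = WeightOnlyType::GroupWise;
//   params.quant_type = WeightOnlyQuantType::Int4b;
//   params.group_size = 128;
//   params.act_func_type = WeightOnlyActivationFunctionType::Identity;
//   params.m = 1; // batch size; only m <= 4 is supported
//   cudaStream_t stream = nullptr; // or a stream created by the caller
//   tensorrt_llm::kernels::weight_only_batched_gemv_launcher(params, stream);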