
ops.h

#pragma once

#include <torch/extension.h>
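// Paged attention, single-kernel ("v1") variant: computes attention for one
// query token per sequence against a block-structured (paged) KV cache.
// block_tables maps each sequence to its cache blocks and context_lens gives
// the number of valid tokens per sequence. The k/v scale and zero-point
// arguments appear to parameterize dequantization when kv_cache_dtype selects
// a quantized cache format.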
void paged_attention_v1(
  torch::Tensor& out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
  const c10::optional<torch::Tensor>& custom_bias,
  const std::string& kv_cache_dtype,
  float k_scale = 1.0f,
  float k_zp = 0.0f,
  float v_scale = 1.0f,
  float v_zp = 0.0f);
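// Paged attention "v2" variant: splits each sequence into partitions, writes
// per-partition results into tmp_out together with exp_sums and max_logits,
// then reduces them; typically chosen for longer contexts than v1.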
void paged_attention_v2(
  torch::Tensor& out,
  torch::Tensor& exp_sums,
  torch::Tensor& max_logits,
  torch::Tensor& tmp_out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
  const c10::optional<torch::Tensor>& custom_bias,
  const std::string& kv_cache_dtype,
  float k_scale = 1.0f,
  float k_zp = 0.0f,
  float v_scale = 1.0f,
  float v_zp = 0.0f);
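// Root-mean-square layer normalization: out = input / RMS(input) * weight,
// with epsilon added to the variance for numerical stability.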
void rms_norm(
  torch::Tensor& out,
  torch::Tensor& input,
  torch::Tensor& weight,
  float epsilon);
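// Fused residual-add + RMSNorm, updating input and residual in place.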
void fused_add_rms_norm(
  torch::Tensor& input,
  torch::Tensor& residual,
  torch::Tensor& weight,
  float epsilon);
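// Applies rotary positional embeddings (RoPE) to query and key in place using
// the precomputed cos_sin_cache; is_neox selects GPT-NeoX-style (rotate-half)
// versus GPT-J-style (interleaved) rotation.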
void rotary_embedding(
  torch::Tensor& positions,
  torch::Tensor& query,
  torch::Tensor& key,
  int head_size,
  torch::Tensor& cos_sin_cache,
  bool is_neox);
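// SwiGLU-style gated activation: splits the last dimension of input in half
// and computes out = SiLU(x1) * x2.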
void silu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);
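// Same gated activation as silu_and_mul, but with GELU: out = GELU(x1) * x2.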
void gelu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);
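// Tanh-approximation GELU variants, matching the HuggingFace "new" and "fast"
// GELU activations.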
void gelu_new(
  torch::Tensor& out,
  torch::Tensor& input);

void gelu_fast(
  torch::Tensor& out,
  torch::Tensor& input);
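// The quantization kernels below are CUDA-only and are compiled out on ROCm.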
#ifndef USE_ROCM
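// AWQ 4-bit weight-only quantization: a fused dequantize + GEMM (with
// split-k) and a standalone dequantization kernel.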
torch::Tensor awq_gemm(
  torch::Tensor _in_feats,
  torch::Tensor _kernel,
  torch::Tensor _scaling_factors,
  torch::Tensor _zeros,
  int split_k_iters);

torch::Tensor awq_dequantize(
  torch::Tensor _kernel,
  torch::Tensor _scaling_factors,
  torch::Tensor _zeros,
  int split_k_iters,
  int thx,
  int thy);
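// "Autoquant" 4-bit (s4) x FP16 GEMM and its weight/scale-zero repacking
// helper; the names suggest _convert_s4_k_m8 rearranges quantized weights,
// scales, and zeros into the packed layout the GEMM expects.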
torch::Tensor autoquant_s4_f16_gemm(
  torch::Tensor _in_feats,
  torch::Tensor _kernel,
  torch::Tensor _scales_zeros);

void autoquant_convert_s4_k_m8(
  torch::Tensor _weight_dest,
  torch::Tensor _quant_scales_zeros_dest,
  torch::Tensor _workspace,
  torch::Tensor _quant_weight_src,
  torch::Tensor _quant_scales,
  torch::Tensor _quant_zeros,
  int m,
  int k,
  int group_size);
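// Marlin FP16 x INT4 GEMM; workspace is a scratch buffer required by the
// kernel.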
torch::Tensor marlin_gemm(
  torch::Tensor& a,
  torch::Tensor& b_q_weight,
  torch::Tensor& b_scales,
  torch::Tensor& workspace,
  int64_t size_m,
  int64_t size_n,
  int64_t size_k);
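// E8P codebook kernels (QuIP#-style quantization): a matmul against
// codebook-compressed weights in original order, plus the corresponding
// decompression routine.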
at::Tensor e8p_mm_origorder(
  const at::Tensor& A,
  const at::Tensor& B,
  const at::Tensor& CB);

void decompress_e8p_origorder(
  torch::Tensor YIs,
  torch::Tensor CB,
  torch::Tensor& Y);

#endif
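// SqueezeLLM matvec: dequantizes mat through lookup_table and multiplies by
// vec, writing the result into mul.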
void squeezellm_gemm(
  torch::Tensor vec,
  torch::Tensor mat,
  torch::Tensor mul,
  torch::Tensor lookup_table);
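// GPTQ matmul, optionally using the ExLlama kernels, plus the weight
// shuffle/permutation pass that prepares q_weight for them; bit is the
// quantization bit width.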
torch::Tensor gptq_gemm(
  torch::Tensor a,
  torch::Tensor b_q_weight,
  torch::Tensor b_gptq_qzeros,
  torch::Tensor b_gptq_scales,
  torch::Tensor b_g_idx,
  bool use_exllama,
  int bit);

void gptq_shuffle(
  torch::Tensor q_weight,
  torch::Tensor q_perm,
  int bit);
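// AQLM (additive quantization) GEMM: multiplies input by weights encoded as
// codes over the given codebooks, with per-partition sizes and an optional
// bias.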
torch::Tensor aqlm_gemm(
  const torch::Tensor& input,
  const torch::Tensor& codes,
  const torch::Tensor& codebooks,
  const torch::Tensor& scales,
  const torch::Tensor& codebook_partition_sizes,
  const std::optional<torch::Tensor>& bias);
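// GGML/GGUF quantized-weight kernels: dequantization plus matrix-vector and
// matrix-matrix products for the given quantization type; the _a8 variants
// presumably quantize the activations to 8 bits first.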
torch::Tensor ggml_dequantize(
  torch::Tensor X,
  int8_t type,
  int64_t m,
  int64_t n);

torch::Tensor ggml_mul_mat_vec(
  torch::Tensor W,  // quant weight
  torch::Tensor X,  // input
  int8_t type,
  int64_t m);

torch::Tensor ggml_mul_mat_vec_a8(
  torch::Tensor W,  // quant weight
  torch::Tensor X,  // input
  int8_t type,
  int64_t row);

torch::Tensor ggml_mul_mat_a8(
  torch::Tensor W,  // quant weight
  torch::Tensor X,  // input
  int8_t type,
  int64_t row);
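// ExLlamaV2: make_q_matrix packs the quantized tensors into an opaque handle
// (returned as a raw pointer) that exl2_gemm then multiplies against.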
uintptr_t make_q_matrix(
  torch::Tensor q_weight,
  torch::Tensor q_perm,
  torch::Tensor q_invperm,
  torch::Tensor q_scale,
  torch::Tensor q_scale_max,
  torch::Tensor q_groups,
  torch::Tensor q_group_map);

torch::Tensor exl2_gemm(
  torch::Tensor a,
  uintptr_t b);
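// Fused-MoE scheduling helper: sorts token indices by assigned expert and pads
// each expert's token count to a multiple of block_size, filling
// sorted_token_ids, expert_ids, and num_tokens_post_pad.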
void moe_align_block_size(
  torch::Tensor topk_ids,
  int num_experts,
  int block_size,
  torch::Tensor sorted_token_ids,
  torch::Tensor expert_ids,
  torch::Tensor num_tokens_post_pad);
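// Custom all-reduce (CUDA-only): a fast intra-node all-reduce over
// peer-to-peer/NVLink using IPC-shared buffers. init_custom_ar builds the
// communicator state, should_custom_ar decides whether an input qualifies,
// and the _reg/_unreg variants use pre-registered versus staging buffers;
// the graph-buffer functions support CUDA graph capture.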
#ifndef USE_ROCM
using fptr_t = uint64_t;

fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets, int rank,
                      bool full_nvlink);
bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
                      bool full_nvlink);
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);
void dispose(fptr_t _fa);
int meta_size();
void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets);
std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa);
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
#endif