// ops.h — custom operator declarations, exposed to Python via the torch
// C++ extension mechanism. Implementations live in the corresponding
// kernel translation units.
#pragma once
#include <torch/extension.h>
  3. void paged_attention_v1(
  4. torch::Tensor& out,
  5. torch::Tensor& query,
  6. torch::Tensor& key_cache,
  7. torch::Tensor& value_cache,
  8. int num_kv_heads,
  9. float scale,
  10. torch::Tensor& block_tables,
  11. torch::Tensor& context_lens,
  12. int block_size,
  13. int max_context_len,
  14. const c10::optional<torch::Tensor>& alibi_slopes,
  15. const std::string& kv_cache_dtype,
  16. float k_scale = 1.0f,
  17. float k_zp = 0.0f,
  18. float v_scale = 1.0f,
  19. float v_zp = 0.0f);
  20. void paged_attention_v2(
  21. torch::Tensor& out,
  22. torch::Tensor& exp_sums,
  23. torch::Tensor& max_logits,
  24. torch::Tensor& tmp_out,
  25. torch::Tensor& query,
  26. torch::Tensor& key_cache,
  27. torch::Tensor& value_cache,
  28. int num_kv_heads,
  29. float scale,
  30. torch::Tensor& block_tables,
  31. torch::Tensor& context_lens,
  32. int block_size,
  33. int max_context_len,
  34. const c10::optional<torch::Tensor>& alibi_slopes,
  35. const std::string& kv_cache_dtype,
  36. float k_scale = 1.0f,
  37. float k_zp = 0.0f,
  38. float v_scale = 1.0f,
  39. float v_zp = 0.0f);
  40. void rms_norm(
  41. torch::Tensor& out,
  42. torch::Tensor& input,
  43. torch::Tensor& weight,
  44. float epsilon);
  45. void fused_add_rms_norm(
  46. torch::Tensor& input,
  47. torch::Tensor& residual,
  48. torch::Tensor& weight,
  49. float epsilon);
  50. void rotary_embedding(
  51. torch::Tensor& positions,
  52. torch::Tensor& query,
  53. torch::Tensor& key,
  54. int head_size,
  55. torch::Tensor& cos_sin_cache,
  56. bool is_neox);
  57. void silu_and_mul(
  58. torch::Tensor& out,
  59. torch::Tensor& input);
  60. void gelu_and_mul(
  61. torch::Tensor& out,
  62. torch::Tensor& input);
  63. void gelu_new(
  64. torch::Tensor& out,
  65. torch::Tensor& input);
  66. void gelu_fast(
  67. torch::Tensor& out,
  68. torch::Tensor& input);
  69. #ifndef USE_ROCM
  70. torch::Tensor awq_gemm(
  71. torch::Tensor _in_feats,
  72. torch::Tensor _kernel,
  73. torch::Tensor _scaling_factors,
  74. torch::Tensor _zeros,
  75. int split_k_iters);
  76. torch::Tensor awq_dequantize(
  77. torch::Tensor _kernel,
  78. torch::Tensor _scaling_factors,
  79. torch::Tensor _zeros,
  80. int split_k_iters,
  81. int thx,
  82. int thy);
  83. torch::Tensor autoquant_s4_f16_gemm(
  84. torch::Tensor _in_feats,
  85. torch::Tensor _kernel,
  86. torch::Tensor _scales_zeros);
  87. void autoquant_convert_s4_k_m8(
  88. torch::Tensor _weight_dest,
  89. torch::Tensor _quant_scales_zeros_dest,
  90. torch::Tensor _workspace,
  91. torch::Tensor _quant_weight_src,
  92. torch::Tensor _quant_scales,
  93. torch::Tensor _quant_zeros,
  94. int m,
  95. int k,
  96. int group_size);
  97. torch::Tensor marlin_gemm(
  98. torch::Tensor& a,
  99. torch::Tensor& b_q_weight,
  100. torch::Tensor& b_scales,
  101. torch::Tensor& workspace,
  102. int64_t size_m,
  103. int64_t size_n,
  104. int64_t size_k);
  105. at::Tensor e8p_mm_origorder(
  106. const at::Tensor& A,
  107. const at::Tensor& B,
  108. const at::Tensor& CB);
  109. void decompress_e8p_origorder(
  110. torch::Tensor YIs,
  111. torch::Tensor CB,
  112. torch::Tensor &Y
  113. );
  114. #endif
  115. void squeezellm_gemm(
  116. torch::Tensor vec,
  117. torch::Tensor mat,
  118. torch::Tensor mul,
  119. torch::Tensor lookup_table);
  120. torch::Tensor gptq_gemm(
  121. torch::Tensor a,
  122. torch::Tensor b_q_weight,
  123. torch::Tensor b_gptq_qzeros,
  124. torch::Tensor b_gptq_scales,
  125. torch::Tensor b_g_idx,
  126. bool use_exllama,
  127. int bit);
  128. void gptq_shuffle(
  129. torch::Tensor q_weight,
  130. torch::Tensor q_perm,
  131. int bit);
  132. torch::Tensor aqlm_gemm(
  133. const torch::Tensor& input,
  134. const torch::Tensor& codes,
  135. const torch::Tensor& codebooks,
  136. const torch::Tensor& scales,
  137. const torch::Tensor& codebook_partition_sizes,
  138. const std::optional<torch::Tensor>& bias
  139. );
  140. torch::Tensor ggml_dequantize(
  141. torch::Tensor X,
  142. int8_t type,
  143. int64_t m,
  144. int64_t n
  145. );
  146. torch::Tensor ggml_mul_mat_vec(
  147. torch::Tensor W, // quant weight
  148. torch::Tensor X, // input
  149. int8_t type,
  150. int64_t m
  151. );
  152. torch::Tensor ggml_mul_mat_vec_a8(
  153. torch::Tensor W, // quant weight
  154. torch::Tensor X, // input
  155. int8_t type,
  156. int64_t row
  157. );
  158. torch::Tensor ggml_mul_mat_a8(
  159. torch::Tensor W, // quant weight
  160. torch::Tensor X, // input
  161. int8_t type,
  162. int64_t row
  163. );
  164. uintptr_t make_q_matrix(
  165. torch::Tensor q_weight,
  166. torch::Tensor q_perm,
  167. torch::Tensor q_invperm,
  168. torch::Tensor q_scale,
  169. torch::Tensor q_scale_max,
  170. torch::Tensor q_groups,
  171. torch::Tensor q_group_map
  172. );
  173. torch::Tensor exl2_gemm(
  174. torch::Tensor a,
  175. uintptr_t b
  176. );
  177. void moe_align_block_size(
  178. torch::Tensor topk_ids,
  179. int num_experts,
  180. int block_size,
  181. torch::Tensor sorted_token_ids,
  182. torch::Tensor expert_ids,
  183. torch::Tensor num_tokens_post_pad
  184. );
  185. #ifndef USE_ROCM
  186. using fptr_t = uint64_t;
  187. fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
  188. const std::vector<std::string> &handles,
  189. const std::vector<int64_t> &offsets, int rank,
  190. bool full_nvlink);
  191. bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
  192. bool full_nvlink);
  193. void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out);
  194. void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &reg_buffer,
  195. torch::Tensor &out);
  196. void dispose(fptr_t _fa);
  197. int meta_size();
  198. void register_buffer(fptr_t _fa, torch::Tensor &t,
  199. const std::vector<std::string> &handles,
  200. const std::vector<int64_t> &offsets);
  201. std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa);
  202. void register_graph_buffers(fptr_t _fa, const std::vector<std::string> &handles,
  203. const std::vector<std::vector<int64_t>> &offsets);
  204. #endif