// ops.h — forward declarations for custom device operator bindings
// (attention, normalization, rotary embeddings, activations, quantized
// GEMMs, MoE helpers, custom all-reduce).
  1. #pragma once
  2. #include <torch/extension.h>
  3. void paged_attention_v1(
  4. torch::Tensor& out,
  5. torch::Tensor& query,
  6. torch::Tensor& key_cache,
  7. torch::Tensor& value_cache,
  8. int num_kv_heads,
  9. float scale,
  10. torch::Tensor& block_tables,
  11. torch::Tensor& context_lens,
  12. int block_size,
  13. int max_context_len,
  14. const c10::optional<torch::Tensor>& alibi_slopes,
  15. const std::string& kv_cache_dtype);
  16. void paged_attention_v2(
  17. torch::Tensor& out,
  18. torch::Tensor& exp_sums,
  19. torch::Tensor& max_logits,
  20. torch::Tensor& tmp_out,
  21. torch::Tensor& query,
  22. torch::Tensor& key_cache,
  23. torch::Tensor& value_cache,
  24. int num_kv_heads,
  25. float scale,
  26. torch::Tensor& block_tables,
  27. torch::Tensor& context_lens,
  28. int block_size,
  29. int max_context_len,
  30. const c10::optional<torch::Tensor>& alibi_slopes,
  31. const std::string& kv_cache_dtype);
  32. void rms_norm(
  33. torch::Tensor& out,
  34. torch::Tensor& input,
  35. torch::Tensor& weight,
  36. float epsilon);
  37. void fused_add_rms_norm(
  38. torch::Tensor& input,
  39. torch::Tensor& residual,
  40. torch::Tensor& weight,
  41. float epsilon);
  42. void rotary_embedding(
  43. torch::Tensor& positions,
  44. torch::Tensor& query,
  45. torch::Tensor& key,
  46. int head_size,
  47. torch::Tensor& cos_sin_cache,
  48. bool is_neox);
  49. void batched_rotary_embedding(
  50. torch::Tensor& positions,
  51. torch::Tensor& query,
  52. torch::Tensor& key,
  53. int head_size,
  54. torch::Tensor& cos_sin_cache,
  55. bool is_neox,
  56. int rot_dim,
  57. torch::Tensor& cos_sin_cache_offsets);
  58. void silu_and_mul(
  59. torch::Tensor& out,
  60. torch::Tensor& input);
  61. void gelu_and_mul(
  62. torch::Tensor& out,
  63. torch::Tensor& input);
  64. void gelu_tanh_and_mul(
  65. torch::Tensor& out,
  66. torch::Tensor& input);
  67. void gelu_new(
  68. torch::Tensor& out,
  69. torch::Tensor& input);
  70. void gelu_fast(
  71. torch::Tensor& out,
  72. torch::Tensor& input);
  73. #ifndef USE_ROCM
  74. torch::Tensor awq_gemm(
  75. torch::Tensor _in_feats,
  76. torch::Tensor _kernel,
  77. torch::Tensor _scaling_factors,
  78. torch::Tensor _zeros,
  79. int split_k_iters);
  80. torch::Tensor awq_dequantize(
  81. torch::Tensor _kernel,
  82. torch::Tensor _scaling_factors,
  83. torch::Tensor _zeros,
  84. int split_k_iters,
  85. int thx,
  86. int thy);
  87. torch::Tensor autoquant_s4_f16_gemm(
  88. torch::Tensor _in_feats,
  89. torch::Tensor _kernel,
  90. torch::Tensor _scales_zeros);
  91. void autoquant_convert_s4_k_m8(
  92. torch::Tensor _weight_dest,
  93. torch::Tensor _quant_scales_zeros_dest,
  94. torch::Tensor _workspace,
  95. torch::Tensor _quant_weight_src,
  96. torch::Tensor _quant_scales,
  97. torch::Tensor _quant_zeros,
  98. int m,
  99. int k,
  100. int group_size);
  101. torch::Tensor marlin_gemm(
  102. torch::Tensor& a,
  103. torch::Tensor& b_q_weight,
  104. torch::Tensor& b_scales,
  105. torch::Tensor& workspace,
  106. int64_t size_m,
  107. int64_t size_n,
  108. int64_t size_k);
  109. at::Tensor e8p_mm_origorder(
  110. const at::Tensor& A,
  111. const at::Tensor& B,
  112. const at::Tensor& CB);
  113. void decompress_e8p_origorder(
  114. torch::Tensor YIs,
  115. torch::Tensor CB,
  116. torch::Tensor &Y
  117. );
  118. #endif
  119. void squeezellm_gemm(
  120. torch::Tensor vec,
  121. torch::Tensor mat,
  122. torch::Tensor mul,
  123. torch::Tensor lookup_table);
  124. torch::Tensor gptq_gemm(
  125. torch::Tensor a,
  126. torch::Tensor b_q_weight,
  127. torch::Tensor b_gptq_qzeros,
  128. torch::Tensor b_gptq_scales,
  129. torch::Tensor b_g_idx,
  130. bool use_exllama,
  131. int bit);
  132. void gptq_shuffle(
  133. torch::Tensor q_weight,
  134. torch::Tensor q_perm,
  135. int bit);
  136. torch::Tensor aqlm_gemm(
  137. const torch::Tensor& input,
  138. const torch::Tensor& codes,
  139. const torch::Tensor& codebooks,
  140. const torch::Tensor& scales,
  141. const torch::Tensor& codebook_partition_sizes,
  142. const std::optional<torch::Tensor>& bias
  143. );
  144. torch::Tensor ggml_dequantize(
  145. torch::Tensor X,
  146. int8_t type,
  147. int64_t m,
  148. int64_t n
  149. );
  150. torch::Tensor ggml_mul_mat_vec(
  151. torch::Tensor W, // quant weight
  152. torch::Tensor X, // input
  153. int8_t type,
  154. int64_t m
  155. );
  156. torch::Tensor ggml_mul_mat_vec_a8(
  157. torch::Tensor W, // quant weight
  158. torch::Tensor X, // input
  159. int8_t type,
  160. int64_t row
  161. );
  162. torch::Tensor ggml_mul_mat_a8(
  163. torch::Tensor W, // quant weight
  164. torch::Tensor X, // input
  165. int8_t type,
  166. int64_t row
  167. );
  168. uintptr_t make_q_matrix(
  169. torch::Tensor q_weight,
  170. torch::Tensor q_perm,
  171. torch::Tensor q_invperm,
  172. torch::Tensor q_scale,
  173. torch::Tensor q_scale_max,
  174. torch::Tensor q_groups,
  175. torch::Tensor q_group_map
  176. );
  177. torch::Tensor exl2_gemm(
  178. torch::Tensor a,
  179. uintptr_t b
  180. );
  181. void moe_align_block_size(
  182. torch::Tensor topk_ids,
  183. int num_experts,
  184. int block_size,
  185. torch::Tensor sorted_token_ids,
  186. torch::Tensor expert_ids,
  187. torch::Tensor num_tokens_post_pad
  188. );
  189. #ifndef USE_ROCM
  190. using fptr_t = uint64_t;
  191. fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
  192. const std::vector<std::string> &handles,
  193. const std::vector<int64_t> &offsets, int rank,
  194. bool full_nvlink);
  195. bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
  196. bool full_nvlink);
  197. void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out);
  198. void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &reg_buffer,
  199. torch::Tensor &out);
  200. void dispose(fptr_t _fa);
  201. int meta_size();
  202. void register_buffer(fptr_t _fa, torch::Tensor &t,
  203. const std::vector<std::string> &handles,
  204. const std::vector<int64_t> &offsets);
  205. std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa);
  206. void register_graph_buffers(fptr_t _fa, const std::vector<std::string> &handles,
  207. const std::vector<std::vector<int64_t>> &offsets);
  208. #endif