ops.h

#pragma once

#include <torch/extension.h>
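// Attention kernels: single-pass (V1) and two-pass partitioned (V2) variants
// of PagedAttention over the block-based KV cache.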
void paged_attention_v1(
    torch::Tensor& out,
    torch::Tensor& query,
    torch::Tensor& key_cache,
    torch::Tensor& value_cache,
    int num_kv_heads,
    float scale,
    torch::Tensor& block_tables,
    torch::Tensor& context_lens,
    int block_size,
    int max_context_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype);

void paged_attention_v2(
    torch::Tensor& out,
    torch::Tensor& exp_sums,
    torch::Tensor& max_logits,
    torch::Tensor& tmp_out,
    torch::Tensor& query,
    torch::Tensor& key_cache,
    torch::Tensor& value_cache,
    int num_kv_heads,
    float scale,
    torch::Tensor& block_tables,
    torch::Tensor& context_lens,
    int block_size,
    int max_context_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype);
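// Layer normalization kernels: plain RMSNorm and a fused residual-add + RMSNorm.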
void rms_norm(
    torch::Tensor& out,
    torch::Tensor& input,
    torch::Tensor& weight,
    float epsilon);

void fused_add_rms_norm(
    torch::Tensor& input,
    torch::Tensor& residual,
    torch::Tensor& weight,
    float epsilon);
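// Rotary positional embedding, applied in place to query and key using a
// precomputed cos/sin cache (GPT-NeoX or GPT-J layout, selected by is_neox).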
void rotary_embedding(
    torch::Tensor& positions,
    torch::Tensor& query,
    torch::Tensor& key,
    int head_size,
    torch::Tensor& cos_sin_cache,
    bool is_neox);
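// Activation kernels: fused gated activations (SiLU/GELU followed by an
// elementwise multiply) and standalone GELU approximations.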
void silu_and_mul(
    torch::Tensor& out,
    torch::Tensor& input);

void gelu_and_mul(
    torch::Tensor& out,
    torch::Tensor& input);

void gelu_new(
    torch::Tensor& out,
    torch::Tensor& input);

void gelu_fast(
    torch::Tensor& out,
    torch::Tensor& input);
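// Quantized GEMM kernels built only for CUDA (excluded on ROCm):
// AWQ, Marlin, and E8P (QuIP#-style) matrix multiplies.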
#ifndef USE_ROCM
torch::Tensor awq_gemm(
    torch::Tensor _in_feats,
    torch::Tensor _kernel,
    torch::Tensor _scaling_factors,
    torch::Tensor _zeros,
    int split_k_iters);

torch::Tensor awq_dequantize(
    torch::Tensor _kernel,
    torch::Tensor _scaling_factors,
    torch::Tensor _zeros,
    int split_k_iters,
    int thx,
    int thy);

void marlin_gemm(
    const torch::Tensor& input,
    const torch::Tensor& weights,
    torch::Tensor& output,
    const torch::Tensor& scales,
    torch::Tensor& workspace);

at::Tensor e8p_mm_origorder(
    const at::Tensor& A,
    const at::Tensor& B,
    const at::Tensor& CB);

void decompress_e8p_origorder(
    torch::Tensor YIs,
    torch::Tensor CB,
    torch::Tensor& Y);
#endif
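// Quantized GEMM kernels built for both CUDA and ROCm:
// SqueezeLLM, GPTQ/ExLlama, and GGML-format dequantize/matmul ops.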
void squeezellm_gemm(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor lookup_table);

torch::Tensor gptq_gemm(
    torch::Tensor a,
    torch::Tensor b_q_weight,
    torch::Tensor b_gptq_qzeros,
    torch::Tensor b_gptq_scales,
    torch::Tensor b_g_idx,
    bool use_exllama,
    int bit);

void gptq_shuffle(
    torch::Tensor q_weight,
    torch::Tensor q_perm,
    int bit);

torch::Tensor ggml_dequantize(
    torch::Tensor X,
    int8_t type,
    int64_t m,
    int64_t n);

torch::Tensor ggml_mul_mat_vec(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t m);

torch::Tensor ggml_mul_mat_vec_a8(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row);

torch::Tensor ggml_mul_mat_a8(
    torch::Tensor W,  // quant weight
    torch::Tensor X,  // input
    int8_t type,
    int64_t row);
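// MoE helper: sorts and pads token-to-expert assignments so each expert's
// tokens are aligned to the block size expected by the fused MoE kernel.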
void moe_align_block_size(
    torch::Tensor topk_ids,
    int num_experts,
    int block_size,
    torch::Tensor sorted_token_ids,
    torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad);
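// Custom all-reduce (CUDA only): a fast NVLink/peer-to-peer all-reduce used
// for tensor-parallel inference, with IPC buffer registration for CUDA graphs.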
#ifndef USE_ROCM
using fptr_t = uint64_t;

fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets, int rank,
                      bool full_nvlink);

bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
                      bool full_nvlink);

void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);

void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);

void dispose(fptr_t _fa);

int meta_size();

void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets);

std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa);

void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
#endif
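
// ---------------------------------------------------------------------------
// Illustration only, not part of this header: a minimal sketch of how these
// declarations are typically exposed to Python from the extension's binding
// translation unit via pybind11. The selection of ops and the docstrings
// below are hypothetical; only the binding pattern itself is standard for
// torch C++ extensions.
//
//   #include <torch/extension.h>
//   #include "ops.h"
//
//   PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
//     m.def("rms_norm", &rms_norm, "Apply RMSNorm and write the result to out");
//     m.def("silu_and_mul", &silu_and_mul, "Fused SiLU activation and multiply");
//     m.def("paged_attention_v1", &paged_attention_v1,
//           "PagedAttention V1 over the block-based KV cache");
//   }
// ---------------------------------------------------------------------------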