// ops.h — declarations for custom kernel operator entry points
// (paged attention, normalization, activations, quantized GEMMs,
// and the custom all-reduce path). Definitions live in the kernel sources.
  1. #pragma once
  2. #include <torch/extension.h>
  3. void paged_attention_v1(
  4. torch::Tensor& out,
  5. torch::Tensor& query,
  6. torch::Tensor& key_cache,
  7. torch::Tensor& value_cache,
  8. int num_kv_heads,
  9. float scale,
  10. torch::Tensor& block_tables,
  11. torch::Tensor& context_lens,
  12. int block_size,
  13. int max_context_len,
  14. const c10::optional<torch::Tensor>& alibi_slopes,
  15. const std::string& kv_cache_dtype);
  16. void paged_attention_v2(
  17. torch::Tensor& out,
  18. torch::Tensor& exp_sums,
  19. torch::Tensor& max_logits,
  20. torch::Tensor& tmp_out,
  21. torch::Tensor& query,
  22. torch::Tensor& key_cache,
  23. torch::Tensor& value_cache,
  24. int num_kv_heads,
  25. float scale,
  26. torch::Tensor& block_tables,
  27. torch::Tensor& context_lens,
  28. int block_size,
  29. int max_context_len,
  30. const c10::optional<torch::Tensor>& alibi_slopes,
  31. const std::string& kv_cache_dtype);
  32. void rms_norm(
  33. torch::Tensor& out,
  34. torch::Tensor& input,
  35. torch::Tensor& weight,
  36. float epsilon);
  37. void fused_add_rms_norm(
  38. torch::Tensor& input,
  39. torch::Tensor& residual,
  40. torch::Tensor& weight,
  41. float epsilon);
  42. void rotary_embedding(
  43. torch::Tensor& positions,
  44. torch::Tensor& query,
  45. torch::Tensor& key,
  46. int head_size,
  47. torch::Tensor& cos_sin_cache,
  48. bool is_neox);
  49. void silu_and_mul(
  50. torch::Tensor& out,
  51. torch::Tensor& input);
  52. void gelu_new(
  53. torch::Tensor& out,
  54. torch::Tensor& input);
  55. void gelu_fast(
  56. torch::Tensor& out,
  57. torch::Tensor& input);
  58. #ifndef USE_ROCM
  59. torch::Tensor awq_gemm(
  60. torch::Tensor _in_feats,
  61. torch::Tensor _kernel,
  62. torch::Tensor _scaling_factors,
  63. torch::Tensor _zeros,
  64. int split_k_iters);
  65. torch::Tensor awq_dequantize(
  66. torch::Tensor _kernel,
  67. torch::Tensor _scaling_factors,
  68. torch::Tensor _zeros,
  69. int split_k_iters,
  70. int thx,
  71. int thy);
  72. void marlin_gemm(
  73. const torch::Tensor& input,
  74. const torch::Tensor& weights,
  75. torch::Tensor& output,
  76. const torch::Tensor& scales,
  77. torch::Tensor& workspace);
  78. at::Tensor e8p_mm_origorder(
  79. const at::Tensor& A,
  80. const at::Tensor& B,
  81. const at::Tensor& CB);
  82. void decompress_e8p_origorder(
  83. torch::Tensor YIs,
  84. torch::Tensor CB,
  85. torch::Tensor &Y
  86. );
  87. #endif
  88. void squeezellm_gemm(
  89. torch::Tensor vec,
  90. torch::Tensor mat,
  91. torch::Tensor mul,
  92. torch::Tensor lookup_table);
  93. torch::Tensor gptq_gemm(
  94. torch::Tensor a,
  95. torch::Tensor b_q_weight,
  96. torch::Tensor b_gptq_qzeros,
  97. torch::Tensor b_gptq_scales,
  98. torch::Tensor b_g_idx,
  99. bool use_exllama,
  100. int bit);
  101. void gptq_shuffle(
  102. torch::Tensor q_weight,
  103. torch::Tensor q_perm,
  104. int bit);
  105. torch::Tensor ggml_dequantize(
  106. torch::Tensor X,
  107. int8_t type,
  108. int64_t m,
  109. int64_t n
  110. );
  111. torch::Tensor ggml_mul_mat_vec(
  112. torch::Tensor W, // quant weight
  113. torch::Tensor X, // input
  114. int8_t type,
  115. int64_t m
  116. );
  117. torch::Tensor ggml_mul_mat_vec_a8(
  118. torch::Tensor W, // quant weight
  119. torch::Tensor X, // input
  120. int8_t type,
  121. int64_t row
  122. );
  123. torch::Tensor ggml_mul_mat_a8(
  124. torch::Tensor W, // quant weight
  125. torch::Tensor X, // input
  126. int8_t type,
  127. int64_t row
  128. );
  129. #ifndef USE_ROCM
  130. using fptr_t = uint64_t;
  131. fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
  132. const std::vector<std::string> &handles,
  133. const std::vector<int64_t> &offsets, int rank,
  134. bool full_nvlink);
  135. bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
  136. bool full_nvlink);
  137. void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out);
  138. void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &reg_buffer,
  139. torch::Tensor &out);
  140. void dispose(fptr_t _fa);
  141. int meta_size();
  142. void register_buffer(fptr_t _fa, torch::Tensor &t,
  143. const std::vector<std::string> &handles,
  144. const std::vector<int64_t> &offsets);
  145. std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa);
  146. void register_graph_buffers(fptr_t _fa, const std::vector<std::string> &handles,
  147. const std::vector<std::vector<int64_t>> &offsets);
  148. #endif