pybind.cpp

#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>
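
// Registers Aphrodite's custom CUDA/HIP kernels with Python through pybind11.
// The bindings are grouped into submodules: `ops` (attention, activation,
// layernorm, rotary-embedding, quantization, and MoE kernels), `cache_ops`
// (KV-cache block management), `cuda_utils` (device attribute queries), and,
// on non-ROCm builds, `custom_ar` (custom all-reduce kernels).
//
// From Python, these submodules are reached under whatever module name
// TORCH_EXTENSION_NAME expands to at build time. As a rough sketch (assuming
// the extension is built as a module named `_C` inside the `aphrodite`
// package; the exact import path depends on the build setup):
//
//   from aphrodite import _C
//   _C.cuda_utils.get_device_attribute(...)   # hypothetical call path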
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // Aphrodite custom ops
  pybind11::module ops = m.def_submodule("ops", "Aphrodite custom operators");

  // Attention ops
  ops.def(
      "paged_attention_v1",
      &paged_attention_v1,
      "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  ops.def(
      "paged_attention_v2",
      &paged_attention_v2,
      "PagedAttention V2.");

  // Activation ops
  ops.def(
      "silu_and_mul",
      &silu_and_mul,
      "Activation function used in SwiGLU.");
  ops.def(
      "gelu_and_mul",
      &gelu_and_mul,
      "Activation function used in GeGLU with `none` approximation.");
  ops.def(
      "gelu_tanh_and_mul",
      &gelu_tanh_and_mul,
      "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
      "gelu_new",
      &gelu_new,
      "GELU implementation used in GPT-2.");
  ops.def(
      "gelu_fast",
      &gelu_fast,
      "Approximate GELU implementation.");

  // Layernorm
  ops.def(
      "rms_norm",
      &rms_norm,
      "Apply Root Mean Square (RMS) Normalization to the input tensor.");
  ops.def(
      "fused_add_rms_norm",
      &fused_add_rms_norm,
      "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def(
      "rotary_embedding",
      &rotary_embedding,
      "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
  ops.def(
      "batched_rotary_embedding",
      &batched_rotary_embedding,
      "Apply batched GPT-NeoX or GPT-J style rotary embedding to query and key");

#ifndef USE_ROCM
  // Quantization ops
  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
  ops.def("autoquant_convert_s4_k_m8", &autoquant_convert_s4_k_m8, "convert kernel.");
  ops.def("autoquant_s4_f16_gemm", &autoquant_s4_f16_gemm, "weight int4 activation float16 gemm kernel.");
  ops.def("quip_decompress", &decompress_e8p_origorder, "decompress_packed_e8p");
  ops.def("quip_gemv", &e8p_mm_origorder, "e8p_mm_origorder");
  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
#endif
  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
  ops.def("ggml_dequantize", &ggml_dequantize, "ggml_dequantize");
  ops.def("ggml_mul_mat_vec", &ggml_mul_mat_vec, "ggml_mul_mat_vec");
  ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8, "ggml_mul_mat_vec_a8");
  ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8, "ggml_mul_mat_a8");
  ops.def("exl2_make_q_matrix", &make_q_matrix, "preprocess for exl2");
  ops.def("exl2_gemm", &exl2_gemm, "exl2 gemm");
  ops.def(
      "moe_align_block_size",
      &moe_align_block_size,
      "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size.");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "Aphrodite cache ops");
  cache_ops.def(
      "swap_blocks",
      &swap_blocks,
      "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
      "copy_blocks",
      &copy_blocks,
      "Copy the cache blocks from src to dst");
  cache_ops.def(
      "reshape_and_cache",
      &reshape_and_cache,
      "Reshape the key and value tensors and cache them");
  cache_ops.def(
      "gather_cached_kv",
      &gather_cached_kv,
      "Gather key and value from the cache into contiguous QKV tensors");
  cache_ops.def(
      "convert_fp8_e5m2",
      &convert_fp8_e5m2,
      "Convert the key and value cache to fp8_e5m2 data type");

  // Cuda utils
  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "Aphrodite cuda utils");
  cuda_utils.def(
      "get_device_attribute",
      &get_device_attribute,
      "Gets the specified device attribute.");
  cuda_utils.def(
      "get_max_shared_memory_per_block_device_attribute",
      &get_max_shared_memory_per_block_device_attribute,
      "Gets the maximum shared memory per block device attribute.");

#ifndef USE_ROCM
  // Custom all-reduce kernels
  pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
  custom_ar.def("dispose", &dispose, "dispose");
  custom_ar.def("meta_size", &meta_size, "meta_size");
  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
                "get_graph_buffer_ipc_meta");
  custom_ar.def("register_graph_buffers", &register_graph_buffers,
                "register_graph_buffers");
#endif
}