pybind.cpp

#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // Aphrodite custom ops
  pybind11::module ops = m.def_submodule("ops", "Aphrodite custom operators");

  // Attention ops
  ops.def("paged_attention_v1", &paged_attention_v1,
          "Compute the attention between an input query and the cached "
          "keys/values using PagedAttention.");
  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");

  // Activation ops
  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
  ops.def("gelu_and_mul", &gelu_and_mul,
          "Activation function used in GeGLU with `none` approximation.");
  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
          "Activation function used in GeGLU with `tanh` approximation.");
  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");

  // Layernorm
  ops.def("rms_norm", &rms_norm,
          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
          "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def("rotary_embedding", &rotary_embedding,
          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
  ops.def("batched_rotary_embedding", &batched_rotary_embedding,
          "Apply batched GPT-NeoX or GPT-J style rotary embedding to query and "
          "key");
  32. ops.def("moe_align_block_size", &moe_align_block_size,
  33. "Aligning the number of tokens to be processed by each expert such "
  34. "that it is divisible by the block size.");
  // Cache ops
  pybind11::module cache_ops =
      m.def_submodule("cache_ops", "Aphrodite cache ops");
  cache_ops.def("swap_blocks", &swap_blocks,
                "Swap in (out) the cache blocks from src to dst");
  cache_ops.def("copy_blocks", &copy_blocks,
                "Copy the cache blocks from src to dst");
  cache_ops.def("reshape_and_cache", &reshape_and_cache,
                "Reshape the key and value tensors and cache them");
  cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash,
                "Reshape the key and value tensors and cache them");
  cache_ops.def("convert_fp8", &convert_fp8,
                "Convert the key and value cache to fp8 data type");

  // Cuda utils
  pybind11::module cuda_utils =
      m.def_submodule("cuda_utils", "Aphrodite cuda utils");
  cuda_utils.def("get_device_attribute", &get_device_attribute,
                 "Gets the specified device attribute.");
  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
                 &get_max_shared_memory_per_block_device_attribute,
                 "Gets the maximum shared memory per block device attribute.");

#ifndef USE_ROCM
  // Custom all-reduce kernels
  pybind11::module custom_ar =
      m.def_submodule("custom_ar", "custom allreduce");
  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
  custom_ar.def("dispose", &dispose, "dispose");
  custom_ar.def("meta_size", &meta_size, "meta_size");
  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
                "get_graph_buffer_ipc_meta");
  custom_ar.def("register_graph_buffers", &register_graph_buffers,
                "register_graph_buffers");
#endif
}
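
// Usage sketch (not part of the original file): once the extension is built,
// the submodules defined above are reachable from Python through the compiled
// torch extension module. The import path and the exact op signatures shown
// below are assumptions for illustration only; the real argument lists are
// declared in ops.h, cache.h, and cuda_utils.h.
//
//   import torch
//   from aphrodite import _C                 # hypothetical module path
//
//   out = torch.empty(num_tokens, d, device="cuda", dtype=torch.float16)
//   _C.ops.silu_and_mul(out, x)              # assumed (out, input) convention
//
//   _C.cuda_utils.get_device_attribute(attr, device_id)   # assumed arguments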