// pybind.cpp
  1. #include "cache.h"
  2. #include "cuda_utils.h"
  3. #include "ops.h"
  4. #include <torch/extension.h>
  5. PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  6. // Aphrodite custom ops
  7. pybind11::module ops = m.def_submodule("ops", "Aphrodite Engine custom operators");
  8. // Attention ops
  9. ops.def(
  10. "paged_attention_v1",
  11. &paged_attention_v1,
  12. "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  13. ops.def(
  14. "paged_attention_v2",
  15. &paged_attention_v2,
  16. "PagedAttention V2.");
  17. // Activation ops
  18. ops.def(
  19. "silu_and_mul",
  20. &silu_and_mul,
  21. "Activation function used in SwiGLU.");
  22. ops.def(
  23. "gelu_new",
  24. &gelu_new,
  25. "GELU implementation used in GPT-2.");
  26. ops.def(
  27. "gelu_fast",
  28. &gelu_fast,
  29. "Approximate GELU implementation.");
  30. // Layernorm
  31. ops.def(
  32. "rms_norm",
  33. &rms_norm,
  34. "Apply Root Mean Square (RMS) Normalization to the input tensor.");
  35. ops.def(
  36. "fused_add_rms_norm",
  37. &fused_add_rms_norm,
  38. "In-place fused Add and RMS Normalization");
  39. // Rotary embedding
  40. ops.def(
  41. "rotary_embedding",
  42. &rotary_embedding,
  43. "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
  44. // Quantization ops
  45. #ifndef USE_ROCM
  46. ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  47. ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  48. ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  49. #endif
  50. ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
  51. // misc
  52. ops.def(
  53. "bincount",
  54. &aphrodite_bincount,
  55. "CUDA Graph compatible bincount implementation.");
  56. // Cache ops
  57. pybind11::module cache_ops = m.def_submodule("cache_ops", "Aphrodite Engine cache ops");
  58. cache_ops.def(
  59. "swap_blocks",
  60. &swap_blocks,
  61. "Swap in (out) the cache blocks from src to dst");
  62. cache_ops.def(
  63. "copy_blocks",
  64. &copy_blocks,
  65. "Copy the cache blocks from src to dst");
  66. cache_ops.def(
  67. "reshape_and_cache",
  68. &reshape_and_cache,
  69. "Reshape the key and value tensors and cache them");
  70. cache_ops.def(
  71. "gather_cached_kv",
  72. &gather_cached_kv,
  73. "Gather key and value from the cache into contiguous QKV tensors");
  74. cache_ops.def(
  75. "convert_fp8",
  76. &convert_fp8,
  77. "Convert the KV cache to FP8 datatype");
  78. // Cuda utils
  79. pybind11::module cuda_utils = m.def_submodule("cuda_utils", "Aphrodite Engine cuda utils");
  80. cuda_utils.def(
  81. "get_device_attribute",
  82. &get_device_attribute,
  83. "Gets the specified device attribute.");
  84. }