@@ -7,66 +7,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   pybind11::module ops = m.def_submodule("ops", "Aphrodite custom operators");

   // Attention ops
-  ops.def(
-    "paged_attention_v1",
-    &paged_attention_v1,
-    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
-  ops.def(
-    "paged_attention_v2",
-    &paged_attention_v2,
-    "PagedAttention V2.");
+  ops.def("paged_attention_v1", &paged_attention_v1,
+          "Compute the attention between an input query and the cached "
+          "keys/values using PagedAttention.");
+  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");

   // Activation ops
-  ops.def(
-    "silu_and_mul",
-    &silu_and_mul,
-    "Activation function used in SwiGLU.");
-  ops.def(
-    "gelu_and_mul",
-    &gelu_and_mul,
-    "Activation function used in GeGLU with `none` approximation.");
-  ops.def(
-    "gelu_tanh_and_mul",
-    &gelu_tanh_and_mul,
-    "Activation function used in GeGLU with `tanh` approximation.");
-  ops.def(
-    "gelu_new",
-    &gelu_new,
-    "GELU implementation used in GPT-2.");
-  ops.def(
-    "gelu_fast",
-    &gelu_fast,
-    "Approximate GELU implementation.");
+  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
+  ops.def("gelu_and_mul", &gelu_and_mul,
+          "Activation function used in GeGLU with `none` approximation.");
+  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+          "Activation function used in GeGLU with `tanh` approximation.");
+  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
+  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");

   // Layernorm
-  ops.def(
-    "rms_norm",
-    &rms_norm,
-    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+  ops.def("rms_norm", &rms_norm,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor.");

-  ops.def(
-    "fused_add_rms_norm",
-    &fused_add_rms_norm,
-    "In-place fused Add and RMS Normalization");
+  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
+          "In-place fused Add and RMS Normalization");

   // Rotary embedding
-  ops.def(
-    "rotary_embedding",
-    &rotary_embedding,
-    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+  ops.def("rotary_embedding", &rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

   // Cache ops
-  pybind11::module cache_ops = m.def_submodule("cache_ops", "Aphrodite cache ops");
-  cache_ops.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  cache_ops.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  cache_ops.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
+  pybind11::module cache_ops =
+      m.def_submodule("cache_ops", "Aphrodite cache ops");
+  cache_ops.def("swap_blocks", &swap_blocks,
+                "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def("copy_blocks", &copy_blocks,
+                "Copy the cache blocks from src to dst");
+  cache_ops.def("reshape_and_cache", &reshape_and_cache,
+                "Reshape the key and value tensors and cache them");
 }
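For context on the pattern being reformatted above: each def_submodule call creates a named namespace on the extension module, and each def call registers a C++ function under it with a docstring. Below is a minimal, self-contained sketch of that same pattern, not part of this patch; the module name "example" and the add() function are hypothetical stand-ins for TORCH_EXTENSION_NAME and the real kernel launchers.

// Minimal sketch of the def_submodule/def pattern used in the patch.
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Hypothetical stand-in for a kernel entry point such as silu_and_mul.
int add(int a, int b) { return a + b; }

PYBIND11_MODULE(example, m) {
  // Group related functions under a named submodule, mirroring the
  // "ops" and "cache_ops" submodules above.
  py::module ops = m.def_submodule("ops", "Example operators");
  ops.def("add", &add, "Add two integers.");
}

After building the extension (for instance with c++ -O3 -shared -std=c++17 -fPIC $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix)), the function is reachable from Python as example.ops.add(1, 2).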