10 月之前 · 483c95a2f8
--- a/aphrodite/quantization/awq.py
+++ b/aphrodite/quantization/awq.py
@@ -9,6 +9,7 @@ from aphrodite.modeling.layers.fused_moe import (moe_align_block_size,
 from aphrodite.modeling.layers.linear import (LinearMethodBase,
                                               set_weight_attrs)
 from aphrodite.quantization.base_config import (QuantizationConfig)
+from aphrodite._C import ops as _C_ops
 
 HAS_QUANTS = False
 with suppress(ImportError):
@@ -222,7 +223,7 @@ class AWQLinearMethod(LinearMethodBase):
         out = torch.empty((gate_up.shape[:-1] + (gate_up.shape[-1] // 2, )),
                           dtype=x.dtype,
                           device=x.device)
-        ops.silu_and_mul(out, gate_up)
+        _C_ops.silu_and_mul(out, gate_up)
 
         out = ops.awq_group_gemm(out, w2["qweight"], w2["scales"],
                                  w2["qzeros"], topk_weights, sorted_token_ids,
--- a/aphrodite/quantization/gptq.py
+++ b/aphrodite/quantization/gptq.py
@@ -12,6 +12,7 @@ from aphrodite.modeling.layers.fused_moe import (fused_moe, fused_topk,
 from aphrodite.modeling.layers.linear import LinearMethodBase, set_weight_attrs
 from aphrodite.quantization.base_config import (
     QuantizationConfig, )
+from aphrodite._C import ops as _C_ops
 
 HAS_QUANTS = False
 with suppress(ImportError):
@@ -321,7 +322,7 @@ class GPTQLinearMethod(LinearMethodBase):
             dtype=x.dtype,
             device=x.device,
         )
-        ops.silu_and_mul(out, gate_up)
+        _C_ops.silu_and_mul(out, gate_up)
 
         out = ops.group_gptq_gemm(
             out,