@@ -1,84 +1,12 @@
 import re
-from enum import Enum
-from typing import Any, Dict, Iterable, Optional
+from typing import Iterable, Optional
 
-from pydantic import BaseModel, Field
+from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
 from aphrodite.quantization.utils.quant_utils import FUSED_LAYER_NAME_MAPPING
 
 
-class CompressionFormat(Enum):
-    dense = "dense"
-    sparse_bitmask = "sparse-bitmask"
-    naive_quantized = "naive-quantized"
-    float_quantized = "float-quantized"
-    int_quantized = "int-quantized"
-    pack_quantized = "pack-quantized"
-    marlin_24 = "marlin-24"
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-    TOKEN = "token"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config
-    for weights or activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantized to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric
-    :param strategy: string determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block
-        strategy, must be of the format "2x4", "8x16", etc.
-    :param dynamic: set True to perform dynamic quantization -
-        values will not be calibrated during calibration phase,
-        instead during inference new quantization ranges will be
-        observed with every sample. Defaults to False for static
-        quantization. Note that enabling dynamic quantization
-        will change the default observer to a memoryless one
-    """
-
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
-    dynamic: bool = False
-    observer: str = Field(
-        default="minmax",
-        description=("The class to use to compute the quantization param - "
-                     "scale and zero-point'"),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=
-        ("optional dict of kwargs to be passed directly to torch quantization "
-         "Observers constructor excluding quantization range or symmetry"),
-    )
-
-
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,