|
- # Supports AQLM compression, see https://github.com/Vahe1994/AQLM
- # and https://arxiv.org/pdf/2401.06118.pdf
- import math
- from contextlib import suppress
- from typing import Any, Dict, List, Optional
- import torch
- import torch.nn.functional as F
- from torch.nn.parameter import Parameter
- from aphrodite.modeling.layers.linear import LinearBase, LinearMethodBase
- from aphrodite.modeling.utils import set_weight_attrs
- from aphrodite.quantization.base_config import QuantizationConfig
# The compiled quantization kernels are optional: record whether they could
# be imported instead of failing at module import time.
try:
    from aphrodite._quant_C import quant_ops as ops
except ImportError:
    HAS_QUANTS = False
else:
    HAS_QUANTS = True
def get_int_dtype(nbits: int) -> torch.dtype:
    """Return the narrowest signed torch integer dtype of at least ``nbits``.

    :param nbits: number of bits per code.
    :return: one of ``torch.int8``, ``torch.int16``, ``torch.int32`` or
        ``torch.int64``.
    :raises ValueError: if ``nbits`` is larger than 64.
    """
    # Ordered from narrowest to widest so the first match is the smallest fit.
    width_to_dtype = (
        (8, torch.int8),
        (16, torch.int16),
        (32, torch.int32),
        (64, torch.int64),
    )
    for max_bits, dtype in width_to_dtype:
        if nbits <= max_bits:
            return dtype
    raise ValueError(f"No dtype available for {nbits}-bit codebooks")
@torch.inference_mode()
def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor:
    """Widen stored codes to int64 and reduce them modulo ``2**nbits``.

    Codes are stored in signed integer tensors, so large code values appear
    as negatives; the remainder with a positive modulus maps them back to
    ``[0, 2**nbits)``.

    :param data: integer tensor of stored codes.
    :param nbits: number of bits per code.
    :return: int64 tensor with the same shape as ``data``.
    """
    widened = data.to(torch.int64)
    modulus = 2**nbits
    return torch.remainder(widened, modulus)
def dequantize_weight(codes: torch.Tensor,
                      codebooks: torch.Tensor,
                      scales: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Rebuild a dense weight matrix from additive-quantization codes.

    Each (out_group, in_group) cell of the weight is the sum of one vector
    picked from every codebook, optionally multiplied by ``scales``.

    :param codes: tensor of integer quantization codes, shape
        [*dims, num_out_groups, num_in_groups, num_codebooks]
    :param codebooks: tensor of vectors for each quantization code,
        [num_codebooks, codebook_size, out_group_size, in_group_size]
    :param scales: weight will be multiplied by this factor, must be
        broadcastable with
        [*dims, num_out_groups, num_in_groups, out_group_size, in_group_size]
    :return: reconstructed weight tensor of shape
        [*dims, num_out_groups*out_group_size, num_in_groups*in_group_size]
    """
    leading_dims = list(codes.shape[:-3])
    num_out_groups, num_in_groups, _ = codes.shape[-3:]
    num_codebooks, codebook_size, out_group_size, in_group_size = \
        codebooks.shape

    # All codebooks are stacked into one lookup table, so a code from
    # codebook k must be shifted by k * codebook_size to address its rows.
    row_offsets = torch.arange(
        0, num_codebooks * codebook_size, codebook_size,
        device=codes.device)  # shape: [num_codebooks]
    lookup_table = codebooks.flatten(0, 1).flatten(-2, -1)
    bag_indices = codes.flatten(0, -2) + row_offsets

    # One "bag" per (out_group, in_group) cell; summing the bag adds up the
    # selected vector of every codebook.
    # -> [prod(dims) * num_out_groups * num_in_groups,
    #     out_group_size * in_group_size]
    summed = F.embedding_bag(bag_indices, lookup_table, mode="sum")

    grouped = summed.view(
        leading_dims +
        [num_out_groups, num_in_groups, out_group_size, in_group_size])
    if scales is not None:
        grouped = grouped * scales

    # Bring each group-size axis next to its group axis before merging them.
    weight_shape = leading_dims + [
        num_out_groups * out_group_size, num_in_groups * in_group_size
    ]
    return grouped.swapaxes(-3, -2).reshape(weight_shape)
def dequantize_gemm(
    input: torch.Tensor,  # [..., in_features]
    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    """Dequantize the full weight matrix, then apply it as a linear layer.

    :param input: activations whose last dimension is ``in_features``.
    :param codes: stored quantization codes.
    :param codebooks: codebook vectors.
    :param scales: multiplier applied to the reconstructed weight.
    :param bias: optional bias forwarded to ``F.linear``.
    :return: result of ``F.linear`` with the reconstructed weight.
    """
    codebook_size = codebooks.shape[1]
    # For a power-of-two codebook size this is log2(codebook_size), i.e. the
    # number of bits per code.
    nbits = codebook_size.bit_length() - 1
    unpacked_codes = unpack_int_data(codes, nbits)
    weight = dequantize_weight(unpacked_codes, codebooks, scales)
    return F.linear(input, weight, bias)
- # Generic dequantization, slow but flexible.
def generic_dequantize_gemm(
    input: torch.Tensor,  # [..., in_features]
    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    """Run a dequantize-then-matmul separately for every output partition.

    ``codes``, ``scales`` and ``bias`` are sliced along the output dimension
    by the partition sizes, while ``codebooks`` is split into equally sized
    consecutive chunks, one per partition. Each shard result is written into
    its slice of a preallocated output tensor.

    :param input: activations whose last dimension is ``in_features``.
    :param codes: stored codes for all partitions, concatenated on dim 0.
    :param codebooks: codebooks for all partitions, concatenated on dim 0.
    :param scales: per-output scales for all partitions.
    :param output_partition_sizes: number of output rows of each partition.
    :param bias: optional bias covering all partitions.
    :return: tensor of shape ``input.shape[:-1] + (scales.shape[0],)``.
    """
    result_shape = input.shape[:-1] + (scales.shape[0], )
    result = torch.empty(result_shape, dtype=input.dtype, device=input.device)
    partition_count = len(output_partition_sizes)

    # Handling each partition on its own was measured (by the original
    # author) to be faster than dequantizing everything and doing one big
    # multiply at the end.
    codebooks_per_partition = codebooks.shape[0] // partition_count

    assert (scales.shape[0] == codes.shape[0])
    assert (sum(output_partition_sizes) == scales.shape[0])

    row_start = 0
    codebook_start = 0
    for partition_rows in output_partition_sizes:
        partition_codes = codes.narrow(0, row_start, partition_rows)
        partition_codebooks = codebooks.narrow(0, codebook_start,
                                               codebooks_per_partition)
        partition_scales = scales.narrow(0, row_start, partition_rows)
        if bias is None:
            partition_bias = None
        else:
            partition_bias = bias.narrow(0, row_start, partition_rows)

        partition_output = dequantize_gemm(input, partition_codes,
                                           partition_codebooks,
                                           partition_scales, partition_bias)

        destination = result.narrow(-1, row_start, partition_rows)
        assert (destination.shape == partition_output.shape)
        destination.copy_(partition_output)

        row_start += partition_rows
        codebook_start += codebooks_per_partition
    return result
- # Optimized dequantize/decompression kernels, supports 1x16 and 2x8
- # at 6 and 9 times faster than the generic version above, respectively.
def optimized_dequantize_gemm(
    input: torch.Tensor,  # [..., in_features]
    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
    codebooks: torch.
    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    """Dequantize with the compiled kernel, then matmul and apply scales.

    Without a bias the scales are applied to the matmul output; with a bias
    they are applied to the dequantized weight before the matmul, so the
    bias itself is not scaled.

    :param input: activations whose last dimension is ``in_features``.
    :param codes: stored quantization codes.
    :param codebooks: codebook vectors.
    :param scales: per-output scales.
    :param output_partition_sizes: forwarded to ``ops.aqlm_dequant``.
    :param bias: optional bias forwarded to ``F.linear``.
    :return: the scaled linear output.
    """
    # NOTE(review): presumably the unscaled dense weight of shape
    # [out_features, in_features] -- confirm against the kernel.
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)

    if bias is not None:
        # One scale per weight row, broadcast across that row's columns.
        row_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
            -1, weights.shape[1])
        weights.mul_(row_scales)
        return F.linear(input, weights, bias)

    # scaling the output is fastest, so we do that when possible.
    output = F.linear(input, weights)
    original_shape = output.shape
    output_rows = output.view(-1, output.size(-1))
    column_scales = scales.view(-1, scales.shape[0]).expand(
        output_rows.shape[0], -1)
    # In-place on a view, so ``output`` itself ends up scaled.
    output_rows.mul_(column_scales)
    return output.view(original_shape)
class AQLMConfig(QuantizationConfig):
    """Quantization settings for AQLM-compressed checkpoints.

    Reference: https://github.com/Vahe1994/AQLM
    """

    def __init__(
        self,
        in_group_size: int,
        nbits_per_codebook: int,
        num_codebooks: int,
        out_group_size: int,
    ) -> None:
        """Store the AQLM hyper-parameters and derive the pack factor."""
        self.in_group_size = in_group_size
        self.nbits_per_codebook = nbits_per_codebook
        self.num_codebooks = num_codebooks
        self.out_group_size = out_group_size

        # out_group_size > 1 is untested, and probably won't work as-is.
        assert (self.out_group_size == 1)

        # Number of weight elements covered by a single code entry.
        self.pack_factor = (self.in_group_size * self.out_group_size)

    def __repr__(self) -> str:
        shown_fields = ("in_group_size", "nbits_per_codebook",
                        "num_codebooks", "out_group_size")
        rendered = ", ".join(f"{name}={getattr(self, name)}"
                             for name in shown_fields)
        return f"AQLMConfig({rendered})"

    @classmethod
    def get_name(cls) -> str:
        """Return the identifier of this quantization method."""
        return "aqlm"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        """Return the supported activation dtypes (half precision only)."""
        return [torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum capability value required by this method."""
        # NOTE(review): presumably CUDA compute capability 7.0 -- confirm.
        return 70

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        """Return extra config file names to look for; AQLM needs none."""
        return []  # no extra configs.

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig":
        """Build an ``AQLMConfig`` from a quantization config dictionary."""
        # Listed in constructor-argument order.
        required_keys = ("in_group_size", "nbits_per_codebook",
                         "num_codebooks", "out_group_size")
        values = [cls.get_from_keys(config, [key]) for key in required_keys]
        return cls(*values)

    def get_quant_method(
            self, layer: torch.nn.Module) -> Optional["AQLMLinearMethod"]:
        """Return the AQLM linear method for linear layers, else ``None``."""
        if not isinstance(layer, LinearBase):
            return None
        return AQLMLinearMethod(self)

    def get_scaled_act_names(self) -> List[str]:
        """Return the scaled activation names; AQLM has none."""
        return []
class AQLMLinearMethod(LinearMethodBase):
    """Linear method for AQLM.

    Args:
        quant_config: The AQLM quantization config.
    """

    def __init__(self, quant_config: AQLMConfig):
        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        """Allocate and register the ``codes``, ``codebooks`` and ``scales``
        parameters on ``layer``.

        Args:
            layer: module that receives the three parameters.
            input_size_per_partition: input features handled by this shard.
            output_partition_sizes: output features of each fused partition
                handled by this shard.
            input_size: unused.
            output_size: unused.
            params_dtype: dtype of codebooks and scales; must be
                ``torch.half``.
            **extra_weight_attrs: extra attributes attached to every
                parameter.

        Raises:
            ValueError: if ``params_dtype`` is not half, or if the input or
                output size is not a multiple of its group size.
        """
        del output_size  # Unused.
        del input_size  # Unused.
        config = self.quant_config

        if params_dtype != torch.half:
            raise ValueError("Only half is currently supported by aqlm")
        if input_size_per_partition % config.in_group_size != 0:
            raise ValueError(
                "The input size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size.")

        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % config.out_group_size != 0:
            raise ValueError(
                "The output size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size.")

        # There could actually be two pack factors, one along input and one
        # along output, but out_group_size > 1 is not currently supported,
        # so only a single packed dimension is recorded on the codes.
        codes = Parameter(
            torch.empty(
                output_size_per_partition,
                input_size_per_partition // config.pack_factor,
                config.num_codebooks,
                dtype=get_int_dtype(config.nbits_per_codebook),
            ),
            requires_grad=False,
        )
        set_weight_attrs(
            codes,
            {
                "input_dim": 1,
                "output_dim": 0,
                "packed_dim": 1,
                "pack_factor": config.pack_factor,
            },
        )

        # Every fused output partition brings its own set of codebooks; they
        # are stacked along dim 0.
        codebooks = Parameter(
            torch.empty(
                config.num_codebooks * len(output_partition_sizes),
                2**config.nbits_per_codebook,
                config.out_group_size,
                config.in_group_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        set_weight_attrs(
            codebooks,
            {
                # metadata indicates fixed size concatenated along dim 0
                "is_metadata":
                True,
                "output_partition_sizes":
                torch.tensor(output_partition_sizes, device='cpu'),
            },
        )

        scales = Parameter(
            torch.empty(
                (
                    output_size_per_partition // config.out_group_size,
                    1,
                    1,
                    1,
                ),
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        set_weight_attrs(
            scales,
            {
                "output_dim": 0,
                "packed_dim": 0,
                "pack_factor": config.out_group_size
            },
        )

        for param_name, param in (("codes", codes), ("codebooks", codebooks),
                                  ("scales", scales)):
            layer.register_parameter(param_name, param)
            set_weight_attrs(param, extra_weight_attrs)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute the quantized linear layer output for ``x``.

        Formats with dedicated kernels use the compiled extension when it is
        available. Every other format, and every format when the extension
        could not be imported, goes through ``generic_dequantize_gemm``.

        Args:
            layer: module holding ``codes``, ``codebooks`` and ``scales``.
            x: input activations.
            bias: optional bias.

        Returns:
            The output of the linear layer.
        """
        codebooks = layer.codebooks
        codes = layer.codes
        scales = layer.scales
        output_partition_sizes = getattr(codebooks, "output_partition_sizes",
                                         None)

        num_codebooks = codes.shape[2]
        in_group_size = codebooks.shape[3]
        out_group_size = codebooks.shape[2]
        codebook_size = codebooks.shape[1]

        # We support these formats (2 codebooks x 8 bits, 1 codebook x 16
        # bits) with dedicated gemm and decompression kernels.
        has_dedicated_kernel = (
            in_group_size == 8 and out_group_size == 1
            and (codebook_size, num_codebooks) in ((256, 2), (65536, 1)))

        # Only touch `ops` when the extension was actually imported;
        # otherwise the name is undefined and the call would raise NameError
        # even though the generic path can serve the same inputs.
        if has_dedicated_kernel and HAS_QUANTS:
            # thresholds determined by timings on an A6000, one GPU
            use_gemv = math.prod(x.shape[:-1]) <= 6
            if use_gemv:
                return ops.aqlm_gemm(
                    x,
                    codes,
                    codebooks,
                    scales,
                    output_partition_sizes,
                    bias,
                )
            return optimized_dequantize_gemm(
                x,
                codes,
                codebooks,
                scales,
                output_partition_sizes,
                bias,
            )

        # fall back all unoptimized formats
        return generic_dequantize_gemm(
            x,
            codes,
            codebooks,
            scales,
            output_partition_sizes,
            bias,
        )
|