compressed_tensors_unquantized.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. from typing import Callable, List, Optional
  2. import torch
  3. import torch.nn.functional as F
  4. from aphrodite.modeling.parameter import ModelWeightParameter
  5. from aphrodite.quantization.compressed_tensors.schemes import (
  6. CompressedTensorsScheme)
  7. __all__ = ["CompressedTensorsUnquantized"]
  8. class CompressedTensorsUnquantized(CompressedTensorsScheme):
  9. """
  10. Implements the scheme for all layers which are ignored
  11. in the CompressedTensors config. The input and loaded weight are used
  12. in a linear transformation.
  13. """
  14. @classmethod
  15. def get_min_capability(cls) -> int:
  16. # volta and up
  17. return 70
  18. def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
  19. # required by torch.compile to be torch.nn.Parameter
  20. layer.weight = torch.nn.Parameter(layer.weight.data,
  21. requires_grad=False)
  22. def create_weights(self, layer: torch.nn.Module,
  23. output_partition_sizes: List[int],
  24. input_size_per_partition: int,
  25. params_dtype: torch.dtype, weight_loader: Callable,
  26. **kwargs):
  27. weight = ModelWeightParameter(data=torch.empty(
  28. sum(output_partition_sizes),
  29. input_size_per_partition,
  30. dtype=params_dtype),
  31. input_dim=1,
  32. output_dim=0,
  33. weight_loader=weight_loader)
  34. layer.register_parameter("weight", weight)
  35. def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
  36. bias: Optional[torch.Tensor]) -> torch.Tensor:
  37. return F.linear(x, layer.weight, bias)