compressed_tensors_unquantized.py

from typing import Callable, List, Optional

import torch
import torch.nn.functional as F
from torch.nn import Parameter

from aphrodite.modeling.utils import set_weight_attrs
from aphrodite.quantization.compressed_tensors.schemes import (
    CompressedTensorsScheme)

__all__ = ["CompressedTensorsUnquantized"]

class CompressedTensorsUnquantized(CompressedTensorsScheme):
    """
    Implements the scheme for all layers that are ignored in the
    CompressedTensors config: the weight is loaded as-is and applied to
    the input in a plain linear transformation, with no quantization.
    """

    def get_min_capability(self) -> int:
        # Volta and newer (compute capability 7.0 and up).
        return 70

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Unquantized weights need no post-load repacking.
        pass

    def create_weights(self, layer: torch.nn.Module,
                       output_partition_sizes: List[int],
                       input_size_per_partition: int,
                       params_dtype: torch.dtype, weight_loader: Callable,
                       **kwargs):
        # Allocate an empty (output, input) weight for this partition;
        # `weight_loader` fills in the values from the checkpoint later.
        weight = Parameter(torch.empty(sum(output_partition_sizes),
                                       input_size_per_partition,
                                       dtype=params_dtype),
                           requires_grad=False)

        # Record which axes are the input/output dimensions so the sharded
        # loader can slice checkpoint tensors per tensor-parallel rank.
        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, {"weight_loader": weight_loader})

    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
                      bias: Optional[torch.Tensor]) -> torch.Tensor:
        # Plain GEMM on the unmodified weight: y = x @ W.T + bias.
        return F.linear(x, layer.weight, bias)
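

# A minimal usage sketch of driving this scheme directly, assuming a toy
# weight loader: `_copy_loader`, the shapes, and the dtype below are
# illustrative choices, not part of Aphrodite's API; in practice the
# linear-layer machinery supplies the real loader and partition sizes.
if __name__ == "__main__":
    def _copy_loader(param: Parameter, loaded: torch.Tensor) -> None:
        # Illustrative loader: copy a full checkpoint tensor into place.
        param.data.copy_(loaded)

    layer = torch.nn.Module()
    scheme = CompressedTensorsUnquantized()
    scheme.create_weights(layer,
                          output_partition_sizes=[256],
                          input_size_per_partition=128,
                          params_dtype=torch.float32,
                          weight_loader=_copy_loader)

    # create_weights attached the loader to the parameter itself.
    layer.weight.weight_loader(layer.weight, torch.randn(256, 128))
    scheme.process_weights_after_loading(layer)  # no-op for this scheme

    out = scheme.apply_weights(layer, torch.randn(4, 128), bias=None)
    assert out.shape == (4, 256)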