compressed_tensors_unquantized.py

from typing import Callable, List, Optional

import torch
import torch.nn.functional as F
from torch.nn import Parameter

from aphrodite.modeling.utils import set_weight_attrs
from aphrodite.quantization.compressed_tensors.schemes import (
    CompressedTensorsScheme)

__all__ = ["CompressedTensorsUnquantized"]


class CompressedTensorsUnquantized(CompressedTensorsScheme):
    """
    Implements the scheme for all layers that are ignored
    in the CompressedTensors config. The input and loaded weight are used
    in a linear transformation.
    """

    @classmethod
    def get_min_capability(cls) -> int:
        # Volta (compute capability 7.0) and up.
        return 70

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Nothing to do: the weight is used exactly as loaded.
        pass

    def create_weights(self, layer: torch.nn.Module,
                       output_partition_sizes: List[int],
                       input_size_per_partition: int,
                       params_dtype: torch.dtype, weight_loader: Callable,
                       **kwargs):
        # Allocate one merged weight covering all output partitions.
        weight = Parameter(torch.empty(sum(output_partition_sizes),
                                       input_size_per_partition,
                                       dtype=params_dtype),
                           requires_grad=False)

        # Record which dimensions are sharded for tensor parallelism and
        # attach the loader used when reading checkpoint weights.
        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, {"weight_loader": weight_loader})

    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
                      bias: Optional[torch.Tensor]) -> torch.Tensor:
        return F.linear(x, layer.weight, bias)
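
A minimal usage sketch follows, assuming aphrodite is installed and that CompressedTensorsUnquantized is importable from the schemes package (as __all__ suggests). In practice the scheme is selected by the CompressedTensors config rather than constructed by hand; the _copy_loader helper and all shapes below are hypothetical, chosen only to exercise the scheme's three methods.

import torch
from torch.nn import Module, Parameter

# Hypothetical direct import; normally the framework instantiates the scheme.
from aphrodite.quantization.compressed_tensors.schemes import (
    CompressedTensorsUnquantized)


def _copy_loader(param: Parameter, loaded_weight: torch.Tensor) -> None:
    # Minimal weight loader: copy the checkpoint tensor into the parameter.
    param.data.copy_(loaded_weight)


layer = Module()
scheme = CompressedTensorsUnquantized()

# One output partition of 256, input width 128 (arbitrary example sizes).
scheme.create_weights(layer,
                      output_partition_sizes=[256],
                      input_size_per_partition=128,
                      params_dtype=torch.float32,
                      weight_loader=_copy_loader)

# Simulate checkpoint loading via the weight_loader attribute attached by
# set_weight_attrs, then run the plain linear transform.
layer.weight.weight_loader(layer.weight, torch.randn(256, 128))
scheme.process_weights_after_loading(layer)

x = torch.randn(4, 128)
out = scheme.apply_weights(layer, x, bias=None)
print(out.shape)  # torch.Size([4, 256])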