# compressed_tensors_unquantized.py
  1. from typing import Callable, List, Optional
  2. import torch
  3. import torch.nn.functional as F
  4. from torch.nn import Parameter
  5. from aphrodite.quantization.compressed_tensors.schemes import (
  6. CompressedTensorsScheme)
  7. from aphrodite.modeling.utils import set_weight_attrs
  8. __all__ = ["CompressedTensorsUnquantized"]
  9. class CompressedTensorsUnquantized(CompressedTensorsScheme):
  10. """
  11. Implements the scheme for all layers which are ignored
  12. in the CompressedTensors config. The input and loaded weight are used
  13. in a linear transformation.
  14. """
  15. def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
  16. pass
  17. def create_weights(self, layer: torch.nn.Module,
  18. output_partition_sizes: List[int],
  19. input_size_per_partition: int,
  20. params_dtype: torch.dtype, weight_loader: Callable,
  21. **kwargs):
  22. weight = Parameter(torch.empty(sum(output_partition_sizes),
  23. input_size_per_partition,
  24. device="cuda",
  25. dtype=params_dtype),
  26. requires_grad=False)
  27. set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
  28. layer.register_parameter("weight", weight)
  29. set_weight_attrs(weight, {"weight_loader": weight_loader})
  30. def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
  31. bias: Optional[torch.Tensor]) -> torch.Tensor:
  32. return F.linear(x, layer.weight, bias)