gptq_marlin_24.py

from typing import Any, Dict, List, Optional

import torch
from loguru import logger
from torch.nn.parameter import Parameter

from aphrodite import _custom_ops as ops
from aphrodite.modeling.layers.linear import LinearBase, LinearMethodBase
from aphrodite.modeling.parameter import (BaseAphroditeParameter,
                                          ChannelQuantScaleParameter,
                                          GroupQuantScaleParameter,
                                          PackedAphroditeParameter)
from aphrodite.quantization.base_config import QuantizationConfig
from aphrodite.scalar_type import scalar_types
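
# Note: the "24" in the name refers to the 2:4 (semi-structured) sparsity
# pattern used by the sparse Marlin kernels: GPTQ-quantized weights where two
# out of every four values along the input dimension are pruned.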
GPTQ_MARLIN_24_TILE = 16
GPTQ_MARLIN_24_MIN_THREAD_N = 128
GPTQ_MARLIN_24_MIN_THREAD_K = 128
GPTQ_MARLIN_24_MAX_PARALLEL = 64

GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES = [
    scalar_types.uint4b8, scalar_types.uint8b128
]
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]


class GPTQMarlin24Config(QuantizationConfig):
    """Config class for Marlin24."""

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
    ) -> None:
        quant_type = {
            4: scalar_types.uint4b8,
            8: scalar_types.uint8b128,
        }.get(weight_bits)

        self.group_size = group_size

        # Verify supported quant type and group size.
        if quant_type is None or \
                quant_type not in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES:
            raise ValueError(
                f"Marlin_24 does not support quant_type = {quant_type}. "
                f"Only weight_bits = {GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES} "
                "are supported.")
        if self.group_size not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES:
            raise ValueError(
                f"Marlin_24 does not support group_size = {self.group_size}. "
                f"Only group_sizes = {GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES} "
                "are supported.")

        self.quant_type = quant_type

        # Weight bits packed into a 32-bit datatype.
        self.pack_factor = 32 // self.quant_type.size_bits

        # Tile size used by the marlin kernels.
        self.tile_size = 16

        # Min out_features dim
        self.min_n_threads = GPTQ_MARLIN_24_MIN_THREAD_N

        # Min in_features dim
        self.min_k_threads = GPTQ_MARLIN_24_MIN_THREAD_K

        # Max parallel problems to solve at once (improves large
        # batch performance)
        self.max_parallel = GPTQ_MARLIN_24_MAX_PARALLEL

        # Permutation length used by the marlin kernels.
        self.perm_len = 1024

    def __repr__(self) -> str:
        return "Marlin24Config(quant_type={}, group_size={})".format(
            self.quant_type, self.group_size)

    @classmethod
    def get_name(cls) -> str:
        return "gptq_marlin_24"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        # TODO: confirm; 80 corresponds to compute capability 8.0 (Ampere).
        return 80

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlin24Config":
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        return cls(weight_bits, group_size)
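
    # Illustrative quantize_config.json contents this config can be built
    # from (values are hypothetical):
    #
    #     {"bits": 4, "group_size": 128, "checkpoint_format": "marlin_24"}
    #
    # "bits" and "group_size" are read by from_config(); a "checkpoint_format"
    # of "marlin_24" is what makes override_quantization_method() below
    # select this kernel.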

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg,
                                     user_quant) -> Optional[str]:
        is_marlin_24_format = (
            hf_quant_cfg.get("checkpoint_format") == "marlin_24")

        is_valid_user_quant = (user_quant is None or user_quant == "gptq"
                               or user_quant == "gptq_marlin_24")

        if is_marlin_24_format and is_valid_user_quant:
            msg = ("The model is serialized in {} format. "
                   "Using {} kernel.".format(cls.get_name(), cls.get_name()))
            logger.info(msg)
            return cls.get_name()

        return None

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["GPTQMarlin24LinearMethod"]:
        if isinstance(layer, LinearBase):
            return GPTQMarlin24LinearMethod(self)
        return None

    def get_scaled_act_names(self) -> List[str]:
        return []
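
# A minimal usage sketch (hypothetical values, not from a real checkpoint):
# pack_factor is derived from the quant type, so a 4-bit config packs
# 32 // 4 = 8 quantized weights into each int32 word.
#
#     cfg = GPTQMarlin24Config.from_config({"bits": 4, "group_size": 128})
#     assert cfg.quant_type == scalar_types.uint4b8
#     assert cfg.pack_factor == 8 and cfg.tile_size == GPTQ_MARLIN_24_TILE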


class GPTQMarlin24LinearMethod(LinearMethodBase):
    """Linear method for Marlin24.

    Args:
        quant_config: The Marlin24 quantization config.
    """

    def __init__(self, quant_config: GPTQMarlin24Config):
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        del output_size  # Unused.
        weight_loader = extra_weight_attrs["weight_loader"]
        if params_dtype != torch.float16:
            raise ValueError(
                f"The params dtype must be float16, but got {params_dtype}")

        # Validate output_size_per_partition
        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % self.quant_config.min_n_threads != 0:
            raise ValueError(
                f"Weight output_size_per_partition = "
                f"{output_size_per_partition} is not divisible by "
                f"min_n_threads = {self.quant_config.min_n_threads}.")
        if output_size_per_partition % self.quant_config.pack_factor != 0:
            raise ValueError(
                f"Weight output_size_per_partition = "
                f"{output_size_per_partition} is not divisible by "
                f"pack_factor = {self.quant_config.pack_factor}.")

        # Validate input_size_per_partition
        if input_size_per_partition % self.quant_config.min_k_threads != 0:
            raise ValueError(
                f"Weight input_size_per_partition = "
                f"{input_size_per_partition} is not divisible by "
                f"min_k_threads = {self.quant_config.min_k_threads}.")
        if (self.quant_config.group_size != -1 and
                input_size_per_partition % self.quant_config.group_size != 0):
            raise ValueError(f"Weight input_size_per_partition = "
                             f"{input_size_per_partition} is not divisible by "
                             f"group_size = {self.quant_config.group_size}.")

        # Check that we have at least 4 tiles horizontally in the shard
        num_tiles_per_perm = self.quant_config.perm_len // (
            self.quant_config.tile_size**2)
        if output_size_per_partition % num_tiles_per_perm != 0:
            raise ValueError(
                "Each permutation group must reside on the same gpu")

        # Quantized weights packed into int32.
        qweight = PackedAphroditeParameter(
            data=torch.empty(
                input_size_per_partition // self.quant_config.tile_size // 2,
                output_size_per_partition * self.quant_config.tile_size //
                self.quant_config.pack_factor,
                device="cuda",
                dtype=torch.int32,
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=self.quant_config.pack_factor,
            marlin_tile_size=self.quant_config.tile_size,
            weight_loader=weight_loader)
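
        # Shape check (hypothetical 4096 x 4096 shard, 4-bit weights):
        # rows = 4096 // 16 // 2 = 128 and cols = 4096 * 16 // 8 = 8192
        # int32 words. The extra // 2 along the input dim reflects the 2:4
        # sparsity: only half of the weights along k are actually stored.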

        # Meta
        meta = PackedAphroditeParameter(data=torch.empty(
            input_size_per_partition // 8 // 2 // 2,
            output_size_per_partition * 2,
            device="cuda",
            dtype=torch.int16,
        ),
                                        input_dim=0,
                                        output_dim=1,
                                        packed_dim=1,
                                        packed_factor=1,
                                        marlin_tile_size=2,
                                        weight_loader=weight_loader)
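
        # B_meta holds the 2:4 structured-sparsity metadata (int16 index
        # words) that the sparse Marlin GEMM uses to locate the kept values
        # in each group of four weights along the input dimension.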

        # Determine if channelwise or not
        input_groups = (1 if self.quant_config.group_size == -1 else
                        input_size_per_partition //
                        self.quant_config.group_size)

        weight_scale_args = {
            "data":
            torch.empty(
                input_groups,
                output_size_per_partition,
                device="cuda",
                dtype=params_dtype,
            ),
            "weight_loader":
            weight_loader
        }
        if input_groups == 1:
            scales = ChannelQuantScaleParameter(output_dim=1,
                                                **weight_scale_args)
        else:
            scales = GroupQuantScaleParameter(output_dim=1,
                                              input_dim=0,
                                              **weight_scale_args)

        # Allocate workspace (used for internal locking mechanism)
        max_workspace_size = (
            output_size_per_partition //
            self.quant_config.min_n_threads) * self.quant_config.max_parallel

        workspace = BaseAphroditeParameter(data=torch.zeros(max_workspace_size,
                                                            device="cuda",
                                                            dtype=torch.int),
                                           weight_loader=weight_loader)

        layer.register_parameter("B_24", qweight)
        layer.register_parameter("B_meta", meta)
        layer.register_parameter("s", scales)
        layer.register_parameter("workspace", workspace)
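
        # Note: the registered names ("B_24", "B_meta", "s") are presumably
        # the tensor names expected in a marlin_24-serialized checkpoint;
        # "workspace" is only allocated here and zero-initialized.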

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # required by torch.compile
        layer.B_24 = Parameter(layer.B_24.data, requires_grad=False)
        layer.s = Parameter(layer.s.data, requires_grad=False)
        layer.B_meta = Parameter(layer.B_meta.data, requires_grad=False)
        layer.workspace = Parameter(layer.workspace.data, requires_grad=False)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        qweight = layer.B_24
        meta = layer.B_meta
        scales = layer.s
        workspace = layer.workspace

        x_2d = x.view(-1, x.shape[-1])

        size_m = x_2d.shape[0]
        size_k = x_2d.shape[1]
        size_n = scales.shape[1]
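
        # GEMM problem shape for the sparse Marlin kernel: m = flattened
        # batch/sequence rows, k = input features, n = output features
        # (recovered from the second dim of the scales).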
        output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales,
                                            workspace,
                                            self.quant_config.quant_type,
                                            size_m, size_n, size_k)

        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))

        if bias is not None:
            output.add_(bias)  # In-place add

        return output