import re
from enum import Enum
from typing import Any, Dict, Iterable, Optional

from pydantic import BaseModel, Field
from torch.nn import Module

from aphrodite.quantization.utils.quant_utils import FUSED_LAYER_NAME_MAPPING


class CompressionFormat(Enum):
    dense = "dense"
    sparse_bitmask = "sparse-bitmask"
    naive_quantized = "naive-quantized"
    float_quantized = "float-quantized"
    int_quantized = "int-quantized"
    pack_quantized = "pack-quantized"
    marlin_24 = "marlin-24"


class QuantizationType(str, Enum):
    """
    Enum storing quantization type options
    """
    INT = "int"
    FLOAT = "float"


class QuantizationStrategy(str, Enum):
    """
    Enum storing quantization strategy options
    """
    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"
    BLOCK = "block"
    TOKEN = "token"


class QuantizationArgs(BaseModel):
    """
    User-facing arguments used to define a quantization config
    for weights or activations

    :param num_bits: quantization bit depth
    :param type: dtype to quantize to, either int or float
    :param symmetric: whether or not the quantization scale is symmetric
    :param strategy: string determining the scope of scale/zero-point to apply
    :param group_size: group length to use for the group strategy
    :param block_structure: 2d block structure to use for the block
        strategy, must be of the format "2x4", "8x16", etc.
    :param dynamic: set True to perform dynamic quantization -
        values will not be calibrated during a calibration phase;
        instead, new quantization ranges will be observed with every
        sample during inference. Defaults to False for static
        quantization. Note that enabling dynamic quantization
        will change the default observer to a memoryless one.
    """
    num_bits: int = 8
    type: QuantizationType = QuantizationType.INT
    symmetric: bool = True
    group_size: Optional[int] = None
    strategy: Optional[QuantizationStrategy] = None
    block_structure: Optional[str] = None
    dynamic: bool = False
    observer: str = Field(
        default="minmax",
        description=("The class to use to compute the quantization params - "
                     "scale and zero-point"),
    )
    observer_kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description=
        ("optional dict of kwargs to be passed directly to torch quantization "
         "Observers constructor excluding quantization range or symmetry"),
    )
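
# Illustrative usage (a sketch, not part of the original module): an 8-bit
# symmetric integer weight config quantized in groups of 128 elements, similar
# to configs found in compressed-tensors checkpoints.
#
#   weight_args = QuantizationArgs(
#       num_bits=8,
#       type=QuantizationType.INT,
#       symmetric=True,
#       strategy=QuantizationStrategy.GROUP,
#       group_size=128,
#   )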


def is_activation_quantization_format(format: str) -> bool:
    _ACTIVATION_QUANTIZATION_FORMATS = [
        CompressionFormat.naive_quantized.value,
        CompressionFormat.int_quantized.value,
        CompressionFormat.float_quantized.value
    ]
    return format in _ACTIVATION_QUANTIZATION_FORMATS


def should_ignore_layer(layer_name: Optional[str],
                        ignore: Iterable[str]) -> bool:
    if layer_name is None:
        return False

    # layer_name = model.layers.0.self_attn.qkv_proj
    # proj_name = qkv_proj
    proj_name = layer_name.split(".")[-1]

    # Fused layers like gate_up_proj or qkv_proj will not be fused
    # in the safetensors checkpoint. So, we convert the name
    # from the fused version to unfused + check to make sure that
    # each shard of the fused layer has the same scheme.
    if proj_name in FUSED_LAYER_NAME_MAPPING:
        shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]

        # Convert fused_name --> [shard_names]
        shard_names = [
            layer_name.replace(proj_name, shard_proj_name)
            for shard_proj_name in shard_proj_names
        ]

        # Layer should be ignored if shards are ignored.
        should_ignore_layer = None
        for shard_name in shard_names:
            should_ignore_shard = check_equal_or_regex_match(
                layer_name=shard_name, targets=ignore)

            # If shard_idx=0, set layer ignore to match shard.
            if should_ignore_layer is None:
                should_ignore_layer = should_ignore_shard

            # If shard_idx=1+, confirm the scheme matches prior shards.
            elif should_ignore_shard != should_ignore_layer:
                raise ValueError(
                    f"Found different quantization schemes for "
                    f"{shard_proj_names} in {layer_name}. "
                    "Aphrodite requires all to use the same scheme.")

    # Unfused layers like down_proj and o_proj will match
    # the safetensors checkpoint already.
    else:
        should_ignore_layer = check_equal_or_regex_match(
            layer_name=layer_name, targets=ignore)

    assert should_ignore_layer is not None
    return should_ignore_layer
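
# Illustrative sketch (not part of the original module): assuming
# FUSED_LAYER_NAME_MAPPING maps "qkv_proj" to its unfused q/k/v shard names,
# a fused layer name is expanded before matching, so every shard must agree
# on whether the layer is ignored.
#
#   should_ignore_layer("model.layers.0.self_attn.qkv_proj",
#                       ignore=["re:.*lm_head"])             # -> False
#   should_ignore_layer("lm_head", ignore=["re:.*lm_head"])  # -> True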


def check_equal_or_regex_match(layer_name: str,
                               targets: Iterable[str]) -> bool:
    """
    Checks whether a layer_name is exactly equal to, or (when a target
    starts with 're:') a regex match for, any target in the list.
    """
    for target in targets:
        if _is_equal_or_regex_match(layer_name, target):
            return True
    return False


def find_matched_target(layer_name: Optional[str], module: Module,
                        targets: Iterable[str]) -> str:
    """
    Helper function to look up which "target" in the compressed-tensors
    config a layer corresponds to.

    Recall that a compressed-tensors config has a concept of
    config_groups, where each layer can be quantized with a different
    scheme.

    targets in each config_group will be a list of either layer names
    (or regexes corresponding to layer names) or names of torch Modules.

    First, we try to match the layer_name with a target.
    Second, we try to match the module's name with a target.

    :param layer_name: layer name
    :param module: torch.nn.Module
    :param targets: list of targets to match the layer against
    """
    if layer_name is None:
        layer_name = ""

    matched_target = (_find_first_match(layer_name, targets)
                      or _find_first_match(module.__class__.__name__, targets,
                                           True))

    if matched_target is None:
        raise ValueError(
            f"Unable to find matching target for {module} in the "
            "compressed-tensors config.")

    return matched_target


def _find_first_match(value: str,
                      targets: Iterable[str],
                      check_contains: bool = False) -> Optional[str]:
    """
    Returns the first element of targets that matches value either
    exactly or as a regex after 're:'. If check_contains is set to True,
    additionally checks if the target string is contained within the value.

    :param value: string to compare the list of targets against
    :param targets: list of targets to match the layer against
    :param check_contains: whether or not to do a substring match
    """
    for target in targets:
        if _is_equal_or_regex_match(value,
                                    target,
                                    check_contains=check_contains):
            return target
    return None


def get_compressed_tensors_cache_scale(name: str) -> Optional[str]:
    """
    Check whether the param name matches the format for k/v cache scales
    in compressed-tensors. If this is the case, return its equivalent
    param name expected by Aphrodite.

    :param name: param name
    :return: matching param name for KV cache scale in Aphrodite
    """
    if name.endswith(".output_scale") and ".k_proj" in name:
        return name.replace(".k_proj.output_scale", ".attn.k_scale")
    if name.endswith(".output_scale") and ".v_proj" in name:
        return name.replace(".v_proj.output_scale", ".attn.v_scale")
    # If no matches, return None
    return None
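
# Illustrative sketch (not part of the original module): a checkpoint param
# named "model.layers.0.self_attn.k_proj.output_scale" is remapped to the
# name Aphrodite expects for its KV cache scale; unrelated names return None.
#
#   get_compressed_tensors_cache_scale(
#       "model.layers.0.self_attn.k_proj.output_scale")
#   # -> "model.layers.0.self_attn.attn.k_scale"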


def _is_equal_or_regex_match(value: str,
                             target: str,
                             check_contains: bool = False) -> bool:
    """
    Checks whether a value is exactly equal or a regex match for target
    if target starts with 're:'. If check_contains is set to True,
    additionally checks if the target string is contained within the value.
    """
    if target.startswith("re:"):
        pattern = target[3:]
        if re.match(pattern, value):
            return True
    elif check_contains:
        if target.lower() in value.lower():
            return True
    elif target == value:
        return True

    return False
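
# Illustrative sketch (not part of the original module): targets prefixed with
# "re:" are treated as regexes anchored at the start of the value (re.match),
# while plain targets require exact equality unless check_contains is set.
#
#   _is_equal_or_regex_match("model.lm_head", "re:.*lm_head")   # -> True
#   _is_equal_or_regex_match("Linear", "Linear")                # -> True
#   _is_equal_or_regex_match("LinearMethod", "Linear")          # -> False
#   _is_equal_or_regex_match("LinearMethod", "Linear",
#                            check_contains=True)               # -> True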