  1. """Tests for the machete kernel.
  2. Run `pytest tests/kernels/test_machete_gemm.py`.
  3. """
  4. import math
  5. from typing import Optional, Tuple
  6. import pytest
  7. import torch
  8. from aphrodite import _custom_ops as ops
  9. from aphrodite.platforms import current_platform
  10. from aphrodite.quantization.utils.quant_utils import (pack_rows,
  11. quantize_weights)
  12. from aphrodite.scalar_type import ScalarType, scalar_types
  13. CUDA_DEVICES = [
  14. f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
  15. ]
  16. MNK_SHAPES = [
  17. (1, 128, 128),
  18. (1, 512, 1024),
  19. (1, 4096, 4096),
  20. (13, 8192, 4096),
  21. (26, 4096, 8192),
  22. (1, 4096, 4096),
  23. (257, 128, 4096),
  24. (257, 4224, 4160),
  25. (257, 4096, 4096),
  26. (64, 4096, 4096),
  27. (1024, 4096, 8192),
  28. (1024, 8192, 4096),
  29. ]
  30. ACT_TYPES = [torch.float16, torch.bfloat16]
WTYPE_ZEROPOINTS = [
    # GPTQ style
    (scalar_types.uint4b8, False),
    (scalar_types.uint8b128, False),
    # AWQ style
    (scalar_types.uint4, True),
    (scalar_types.uint8, True),
]
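
# Note: the GPTQ-style types above carry an implicit bias in the type itself
# (uint4b8 is unsigned 4-bit with a bias of 8, uint8b128 is unsigned 8-bit
# with a bias of 128), so they are tested without explicit zero points. The
# AWQ-style types are plain unsigned integers and rely on per-group zero
# points instead, hence they are paired with zero_points=True.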
# TODO: in a future PR, refactor this and `is_quant_method_supported` in the
# kernel unit tests into a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods,
# an assumption that is breaking down now that quantization methods can have
# multiple kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
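# The capability check above restricts machete to compute capability 9.x
# (Hopper) and newer GPUs.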


def rand_data(shape, dtype=torch.float16):
    return 10 * (torch.rand(shape, dtype=dtype, device="cuda") - 0.3)


# Machete applies zero points after scaling, so an integer zero point `zp`
# is folded into the additive term `-s * zp` here.
def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor):
    return zps if zps is None else -1 * s * (zps.to(s.dtype))
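

# Quantize `w` to `wtype`, pack the sub-byte values row-wise into int32 words
# with `pack_rows`, convert to column-major, and then let
# `ops.machete_prepack_B` rearrange the packed weights into the layout the
# machete kernel expects. The unquantized reference `w_ref` is returned
# alongside so callers can compare against a plain floating-point matmul.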
def machete_quantize_and_pack(w: torch.Tensor,
                              wtype: ScalarType,
                              group_size: int,
                              zero_points: bool = False):
    assert wtype.is_integer(), "TODO: support floating point weights"

    w_ref, w_q, w_s, w_zp = quantize_weights(
        w,
        wtype,
        group_size,
        zero_points=zero_points,
        # to match how the kernel applies zps
        ref_zero_points_after_scales=True)

    w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
    w_q = w_q.t().contiguous().t()  # convert to col major
    w_q_machete = ops.machete_prepack_B(w_q, wtype)

    return w_ref, w_q_machete, w_s, w_zp


def machete_gemm_test_helper(a: torch.Tensor, b: torch.Tensor,
                             wtype: ScalarType, group_size: int,
                             zero_points: bool):
    w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
        b, wtype, group_size, zero_points)

    output_ref = torch.matmul(a, w_ref)

    output = ops.machete_gemm(
        a=a,
        b_q=w_q_packed,
        b_type=wtype,
        b_scales=w_s,
        b_zeros=maybe_convert_zeropoints(w_zp, w_s),
        b_group_size=group_size,
    )

    # Relax atol as our reduction dim becomes larger (more rounding error)
    # Relax atol when we have zeropoints since the way machete applies
    # zeropoints (after scales) causes noise around 0
    atol = 1 if zero_points else min(5e-2 * math.sqrt(a.shape[1]), 1)
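    # e.g. for k = 4096 this gives min(5e-2 * 64, 1) = 1, while for k = 128
    # it gives roughly 0.57.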
    torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)


@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="Machete is not supported on this GPU type.")
@pytest.mark.parametrize("shape",
                         MNK_SHAPES,
                         ids=lambda x: "x".join(str(v) for v in x))
@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
@pytest.mark.parametrize("group_size", [128, None])
def test_machete_all_schedules(shape, atype: torch.dtype,
                               wtype_zeropoints: Tuple[ScalarType, bool],
                               group_size: Optional[int]):
    m, n, k = shape
    wtype, zero_points = wtype_zeropoints

    if group_size is not None and k % group_size != 0:
        return

    print(f"MNK = {m} {n} {k}")

    # Normalize group_size
    if group_size is None:
        group_size = k
    assert group_size <= k
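    # (group_size == k means a single quantization group spanning the whole
    # reduction dim, i.e. one scale per output channel.)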

    a = rand_data((m, k), atype)
    w = rand_data((k, n), atype)

    w_ref, w_q_machete, w_s, w_zp = machete_quantize_and_pack(
        w, wtype, group_size, zero_points)

    output_ref = torch.matmul(a, w_ref)

    for schedule in ops.machete_supported_schedules(wtype):
        print(f"Testing schedule {schedule}")
        output = ops.machete_gemm(
            a,
            b_q=w_q_machete,
            b_type=wtype,
            b_scales=w_s,
            b_zeros=maybe_convert_zeropoints(w_zp, w_s),
            b_group_size=group_size,
            schedule=schedule,
        )

        # Relax atol as our reduction dim becomes larger (more rounding error)
        # Relax atol when we have zeropoints since the way machete applies
        # zeropoints (after scales) causes noise around 0
        atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
        torch.testing.assert_close(output,
                                   output_ref,
                                   rtol=1e-1,
                                   atol=atol,
                                   msg=f"Schedule failed {schedule}")


@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="Machete is not supported on this GPU type.")
@pytest.mark.parametrize("shape",
                         MNK_SHAPES,
                         ids=lambda x: "x".join(str(v) for v in x))
@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
@pytest.mark.parametrize("group_size", [128, None])
def test_machete_heuristic(shape, atype: torch.dtype,
                           wtype_zeropoints: Tuple[ScalarType, bool],
                           group_size: Optional[int]):
    m, n, k = shape
    wtype, zero_points = wtype_zeropoints

    if group_size is not None and k % group_size != 0:
        return

    # Normalize group_size
    if group_size is None:
        group_size = k
    assert group_size <= k

    a = rand_data((m, k), atype)
    b = rand_data((k, n), atype)

    machete_gemm_test_helper(a, b, wtype, group_size, zero_points)


# Test working on other devices
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="Machete is not supported on this GPU type.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_machete_devices(device: str):
    m, n, k = 512, 4096, 4096
    wtype = scalar_types.uint4b8
    group_size = 128
    zero_points = False

    print(f"MNK = {m} {n} {k}, device = {device}")

    a = rand_data((m, k), torch.float16).to(device)
    b = rand_data((k, n), torch.float16).to(device)

    machete_gemm_test_helper(a, b, wtype, group_size, zero_points)


# Test working with a subset of A and B
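# (The slices below keep the strides of the parent tensors, so this also
# exercises inputs whose leading dimension is larger than the problem size.)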
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="Machete is not supported on this GPU type.")
def test_machete_subset():
    big_m, big_n, big_k = 1024, 1024, 1024
    m, n, k = 512, 512, 512

    wtype = scalar_types.uint4b8
    group_size = 128
    zero_points = False

    whole_a = rand_data((big_m, big_k), torch.float16)
    whole_b = rand_data((big_k, big_n), torch.float16)

    a = whole_a[0:m, 0:k]
    b = whole_b[0:k, 0:n]

    machete_gemm_test_helper(a, b, wtype, group_size, zero_points)


# Test to make sure cuda graphs work
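# The layer below takes its machete_gemm arguments at construction time so
# that the same device tensors are reused for capture and replay: CUDA graph
# replay reuses the memory addresses recorded during capture, so the inputs
# must not be reallocated between runs.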
class MacheteLayer(torch.nn.Module):

    def __init__(self, **kwargs):
        super().__init__()
        self.kwargs = kwargs

    def forward(self, a):
        return ops.machete_gemm(**self.kwargs)


@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="Machete is not supported on this GPU type.")
def test_machete_cuda_graph():
    m, n, k = 512, 4096, 4096

    a = rand_data((m, k), torch.float16)
    b = rand_data((k, n), torch.float16)
    wtype = scalar_types.uint4b8
    group_size = 128
    zero_points = False

    w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
        b, wtype, group_size, zero_points)

    # Construct a trivial model with a single layer that calls a machete kernel
    model = MacheteLayer(
        a=a,
        b_q=w_q_packed,
        b_type=wtype,
        b_scales=w_s,
        b_zeros=maybe_convert_zeropoints(w_zp, w_s),
        b_group_size=group_size,
    )

    output_ref = torch.matmul(a, w_ref)

    # Run the model with a cuda graph
    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            output = model(a)
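
    # Zeroing the captured output and replaying the graph checks that replay
    # actually re-runs the kernel and writes into the recorded output buffer.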
    output.zero_()
    g.replay()

    # Relax atol as our reduction dim becomes larger (more rounding error)
    # Relax atol when we have zeropoints since the way machete applies
    # zeropoints (after scales) causes noise around 0
    atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
    torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)