test_int8_quant.py 2.2 KB

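"""Tests for the aphrodite scaled_int8_quant custom op.

Covers two modes: dynamic quantization, where the kernel computes per-token
scales and is compared against ref_dynamic_per_token_quant, and static
quantization, where a caller-supplied scale is used and the result is compared
against a simple round-and-clamp reference.
"""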
import pytest
import torch

from aphrodite._custom_ops import scaled_int8_quant
from tests.kernels.quant_utils import ref_dynamic_per_token_quant

DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
                8193]  # Arbitrary values for testing
NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
SEEDS = [0]
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                   dtype: torch.dtype, seed: int) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000

    # reference
    ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.int8)
    # kernel
    ops_out, ops_scales = scaled_int8_quant(x)

    torch.testing.assert_close(ops_scales, ref_scales)
    torch.testing.assert_close(
        ops_out, ref_out, atol=1,
        rtol=0.0)  # big atol to account for rounding errors
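

# For context: "dynamic" quantization derives one scale per token (row) from
# the input itself. A minimal sketch of that math, assuming symmetric
# per-token int8 quantization (an illustrative stand-in, not the exact
# ref_dynamic_per_token_quant helper imported above):
def _sketch_dynamic_per_token_quant(x: torch.Tensor):
    int8_traits = torch.iinfo(torch.int8)
    # scale_i = max(|x_i|) / 127 for each token (row) i
    scales = x.abs().amax(dim=-1, keepdim=True).to(
        torch.float32) / int8_traits.max
    # q_i = clamp(round(x_i / scale_i), -128, 127)
    quantized = (x.to(torch.float32) / scales).round().clamp(
        int8_traits.min, int8_traits.max).to(torch.int8)
    return quantized, scales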


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("scale", SCALE)
@torch.inference_mode()
def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                  dtype: torch.dtype, seed: int,
                                  scale: float) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
    scale = torch.tensor([scale], dtype=torch.float32, device="cuda")

    out1 = (x / scale).round().clamp(int8_traits.min,
                                     int8_traits.max).to(torch.int8)
    out2, _ = scaled_int8_quant(x, scale)

    torch.testing.assert_close(
        out1, out2, atol=1,
        rtol=0.0)  # big atol to account for rounding errors
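

# Usage sketch (assumes a CUDA device and the compiled aphrodite extension),
# mirroring the two call patterns exercised by the tests above:
#
#     x = torch.rand(4, 16, dtype=torch.half, device="cuda")
#     # dynamic: the kernel returns one float32 scale per token
#     q, scales = scaled_int8_quant(x)
#     # static: pass a precomputed scale; e.g. scale=0.5 maps x=63.8 to
#     # round(63.8 / 0.5) = 128, which clamps to 127
#     q2, _ = scaled_int8_quant(x, torch.tensor([0.5], dtype=torch.float32,
#                                               device="cuda"))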