# test_layernorm.py
  1. import pytest
  2. import torch
  3. from aphrodite.modeling.layers.layernorm import RMSNorm
  4. from tests.kernels.utils import opcheck
  5. DTYPES = [torch.half, torch.bfloat16, torch.float]
  6. NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
  7. HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
  8. 8199] # Arbitrary values for testing
  9. ADD_RESIDUAL = [False, True]
  10. SEEDS = [0]
  11. CUDA_DEVICES = [
  12. f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
  13. ]
  14. @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
  15. @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
  16. @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
  17. @pytest.mark.parametrize("dtype", DTYPES)
  18. @pytest.mark.parametrize("seed", SEEDS)
  19. @pytest.mark.parametrize("device", CUDA_DEVICES)
  20. @torch.inference_mode()
  21. def test_rms_norm(
  22. num_tokens: int,
  23. hidden_size: int,
  24. add_residual: bool,
  25. dtype: torch.dtype,
  26. seed: int,
  27. device: str,
  28. ) -> None:
  29. torch.random.manual_seed(seed)
  30. if torch.cuda.is_available():
  31. torch.cuda.manual_seed(seed)
  32. torch.set_default_device(device)
  33. layer = RMSNorm(hidden_size).to(dtype=dtype)
  34. layer.weight.data.normal_(mean=1.0, std=0.1)
  35. scale = 1 / (2 * hidden_size)
  36. x = torch.randn(num_tokens, hidden_size, dtype=dtype)
  37. x *= scale
  38. residual = torch.randn_like(x) * scale if add_residual else None
  39. # NOTE: The reference implementation should be executed first
  40. # because the custom kernel is in-place.
  41. ref_out = layer.forward_native(x, residual)
  42. out = layer(x, residual)
  43. # NOTE: LayerNorm operators (including RMS) typically have larger
  44. # numerical errors than other operators because they involve reductions.
  45. # Therefore, we use a larger tolerance.
  46. if add_residual:
  47. torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
  48. torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
  49. else:
  50. torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
  51. if residual is not None:
  52. opcheck(torch.ops._C.fused_add_rms_norm,
  53. (x, residual, layer.weight.data, layer.variance_epsilon))
  54. else:
  55. opcheck(torch.ops._C.rms_norm,
  56. (out, x, layer.weight.data, layer.variance_epsilon))