  1. """Compare the outputs of HF and Aphrodite for moe models using greedy sampling.
  2. Run `pytest tests/models/test_phimoe.py`.
  3. """
  4. import pytest
  5. import torch
  6. from aphrodite.common.utils import is_cpu
  7. from ...utils import check_logprobs_close
  8. MODELS = [
  9. "microsoft/Phi-3.5-MoE-instruct",
  10. ]
  11. def test_phimoe_routing_function():
  12. from aphrodite.modeling.models.phimoe import phimoe_routing_function
  13. test_case = {
  14. 0: {
  15. "hidden_states":
  16. torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
  17. dtype=torch.float32,
  18. requires_grad=False).view(4, 2),
  19. "gating_output":
  20. torch.tensor([0.1, 0.2, 0.3, 0.4],
  21. dtype=torch.float32,
  22. requires_grad=False),
  23. "topk":
  24. 2,
  25. "renormalize":
  26. False,
  27. },
  28. 1: {
  29. "hidden_states":
  30. torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
  31. dtype=torch.float32,
  32. requires_grad=False).view(4, 2),
  33. "gating_output":
  34. torch.tensor([0.4, 0.2, 0.3, 0.4],
  35. dtype=torch.float32,
  36. requires_grad=False),
  37. "topk":
  38. 2,
  39. "renormalize":
  40. False,
  41. }
  42. }
  43. ground_truth = {
  44. 0: {
  45. "topk_weights":
  46. torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False),
  47. "topk_ids":
  48. torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
  49. },
  50. 1: {
  51. "topk_weights":
  52. torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False),
  53. "topk_ids":
  54. torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
  55. }
  56. }
  57. for test_id in test_case:
  58. topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
  59. assert torch.allclose(topk_weights,
  60. ground_truth[test_id]["topk_weights"])
  61. assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
  62. def get_gpu_memory():
  63. try:
  64. props = torch.cuda.get_device_properties(torch.cuda.current_device())
  65. gpu_memory = props.total_memory / (1024**3)
  66. return gpu_memory
  67. except Exception:
  68. return 0
@pytest.mark.skipif(condition=is_cpu(),
                    reason="This test takes a lot time to run on CPU, "
                    "and aphrodite CI's disk space is not enough for this "
                    "model.")
@pytest.mark.skipif(condition=get_gpu_memory() < 100,
                    reason="Skip this test if GPU memory is insufficient.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Greedy-generate the same prompts with HF and Aphrodite and compare
    the per-token top-``num_logprobs`` logprobs via ``check_logprobs_close``.

    The ``hf_runner`` / ``aphrodite_runner`` / ``example_prompts`` fixtures
    come from the project's conftest; ``model``, ``dtype``, ``max_tokens``
    and ``num_logprobs`` are supplied by the parametrize markers above.
    """
    # NOTE(review): HF runs first and its context manager exits before the
    # Aphrodite runner starts — presumably so both copies of this large MoE
    # model are never resident in GPU memory at once; confirm with conftest.
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
        aphrodite_outputs = aphrodite_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # Tolerant comparison: passes if each greedy token appears among the
    # other side's top-k logprob candidates.
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=aphrodite_outputs,
        name_0="hf",
        name_1="aphrodite",
    )