test_basic_distributed_correctness.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. """Compare the outputs of HF and distributed Aphrodite when using greedy sampling.
  2. Aphrodite will allocate all the available memory, so we need to run the tests one
  3. by one. The solution is to pass arguments (model name) by environment
  4. variables.
  5. Run:
  6. ```sh
  7. TEST_DIST_MODEL=alpindale/gemma-2b pytest \
  8. test_basic_distributed_correctness.py
TEST_DIST_MODEL=mistralai/Mistral-7B-Instruct-v0.2 pytest \
    test_basic_distributed_correctness.py
  11. ```
  12. """
  13. import os
  14. import pytest
  15. import torch
# Model name to test, injected via the TEST_DIST_MODEL environment variable.
# One model per process — Aphrodite allocates all available GPU memory, so
# each model must run in its own pytest invocation (see module docstring).
MODELS = [
    os.environ["TEST_DIST_MODEL"],
]
  19. @pytest.mark.skipif(torch.cuda.device_count() < 2,
  20. reason="Need at least 2 GPUs to run the test.")
  21. @pytest.mark.parametrize("model", MODELS)
  22. @pytest.mark.parametrize("dtype", ["half"])
  23. @pytest.mark.parametrize("max_tokens", [5])
  24. def test_models(
  25. hf_runner,
  26. aphrodite_runner,
  27. example_prompts,
  28. model: str,
  29. dtype: str,
  30. max_tokens: int,
  31. ) -> None:
  32. hf_model = hf_runner(model, dtype=dtype)
  33. hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
  34. del hf_model
  35. aphrodite_model = aphrodite_runner(
  36. model,
  37. dtype=dtype,
  38. tensor_parallel_size=2,
  39. )
  40. aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
  41. max_tokens)
  42. del aphrodite_model
  43. for i in range(len(example_prompts)):
  44. hf_output_ids, hf_output_str = hf_outputs[i]
  45. aphrodite_output_ids, aphrodite_output_str = aphrodite_outputs[i]
  46. assert hf_output_str == aphrodite_output_str, (
  47. f"Test{i}:\nHF: {hf_output_str!r}\nAphrodite: "
  48. f"{aphrodite_output_str!r}")
  49. assert hf_output_ids == aphrodite_output_ids, (
  50. f"Test{i}:\nHF: {hf_output_ids}\nAphrodite: {aphrodite_output_ids}")