  1. """Compare the outputs of HF and distributed Aphrodite when using
  2. greedy sampling.
  3. Run:
  4. ```sh
  5. cd $APHRODITE_PATH/tests
  6. pytest distributed/test_basic_distributed_correctness.py
  7. ```
  8. """
  9. import os
  10. import pytest
  11. from aphrodite.common.utils import cuda_device_count_stateless
  12. from ..models.utils import check_outputs_equal
  13. from ..utils import fork_new_process_for_each_test
  14. TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
  15. @pytest.mark.skipif(cuda_device_count_stateless() < 2,
  16. reason="Need at least 2 GPUs to run the test.")
  17. @pytest.mark.parametrize(
  18. "model, distributed_executor_backend, attention_backend, test_suite", [
  19. ("facebook/opt-125m", "ray", "", "L4"),
  20. ("facebook/opt-125m", "mp", "", "L4"),
  21. ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
  22. ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
  23. ("facebook/opt-125m", "ray", "", "A100"),
  24. ("facebook/opt-125m", "mp", "", "A100"),
  25. ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
  26. ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
  27. ])
  28. @fork_new_process_for_each_test
  29. def test_models(
  30. hf_runner,
  31. aphrodite_runner,
  32. example_prompts,
  33. model: str,
  34. distributed_executor_backend: str,
  35. attention_backend: str,
  36. test_suite: str,
  37. ) -> None:
  38. if test_suite != TARGET_TEST_SUITE:
  39. pytest.skip(f"Skip test for {test_suite}")
  40. if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
  41. # test ray adag
  42. os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
  43. os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"
  44. if attention_backend:
  45. os.environ["APHRODITE_ATTENTION_BACKEND"] = attention_backend
  46. dtype = "half"
  47. max_tokens = 5
  48. # NOTE: take care of the order. run Aphrodite first, and then run HF.
  49. # Aphrodite needs a fresh new process without cuda initialization.
  50. # if we run HF first, the cuda initialization will be done and it
  51. # will hurt multiprocessing backend with fork method (the default method).
  52. with aphrodite_runner(model,
  53. dtype=dtype,
  54. tensor_parallel_size=2,
  55. distributed_executor_backend=distributed_executor_backend
  56. ) as aphrodite_model:
  57. aphrodite_outputs = aphrodite_model.generate_greedy(
  58. example_prompts, max_tokens)
  59. with hf_runner(model, dtype=dtype) as hf_model:
  60. hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
  61. check_outputs_equal(
  62. outputs_0_lst=hf_outputs,
  63. outputs_1_lst=aphrodite_outputs,
  64. name_0="hf",
  65. name_1="aphrodite",
  66. )