test_pipeline_parallel.py

  1. """
  2. WARNING: This test runs in both single-node (4 GPUs) and multi-node
  3. (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
  4. important to set the distributed backend to "mp" to avoid Ray scheduling
  5. all workers in a node other than the head node, which can cause the test
  6. to fail.
  7. """
import os

import pytest
from loguru import logger

from ..utils import compare_two_settings, fork_new_process_for_each_test

APHRODITE_MULTI_NODE = os.getenv("APHRODITE_MULTI_NODE", "0") == "1"


@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
                          "MODEL_NAME, DIST_BACKEND"),
                         [
                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
                         ])
@fork_new_process_for_each_test
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
                    DIST_BACKEND):
    if APHRODITE_MULTI_NODE and DIST_BACKEND == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        DIST_BACKEND,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    # schedule all workers on a node other than the head node,
    # which can cause the test to fail.
    tp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--tensor-parallel-size",
        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
        "--distributed-executor-backend",
        "mp",
    ]
    if CHUNKED_PREFILL:
        pp_args.append("--enable-chunked-prefill")
        tp_args.append("--enable-chunked-prefill")
    if EAGER_MODE:
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")

    pp_env = None
    if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
            and CHUNKED_PREFILL):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "APHRODITE_USE_RAY_COMPILED_DAG": "1",
            "APHRODITE_USE_RAY_SPMD_WORKER": "1",
            "APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not
        # properly terminate because of an aDAG issue.
        pp_args.append("--disable-frontend-multiprocessing")
        tp_args.append("--disable-frontend-multiprocessing")

    try:
        compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")
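

# Example invocations for the two modes described in the module docstring.
# These are illustrative only: the relative test path and the surrounding CI
# harness are assumptions, not defined in this file.
#
#   # Single-node mode: one machine with 4 GPUs (TP_SIZE * PP_SIZE <= 4).
#   pytest tests/distributed/test_pipeline_parallel.py
#
#   # Multi-node mode: 2 nodes with 2 GPUs each; the "mp" backend cases are
#   # skipped via the APHRODITE_MULTI_NODE check at the top of the test.
#   APHRODITE_MULTI_NODE=1 pytest tests/distributed/test_pipeline_parallel.py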