"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import os

import pytest
from loguru import logger

from ..utils import compare_two_settings, fork_new_process_for_each_test

APHRODITE_MULTI_NODE = os.getenv("APHRODITE_MULTI_NODE", "0") == "1"
  13. @pytest.mark.parametrize(
  14. ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
  15. "MODEL_NAME, DIST_BACKEND"),
  16. [
  17. (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
  18. (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
  19. (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
  20. (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
  21. (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
  22. (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
  23. (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
  24. (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
  25. (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
  26. (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
  27. (2, 2, 1, 1, 1, "internlm/internlm2_5-7b-chat", "ray"),
  28. ],
  29. )
  30. @fork_new_process_for_each_test
  31. def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
  32. TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
  33. if APHRODITE_MULTI_NODE and DIST_BACKEND == "mp":
  34. pytest.skip("Skipping multi-node pipeline parallel test for "
  35. "multiprocessing distributed backend")
  36. pp_args = [
  37. # use half precision for speed and memory savings in CI environment
  38. "--dtype",
  39. "float16",
  40. "--pipeline-parallel-size",
  41. str(PP_SIZE),
  42. "--tensor-parallel-size",
  43. str(TP_SIZE),
  44. "--distributed-executor-backend",
  45. DIST_BACKEND,
  46. ]
  47. # compare without pipeline parallelism
  48. # NOTE: use mp backend for TP
  49. # PP tests might involve multiple nodes, and ray might
  50. # schedule all workers in a node other than the head node,
  51. # which can cause the test to fail.
  52. tp_args = [
  53. # use half precision for speed and memory savings in CI environment
  54. "--dtype",
  55. "bfloat16",
  56. "--tensor-parallel-size",
  57. str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
  58. "--distributed-executor-backend",
  59. "mp",
  60. ]
  61. if CHUNKED_PREFILL:
  62. pp_args.append("--enable-chunked-prefill")
  63. tp_args.append("--enable-chunked-prefill")
  64. if EAGER_MODE:
  65. pp_args.append("--enforce-eager")
  66. tp_args.append("--enforce-eager")
  67. if TRUST_REMOTE_CODE:
  68. pp_args.append("--trust-remote-code")
  69. tp_args.append("--trust-remote-code")
  70. pp_env = None
  71. if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
  72. and CHUNKED_PREFILL):
  73. # Test Ray ADAG for a subset of the tests
  74. pp_env = {
  75. "APHRODITE_USE_RAY_COMPILED_DAG": "1",
  76. "APHRODITE_USE_RAY_SPMD_WORKER": "1",
  77. "APHRODITE_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
  78. }
  79. # Temporary. Currently when zeromq + SPMD is used, it does not properly
  80. # terminate because of aDAG issue.
  81. pp_args.append("--disable-frontend-multiprocessing")
  82. tp_args.append("--disable-frontend-multiprocessing")
  83. try:
  84. compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
  85. except Exception:
  86. if pp_env is None:
  87. raise
  88. else:
  89. # Ray ADAG tests are flaky, so we don't want to fail the test
  90. logger.exception("Ray ADAG tests failed")