test_integration_dist_tp2.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. """Tests which cover integration of the speculative decoding framework with
  2. tensor parallelism.
  3. """
  4. import pytest
  5. import torch
  6. from aphrodite.common.utils import is_hip
  7. from .conftest import run_greedy_equality_correctness_test
  8. @pytest.mark.skipif(torch.cuda.device_count() < 2,
  9. reason="Need at least 2 GPUs to run the test.")
  10. @pytest.mark.parametrize(
  11. "common_llm_kwargs",
  12. [{
  13. "model": "JackFram/llama-68m",
  14. # Skip cuda graph recording for fast test.
  15. "enforce_eager": True,
  16. # Required for spec decode.
  17. "use_v2_block_manager": True,
  18. "tensor_parallel_size": 2,
  19. # Use AsyncLLM engine, so that the engine runs in its own process.
  20. # Otherwise, since aphrodite does not follow true SPMD, the test runner
  21. # process will have both the engine and the rank0 worker. NCCL is not
  22. # cleaned up properly, and its server host thread leaks, causing the
  23. # second run of the test to fail with internal NCCL error.
  24. "use_async": True,
  25. }])
  26. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  27. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  28. @pytest.mark.parametrize("test_llm_kwargs", [
  29. {
  30. "speculative_model": "JackFram/llama-68m",
  31. "num_speculative_tokens": 3,
  32. },
  33. {
  34. "speculative_model": "[ngram]",
  35. "num_speculative_tokens": 5,
  36. "ngram_prompt_lookup_max": 3,
  37. },
  38. ])
  39. @pytest.mark.parametrize("batch_size", [2])
  40. @pytest.mark.parametrize(
  41. "output_len",
  42. [
  43. # Use smaller output len for fast test.
  44. 32,
  45. ])
  46. @pytest.mark.parametrize("seed", [1])
  47. def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
  48. batch_size: int, output_len: int):
  49. """Verify greedy equality when tensor parallelism is used.
  50. """
  51. if is_hip():
  52. pytest.skip("hip is not well-supported yet")
  53. run_greedy_equality_correctness_test(baseline_llm_generator,
  54. test_llm_generator,
  55. batch_size,
  56. max_output_len=output_len,
  57. force_output_len=True)
  58. @pytest.mark.skipif(torch.cuda.device_count() < 2,
  59. reason="Need at least 2 GPUs to run the test.")
  60. @pytest.mark.parametrize(
  61. "common_llm_kwargs",
  62. [{
  63. # Skip cuda graph recording for fast test.
  64. "enforce_eager": True,
  65. # Required for spec decode.
  66. "use_v2_block_manager": True,
  67. "tensor_parallel_size": 2,
  68. # Use AsyncLLM engine, so that the engine runs in its own process.
  69. # Otherwise, since aphrodite does not follow true SPMD, the test runner
  70. # process will have both the engine and the rank0 worker. NCCL is not
  71. # cleaned up properly, and its server host thread leaks, causing the
  72. # second run of the test to fail with internal NCCL error.
  73. "use_async": True,
  74. # precision
  75. "dtype": "float32",
  76. }])
  77. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  78. @pytest.mark.parametrize(
  79. "per_test_common_llm_kwargs, test_llm_kwargs",
  80. [
  81. (
  82. {
  83. # Use a small model for a fast test.
  84. # Note this is repeated in the test body; to initialize a
  85. # tokenizer.
  86. "model": "JackFram/llama-68m",
  87. },
  88. {
  89. "speculative_model": "JackFram/llama-68m",
  90. "num_speculative_tokens": 5,
  91. "speculative_draft_tensor_parallel_size": 1,
  92. }),
  93. ({
  94. "model": "ibm-granite/granite-3b-code-instruct",
  95. }, {
  96. "speculative_model":
  97. "ibm-granite/granite-3b-code-instruct-accelerator",
  98. "num_speculative_tokens": 5,
  99. "speculative_draft_tensor_parallel_size": 1,
  100. })
  101. ])
  102. @pytest.mark.parametrize("batch_size", [2])
  103. @pytest.mark.parametrize("seed", [1])
  104. def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
  105. baseline_llm_generator,
  106. batch_size: int):
  107. """Verify spec decode works well with smaller tp for draft models.
  108. """
  109. run_greedy_equality_correctness_test(baseline_llm_generator,
  110. test_llm_generator,
  111. batch_size,
  112. max_output_len=32,
  113. force_output_len=True)