test_pp_cudagraph.py 797 B

123456789101112131415161718192021222324252627282930
  1. import os
  2. import pytest
  3. from ..utils import compare_two_settings, fork_new_process_for_each_test
  4. @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
  5. (2, "JackFram/llama-160m"),
  6. ])
  7. @pytest.mark.parametrize("ATTN_BACKEND", [
  8. "FLASH_ATTN",
  9. "FLASHINFER",
  10. ])
  11. @fork_new_process_for_each_test
  12. def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
  13. cudagraph_args = [
  14. # use half precision for speed and memory savings in CI environment
  15. "--dtype",
  16. "float16",
  17. "--pipeline-parallel-size",
  18. str(PP_SIZE),
  19. "--distributed-executor-backend",
  20. "mp",
  21. ]
  22. os.environ["APHRODITE_ATTENTION_BACKEND"] = ATTN_BACKEND
  23. eager_args = cudagraph_args + ["--enforce-eager"]
  24. compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)