test_correctness_llm.py

# Test the LLMEngine with multi-step decoding
from typing import Optional

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal

MODELS = [
    "JackFram/llama-160m",
]
NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
NUM_PROMPTS = [10]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
def test_multi_step_llm(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: bool,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
) -> None:
  31. """Test Aphrodite engine with multi-step scheduling via sync LLM Engine.
  32. Set up a HuggingFace (HF) transformers model as a ground-truth reference.
  33. Prompt them with the same example prompts.
  34. Validate:
  35. * Generated tokens match
  36. * Generated logprobs are all very close
  37. Args:
  38. hf_runner: HF transformers model runner fixture
  39. aphrodite_runner: Aphrodite model runner fixture
  40. example_prompts: test fixture providing example prompts
  41. model: model under test (same for single- and multi-step engines)
  42. dtype: tensor datatype for engine to utilize
  43. tp_size: degree of tensor-parallelism
  44. max_tokens: the maximum number of tokens to generate
  45. enforce_eager
  46. num_scheduler_steps: for multi-step scheduling, GPU-side steps per
  47. GPU -> CPU output transfer
  48. num_prompts: number of example prompts under test
  49. num_logprobs: corresponds to the `logprobs` argument to the OpenAI
  50. completions endpoint; `None` -> no logprobs
  51. """

    # Replicate the example prompts until there are at least `num_prompts`
    # of them, then truncate to exactly `num_prompts`.
    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    with aphrodite_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as aphrodite_model:
        aphrodite_outputs = (
            aphrodite_model.generate_greedy(prompts, max_tokens)
            if num_logprobs is None else
            aphrodite_model.generate_greedy_logprobs(prompts, max_tokens,
                                                     num_logprobs))

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = (
            hf_model.generate_greedy(prompts, max_tokens)
            if num_logprobs is None else
            hf_model.generate_greedy_logprobs_limit(prompts, max_tokens,
                                                    num_logprobs))

    if num_logprobs is None:
        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=aphrodite_outputs,
            name_0="hf",
            name_1="aphrodite",
        )
    else:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=aphrodite_outputs,
            name_0="hf",
            name_1="aphrodite",
        )
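
# Example invocation (illustrative; the exact file path depends on where this
# test lives in the repository and on the fixtures defined in conftest.py):
#   pytest test_correctness_llm.py -k test_multi_step_llm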