test_correctness_llm.py

# Test the LLMEngine with multi-step-decoding

from typing import Optional

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal

MODELS = [
    "JackFram/llama-160m",
]
NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
NUM_PROMPTS = [10]
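# With multi-step scheduling, the engine runs `num_scheduler_steps` decode
# iterations on the GPU before transferring outputs back to the CPU; the
# sampled tokens and logprobs should still match a single-step reference.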


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
def test_multi_step_llm(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: bool,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
) -> None:
  31. """Test Aphrodite engine with multi-step scheduling via sync LLM Engine.
  32. Set up a HuggingFace (HF) transformers model as a ground-truth reference.
  33. Prompt them with the same example prompts.
  34. Validate:
  35. * Generated tokens match
  36. * Generated logprobs are all very close
  37. Args:
  38. hf_runner: HF transformers model runner fixture
  39. aphrodite_runner: Aphrodite model runner fixture
  40. example_prompts: test fixture providing example prompts
  41. model: model under test (same for single- and multi-step engines)
  42. dtype: tensor datatype for engine to utilize
  43. tp_size: degree of tensor-parallelism
  44. max_tokens: the maximum number of tokens to generate
  45. enforce_eager
  46. num_scheduler_steps: for multi-step scheduling, GPU-side steps per
  47. GPU -> CPU output transfer
  48. num_prompts: number of example prompts under test
  49. num_logprobs: corresponds to the `logprobs` argument to the OpenAI
  50. completions endpoint; `None` -> no logprobs
  51. """
    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts
    with aphrodite_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as aphrodite_model:
        aphrodite_outputs = (
            aphrodite_model.generate_greedy(prompts, max_tokens)
            if num_logprobs is None else
            aphrodite_model.generate_greedy_logprobs(prompts, max_tokens,
                                                     num_logprobs))
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = (
            hf_model.generate_greedy(prompts, max_tokens)
            if num_logprobs is None else
            hf_model.generate_greedy_logprobs_limit(prompts, max_tokens,
                                                    num_logprobs))
    if num_logprobs is None:
        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=aphrodite_outputs,
            name_0="hf",
            name_1="aphrodite",
        )
    else:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=aphrodite_outputs,
            name_0="hf",
            name_1="aphrodite",
        )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)])
def test_multi_step_llm_w_prompt_logprobs(
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: bool,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
) -> None:
  111. """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
  112. Set up a Aphrodite engine instance w/ single-step scheduling as a
  113. ground-truth reference.
  114. Prompt them with the same example prompts.
  115. Validate:
  116. * All generated logprobs are all very close
  117. Args:
  118. hf_runner: HF transformers model runner fixture
  119. aphrodite_runner: Aphrodite model runner fixture
  120. example_prompts: test fixture providing example prompts
  121. model: model under test (same for single- and multi-step engines)
  122. dtype: tensor datatype for engine to utilize
  123. tp_size: degree of tensor-parallelism
  124. max_tokens: the maximum number of tokens to generate
  125. enforce_eager
  126. num_scheduler_steps: for multi-step scheduling, GPU-side steps per
  127. GPU -> CPU output transfer
  128. num_prompts: number of example prompts under test
  129. num_logprobs: corresponds to the `logprobs` argument to the OpenAI
  130. completions endpoint; `None` -> no logprobs
  131. num_prompt_logprobs: number of logprobs to return for each prompt token;
  132. note that this argument is not supported by the
  133. OpenAI completions endpoint.
  134. """
    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts
    with aphrodite_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as aphrodite_model:
        aphrodite_outputs = aphrodite_model.generate_greedy_logprobs(
            prompts,
            max_tokens,
            num_logprobs,
            num_prompt_logprobs=num_prompt_logprobs)
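    # Re-run the same prompts with default (single-step) scheduling as the
    # ground-truth reference for the logprob comparison below.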
    with aphrodite_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
    ) as aphrodite_model:
        single_step_aphrodite_outputs = (
            aphrodite_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs,
                num_prompt_logprobs=num_prompt_logprobs))
    check_logprobs_close(
        outputs_0_lst=single_step_aphrodite_outputs,
        outputs_1_lst=aphrodite_outputs,
        name_0="single_step_aphrodite",
        name_1="multi_step_aphrodite",
    )
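

# For quick manual experimentation outside the pytest fixtures, a standalone
# engine can be spun up with the same multi-step settings. This is a minimal
# sketch, assuming Aphrodite exposes the vLLM-style `LLM`/`SamplingParams`
# entrypoints and forwards `num_scheduler_steps`/`use_v2_block_manager` to the
# engine the same way the `aphrodite_runner` fixture does above.
if __name__ == "__main__":
    from aphrodite import LLM, SamplingParams

    llm = LLM(
        model="JackFram/llama-160m",
        dtype="half",
        enforce_eager=True,
        gpu_memory_utilization=0.7,
        use_v2_block_manager=True,
        num_scheduler_steps=8,  # GPU-side steps per GPU -> CPU output transfer
    )
    # Greedy sampling mirrors the `generate_greedy*` helpers used in the tests.
    params = SamplingParams(temperature=0.0, max_tokens=5, logprobs=5)
    for output in llm.generate(["The president of the United States is"],
                               params):
        print(output.outputs[0].text)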