test_chunked_prefill.py

  1. """Compare the outputs of HF and Aphrodite when using greedy sampling.
  2. It tests chunked prefill. Chunked prefill can be enabled by
  3. enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
  4. prefill requests are chunked.
  5. Run `pytest tests/models/test_chunked_prefill.py`.
  6. """
import os
from contextlib import nullcontext

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]
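
# For orientation (not executed here): outside of the test fixtures, chunked
# prefill is configured on the engine itself. A minimal sketch, assuming the
# `aphrodite.LLM` entrypoint forwards the same engine arguments that the
# `aphrodite_runner` fixture uses:
#
#     from aphrodite import LLM
#
#     llm = LLM(
#         model="facebook/opt-125m",
#         enable_chunked_prefill=True,
#         max_num_batched_tokens=16,  # prompts longer than this are chunked
#         max_num_seqs=16,
#     )
#     outputs = llm.generate(["Hello, my name is"])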


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset the distributed env properly. Use a value > 1 only when testing
# locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_models(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
) -> None:
    """
    Checks that greedy decode outputs match exactly between the HuggingFace
    model and the Aphrodite runner with chunked prefill enabled.
    """
    # Keeping both limits at the chunk size forces prompts longer than
    # `chunked_prefill_token_size` to be prefilled in multiple chunks.
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with aphrodite_runner(
            model,
            dtype=dtype,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
    ) as aphrodite_model:
        aphrodite_outputs = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=aphrodite_outputs,
        name_0="hf",
        name_1="aphrodite",
    )


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
def test_models_distributed(
    hf_runner,
    aphrodite_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    if (model == "meta-llama/Llama-2-7b-hf"
            and distributed_executor_backend == "ray"):
        # Test Ray aDAG (SPMD worker + compiled DAG).
        os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
        os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"

    dtype = "half"
    max_tokens = 5
    chunked_prefill_token_size = 16

    # Add a chunked prefill config.
    max_num_seqs = min(chunked_prefill_token_size, 256)
    assert chunked_prefill_token_size != -1
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

    # NOTE: Take care of the order: run Aphrodite first, then HF.
    # Aphrodite needs a fresh process in which CUDA has not been initialized;
    # if HF runs first, CUDA initialization breaks the multiprocessing
    # backend with the fork start method (the default).
    with aphrodite_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=2,
            max_num_seqs=max_num_seqs,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            distributed_executor_backend=distributed_executor_backend,
    ) as aphrodite_model:
        aphrodite_outputs = aphrodite_model.generate_greedy(
            example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=aphrodite_outputs,
        name_0="hf",
        name_1="aphrodite",
    )
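
# The fp8 KV-cache test below cannot rely on an exact output match: with a
# quantized (fp8_e4m3) KV cache, chunked and non-chunked prefill may diverge
# on near-tied tokens, so the comparison is done on top-k logprobs via
# check_logprobs_close rather than check_outputs_equal.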


@pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",
      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprobs of 4 tokens.
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset the distributed env properly. Use a value > 1 only when testing
# locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor, so it is disabled.
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache(
    aphrodite_runner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
) -> None:
    """
    Check that output logprobs match between no_chunked_prefill and
    chunked_prefill with an fp8 KV cache. General fp8 KV-cache tests are
    covered in test_fp8.py, so here we only check chunked prefill.
    """
    NUM_LOG_PROBS = 8
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    with aphrodite_runner(
            model,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
    ) as aphrodite_model:
        no_chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    with aphrodite_runner(
            model,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
    ) as aphrodite_model:
        chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    check_logprobs_close(
        outputs_0_lst=no_chunked_prefill_outputs,
        outputs_1_lst=chunked_prefill_outputs,
        name_0="no_chunked_prefill",
        name_1="chunked_prefill",
    )
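
# Prefix caching allocates the KV cache in fixed-size blocks of 16 tokens, so
# chunked prefill with prefix caching requires the chunk size to be a multiple
# of the block size. The parametrization below covers both cases:
# chunk_size=32 is block-aligned and should succeed, while chunk_size=30
# should raise ValueError when prefix caching is enabled.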


@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset the distributed env properly. Use a value > 1 only when testing
# locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_with_prefix_caching(
    aphrodite_runner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
    use_v2_block_manager: bool,
    tensor_parallel_size: int,
) -> None:
    """
    Checks exact-match decode with and without prefix caching
    with chunked prefill enabled.
    """
    model = "meta-llama/Llama-2-7b-chat-hf"
    # The common prompt has 142 tokens with the Llama-2 tokenizer.
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [
        "Question",  # Warmup
        "Question",  # Fully cached
        "Another question",  # Partially cached
    ]
    full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]

    max_num_batched_tokens = max_num_seqs = chunk_size
    outputs = {}  # type: ignore
    check_result = True
    for enable in (True, False):
        with aphrodite_runner(
                model,
                dtype="half",
                max_num_batched_tokens=max_num_batched_tokens,
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
                use_v2_block_manager=use_v2_block_manager,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as aphrodite_model:
            # It should fail when prefix caching is enabled and the chunk
            # size is not a multiple of the block size (16).
            should_fail = chunk_size % 16 != 0 and enable
            check_result &= not should_fail
            outputs[enable] = []
            # Send the requests one by one to ensure the cache is populated.
            with pytest.raises(ValueError) if should_fail else nullcontext():
                for prompt in full_prompts:
                    outputs[enable] += aphrodite_model.generate_greedy(
                        [prompt], max_tokens)

    # Check results only if we did not expect a failure.
    if check_result:
        check_outputs_equal(
            outputs_0_lst=outputs[False],
            outputs_1_lst=outputs[True],
            name_0="w/o prefix caching",
            name_1="with prefix caching",
        )