@@ -6,7 +6,6 @@ prefill requests are chunked.
 
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-from contextlib import nullcontext
 
 import pytest
 
@@ -152,68 +151,3 @@ def test_models_with_fp8_kv_cache(
         name_0="no_chunked_prefill",
         name_1="chunked_prefill",
     )
-
-
-@pytest.mark.parametrize("max_tokens", [16])
-@pytest.mark.parametrize("enforce_eager", [False])
-@pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("use_v2_block_manager", [False, True])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset the distributed env properly. Use a value > 1 only for local testing.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
|
|
|
-def test_with_prefix_caching(
|
|
|
- aphrodite_runner,
|
|
|
- max_tokens: int,
|
|
|
- enforce_eager: bool,
|
|
|
- chunk_size: int,
|
|
|
- use_v2_block_manager: bool,
|
|
|
- tensor_parallel_size: int,
|
|
|
-) -> None:
|
|
|
-    """
-    Checks exact-match decoding with and without prefix caching
-    when chunked prefill is enabled.
-    """
-    model = "meta-llama/Llama-2-7b-chat-hf"
-    # The common prompt has 142 tokens with the Llama-2 tokenizer.
-    common_prompt = "You are a helpful AI assistant " * 20
-    unique_prompts = [
-        "Question",  # Warmup
-        "Question",  # Fully cached
-        "Another question",  # Partially cached
-    ]
-    full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]
-
-    max_num_batched_tokens = max_num_seqs = chunk_size
-    outputs = {}  # type: ignore
-    check_result = True
-    for enable in (True, False):
-        with aphrodite_runner(
-                model,
-                dtype="half",
-                max_num_batched_tokens=max_num_batched_tokens,
-                enable_chunked_prefill=True,
-                enable_prefix_caching=enable,
-                tensor_parallel_size=tensor_parallel_size,
-                use_v2_block_manager=use_v2_block_manager,
-                enforce_eager=enforce_eager,
-                max_num_seqs=max_num_seqs,
-        ) as aphrodite_model:
-            # It should fail when prefix caching is enabled and the chunk
-            # size is not a multiple of the block size (16).
-            should_fail = chunk_size % 16 != 0 and enable
-            check_result &= not should_fail
-            outputs[enable] = []
-            # Send the requests one by one to ensure the cache is populated.
-            with pytest.raises(ValueError) if should_fail else nullcontext():
-                for prompt in full_prompts:
-                    outputs[enable] += aphrodite_model.generate_greedy(
-                        [prompt], max_tokens)
-
-    # Check results only if we did not expect a failure.
-    if check_result:
-        check_outputs_equal(
-            outputs_0_lst=outputs[False],
-            outputs_1_lst=outputs[True],
-            name_0="w/o prefix caching",
-            name_1="with prefix caching",
-        )