12345678910111213141516171819202122232425262728293031323334 |
- import pytest
- from aphrodite.common.sampling_params import SamplingParams
- from aphrodite.engine.aphrodite_engine import AphroditeEngine
- from aphrodite.engine.args_tools import EngineArgs
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
    """Smoke-test prefix caching when a new prompt is a prefix of an earlier one.

    Two requests are submitted to an engine built with
    ``enable_prefix_caching=True``; the engine is stepped once after each
    submission.  The first request uses the shared prefix followed by an
    extra suffix, the second uses the shared prefix alone.

    The intended scenario is that every block of the second request's
    prompt is full and has already been computed by the time that request
    arrives.  There are no explicit assertions: the test passes as long as
    the engine steps without tripping any internal assert.

    Args:
        model: Model name passed to ``EngineArgs``.
        block_size: Block size passed to ``EngineArgs``.
    """
    shared_prefix = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?"
    )
    extra_suffix = (
        " Please recommend to me some resources where I can learn not only to "
        "handle technical difficulties of building a car, but also "
        "decoration."
    )

    engine = AphroditeEngine.from_engine_args(
        EngineArgs(
            model=model,
            block_size=block_size,
            enable_prefix_caching=True,
        )
    )
    # One SamplingParams instance is shared by both requests.
    params = SamplingParams()

    # (request id, prompt text) in submission order: the longer prompt goes
    # first so that the shorter one arrives after its prefix was processed.
    submissions = (
        ("0", shared_prefix + extra_suffix),
        ("1", shared_prefix),
    )
    for request_id, prompt_text in submissions:
        engine.add_request(request_id, prompt_text, params)
        engine.step()
|