test_computed_prefix_blocks.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334
  1. import pytest
  2. from aphrodite.common.sampling_params import SamplingParams
  3. from aphrodite.engine.aphrodite_engine import AphroditeEngine
  4. from aphrodite.engine.args_tools import EngineArgs
  5. @pytest.mark.parametrize("model", ["facebook/opt-125m"])
  6. @pytest.mark.parametrize("block_size", [16])
  7. def test_computed_prefix_blocks(model: str, block_size: int):
  8. # This test checks if we are able to run the engine to completion
  9. # without triggering asserts.
  10. # We are in a scenario where all blocks from the second request's prompt
  11. # are full and already computed when the second request arrives.
  12. prompt = (
  13. "You are a helpful assistant. How do I build a car from cardboard and "
  14. "paper clips? Is there an easy to follow video tutorial available "
  15. "online for free?")
  16. prompt2 = (
  17. " Please recommend to me some resources where I can learn not only to "
  18. "handle technical difficulties of building a car, but also "
  19. "decoration.")
  20. engine_args = EngineArgs(model=model,
  21. block_size=block_size,
  22. enable_prefix_caching=True)
  23. engine = AphroditeEngine.from_engine_args(engine_args)
  24. sampling_params = SamplingParams()
  25. engine.add_request("0", prompt + prompt2, sampling_params)
  26. engine.step()
  27. engine.add_request("1", prompt, sampling_params)
  28. engine.step()