123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- from typing import Any, List, Optional
- import pytest
- from aphrodite import AphroditeEngine, CompletionOutput, SamplingParams
# Model identifier handed to the runner by the `aphrodite_model` fixture.
MODEL = "meta-llama/llama-2-7b-hf"
# Upper bound on generated tokens; passed as `max_tokens` in `_test_stopping`.
MAX_TOKENS = 200
# NOTE(review): not referenced anywhere in the visible part of this file —
# confirm whether it is still needed.
IS_ASYNC = False
@pytest.fixture(scope="session")
def aphrodite_model(aphrodite_runner):
    """Yield a runner for ``MODEL`` that is shared across the test session.

    ``aphrodite_runner(MODEL)`` is used as a context manager, so its exit
    handler runs when pytest finalizes this session-scoped fixture.
    """
    runner_ctx = aphrodite_runner(MODEL)
    with runner_ctx as loaded_model:
        yield loaded_model
def _test_stopping(llm_engine: AphroditeEngine,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[List[str]] = None,
                   stop_token_ids: Optional[List[int]] = None,
                   include_in_output: bool = False,
                   use_async_output_proc: bool = False) -> None:
    """Run one greedy request to completion and check how it stopped.

    A single request with a fixed prompt is queued on ``llm_engine`` and the
    engine is stepped until nothing is unfinished. After every step the
    streamed text must extend the text seen so far (no backtracking).

    Args:
        llm_engine: Engine the request is added to and stepped on.
        expected_output: Text the last observed output must equal.
        expected_reason: Value the last observed ``stop_reason`` must equal.
        stop: Stop strings forwarded to ``SamplingParams``.
        stop_token_ids: Stop token ids forwarded to ``SamplingParams``.
        include_in_output: Forwarded as ``include_stop_str_in_output``.
        use_async_output_proc: When true, one extra ``step()`` is issued
            before the polling loop and its result is discarded.

    Raises:
        AssertionError: If the text backtracks, no output was ever produced,
            or the final text / stop reason differ from the expectations.
    """
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=MAX_TOKENS,
        stop=stop,
        stop_token_ids=stop_token_ids,
        include_stop_str_in_output=include_in_output,
    )
    llm_engine.add_request("id", "A story about Aphrodite:\n",
                           sampling_params, None)

    latest: Optional[CompletionOutput] = None
    text_so_far = ""
    last_stop_reason = None

    if use_async_output_proc:
        # Presumably primes the async output pipeline so the loop below sees
        # an output on every step — TODO confirm against the engine.
        llm_engine.step()

    while llm_engine.has_unfinished_requests():
        # Exactly one request with exactly one completion is expected; the
        # single-element unpacking fails loudly otherwise.
        [request_output] = llm_engine.step()
        [latest] = request_output.outputs

        # Ensure we don't backtrack
        assert latest.text.startswith(text_so_far)
        text_so_far, last_stop_reason = latest.text, latest.stop_reason

    assert latest is not None
    assert text_so_far == expected_output
    assert last_stop_reason == expected_reason
- def _set_async_mode(llm_engine, is_async):
- llm_engine.scheduler[0].use_async_output_proc = is_async
def _stop_basic(llm_engine, is_async):
    """Check stopping on the single-character stop string ``"."``.

    Runs ``_test_stopping`` twice: first with the stop string excluded from
    the output, then with it included.

    NOTE(review): the expected text starts with "VLLM" while the prompt sent
    by ``_test_stopping`` mentions "Aphrodite" — confirm these expectations
    still match the model's greedy output.
    """
    # (include_in_output, expected_output) pairs, run in this order.
    cases = (
        (False, "VLLM is a 100% volunteer organization"),
        (True, "VLLM is a 100% volunteer organization."),
    )
    for keep_stop_str, expected_text in cases:
        _test_stopping(llm_engine,
                       stop=["."],
                       include_in_output=keep_stop_str,
                       expected_output=expected_text,
                       expected_reason=".",
                       use_async_output_proc=is_async)
def _stop_multi_tokens(llm_engine, is_async):
    """Check stopping with two stop strings, matching on ``"group of peo"``.

    Runs ``_test_stopping`` twice: first with the stop string excluded from
    the output, then with it included. Both runs expect the reported stop
    reason to be ``"group of peo"``.

    NOTE(review): the expected text starts with "VLLM" while the prompt sent
    by ``_test_stopping`` mentions "Aphrodite" — confirm these expectations
    still match the model's greedy output.
    """
    # (include_in_output, expected_output) pairs, run in this order.
    cases = (
        (False, "VLLM is a 100% volunteer organization. We are a "),
        (True,
         "VLLM is a 100% volunteer organization. We are a group of peo"),
    )
    for keep_stop_str, expected_text in cases:
        _test_stopping(
            llm_engine,
            stop=["group of peo", "short"],
            include_in_output=keep_stop_str,
            expected_output=expected_text,
            expected_reason="group of peo",
            use_async_output_proc=is_async)
def _stop_partial_token(llm_engine, is_async):
    """Check stopping on ``"gani"``, which cuts the word "organization".

    Runs ``_test_stopping`` twice: first with the stop string excluded from
    the output (text ends in "or"), then with it included (text ends in
    "organi").

    NOTE(review): the expected text starts with "VLLM" while the prompt sent
    by ``_test_stopping`` mentions "Aphrodite" — confirm these expectations
    still match the model's greedy output.
    """
    # (include_in_output, expected_output) pairs, run in this order.
    cases = (
        (False, "VLLM is a 100% volunteer or"),
        (True, "VLLM is a 100% volunteer organi"),
    )
    for keep_stop_str, expected_text in cases:
        _test_stopping(llm_engine,
                       stop=["gani"],
                       include_in_output=keep_stop_str,
                       expected_output=expected_text,
                       expected_reason="gani",
                       use_async_output_proc=is_async)
def _stop_token_id(llm_engine, is_async):
    """Check stopping on a stop token id rather than a stop string.

    Runs ``_test_stopping`` twice with ``stop_token_ids=[13013]``: first with
    ``include_in_output`` off, then on. Both runs expect the reported stop
    reason to be the integer token id.

    NOTE(review): the expected text starts with "VLLM" while the prompt sent
    by ``_test_stopping`` mentions "Aphrodite" — confirm these expectations
    still match the model's greedy output.
    """
    # token id 13013 => " organization"
    stop_id = 13013
    # (include_in_output, expected_output) pairs, run in this order.
    cases = (
        (False, "VLLM is a 100% volunteer"),
        (True, "VLLM is a 100% volunteer organization"),
    )
    for keep_stop_token, expected_text in cases:
        _test_stopping(llm_engine,
                       stop_token_ids=[stop_id],
                       include_in_output=keep_stop_token,
                       expected_output=expected_text,
                       expected_reason=stop_id,
                       use_async_output_proc=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_basic(aphrodite_model):
    """Run the basic stop-string checks with async output on, then off."""
    engine = aphrodite_model.model.llm_engine
    # Order matters for parity with the original: async first, then sync.
    for async_enabled in (True, False):
        _set_async_mode(engine, async_enabled)
        _stop_basic(engine, is_async=async_enabled)
@pytest.mark.skip_global_cleanup
def test_stop_multi_tokens(aphrodite_model):
    """Run the multi-token stop-string checks with async output on, then off."""
    engine = aphrodite_model.model.llm_engine
    # Order matters for parity with the original: async first, then sync.
    for async_enabled in (True, False):
        _set_async_mode(engine, async_enabled)
        _stop_multi_tokens(engine, is_async=async_enabled)
@pytest.mark.skip_global_cleanup
def test_stop_partial_token(aphrodite_model):
    """Run the partial-token stop-string checks with async output on, then off."""
    engine = aphrodite_model.model.llm_engine
    # Order matters for parity with the original: async first, then sync.
    for async_enabled in (True, False):
        _set_async_mode(engine, async_enabled)
        _stop_partial_token(engine, is_async=async_enabled)
@pytest.mark.skip_global_cleanup
def test_stop_token_id(aphrodite_model):
    """Run the stop-token-id checks with async output on, then off."""
    engine = aphrodite_model.model.llm_engine
    # Order matters for parity with the original: async first, then sync.
    for async_enabled in (True, False):
        _set_async_mode(engine, async_enabled)
        _stop_token_id(engine, is_async=async_enabled)
|