import pytest

from .conftest import run_equality_correctness_test

# main model
MAIN_MODEL = "JackFram/llama-68m"

# speculative model
SPEC_MODEL = "JackFram/llama-160m"

@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": MAIN_MODEL,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # speculative model
        "speculative_model": SPEC_MODEL,

        # num speculative tokens
        "num_speculative_tokens": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
@pytest.mark.parametrize("batch_size", [1, 8, 32])
@pytest.mark.parametrize("temperature", [0.1, 1.0])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        20,
    ])
def test_seeded_consistency(aphrodite_runner, common_llm_kwargs,
                            per_test_common_llm_kwargs, baseline_llm_kwargs,
                            test_llm_kwargs, batch_size: int,
                            temperature: float, output_len: int):
    """Verify that outputs are consistent across multiple runs with the
    same seed."""
    run_equality_correctness_test(
        aphrodite_runner,
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
        batch_size,
        max_output_len=output_len,
        temperature=temperature,
        disable_seed=False,
    )
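
    # With disable_seed=True, the per-request seeds are presumably omitted,
    # so the baseline (seed=1) and test (seed=5) engines sample from
    # different RNG streams at temperature > 0 and their outputs diverge.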
    # Ensure this same test does fail if we _don't_ include per-request
    # seeds.
    with pytest.raises(AssertionError):
        run_equality_correctness_test(
            aphrodite_runner,
            common_llm_kwargs,
            per_test_common_llm_kwargs,
            baseline_llm_kwargs,
            test_llm_kwargs,
            batch_size,
            max_output_len=output_len,
            temperature=temperature,
            disable_seed=True,
        )
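

# --- Illustrative sketch (not part of the original suite) ---
# A minimal, hypothetical demonstration of the per-request seeding the test
# above exercises.  It assumes Aphrodite's `LLM`/`SamplingParams` API mirrors
# vLLM's, where `SamplingParams` accepts a per-request `seed`; the import
# path and prompt are illustrative assumptions.  With the same per-request
# seed, engines built with different global seeds should agree
# token-for-token.
if __name__ == "__main__":
    from aphrodite import LLM, SamplingParams  # assumed import path

    prompts = ["The quick brown fox"]
    # Same per-request seed for both engines (analogous to
    # disable_seed=False above).
    params = SamplingParams(temperature=1.0, max_tokens=20, seed=1234)

    # Different engine-level seeds, mirroring baseline_llm_kwargs /
    # test_llm_kwargs in the test.
    out_a = LLM(model=MAIN_MODEL, seed=1).generate(prompts, params)
    out_b = LLM(model=MAIN_MODEL, seed=5).generate(prompts, params)

    # Per-request seeding should make the sampled text identical.
    assert out_a[0].outputs[0].text == out_b[0].outputs[0].text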