@@ -23,12 +23,12 @@ test cases.

NOTE: Speculative decoding's distribution equality requires that the measured
distributions of the target model and proposal model be deterministic given the
-same input. aphrodite largely guarantees this.
+same input. Aphrodite largely guarantees this.

@cadedaniel has seen cases where the output probabilities of a draft/target
model change slightly with certain batch sizes or prompts, even with Torch
-determinism flags set. It is unclear if this is a bug in aphrodite, due to non-
-determinism in on-device batched operations, a bug in aphrodite's spec decode
+determinism flags set. It is unclear if this is a bug in Aphrodite, due to non-
+determinism in on-device batched operations, a bug in Aphrodite's spec decode
implementation, or the "hardware numerics" limitations. Either way, rejection
sampling ensures the output distribution matches the target model, but it breaks
greedy-equality tests for those batch sizes/prompts.
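For intuition about the rejection-sampling guarantee referenced above: a draft
token x is accepted with probability min(1, p_target(x) / p_draft(x)), and on
rejection a replacement token is drawn from the normalized residual
max(0, p_target - p_draft); together these yield samples distributed exactly as
p_target. A minimal standalone sketch of that rule (illustrative NumPy only,
not Aphrodite's implementation):

    import numpy as np

    def rejection_sample(p_target, p_draft, draft_token, rng):
        # Accept the draft token with probability min(1, p_t / p_d).
        accept_prob = min(1.0, p_target[draft_token] / p_draft[draft_token])
        if rng.random() < accept_prob:
            return draft_token
        # Otherwise resample from the normalized residual distribution;
        # this is what makes the overall output distribution equal p_target.
        residual = np.clip(p_target - p_draft, 0.0, None)
        residual /= residual.sum()
        return int(rng.choice(len(p_target), p=residual))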
@@ -41,8 +41,9 @@ from transformers import AutoTokenizer

from aphrodite import SamplingParams

+from ...utils import fork_new_process_for_each_test
from .conftest import (get_output_from_llm_generator,
-                       run_greedy_equality_correctness_test)
+                       run_equality_correctness_test)


@pytest.mark.parametrize(
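The fork_new_process_for_each_test decorator imported above runs each test body
in a forked child process, so CUDA and engine state from one parametrization
cannot leak into the next. Roughly, the idea is the following sketch (the real
helper in ...utils is the source of truth and does more error reporting):

    import functools
    import os

    def fork_new_process_for_each_test(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            pid = os.fork()
            if pid == 0:
                # Child: run the test, report the outcome via exit code.
                try:
                    f(*args, **kwargs)
                    os._exit(0)
                except BaseException:
                    os._exit(1)
            # Parent: fail if the child did not exit cleanly.
            _, status = os.waitpid(pid, 0)
            assert os.WEXITSTATUS(status) == 0
        return wrapper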
@@ -73,6 +74,7 @@ from .conftest import (get_output_from_llm_generator,
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
                                             batch_size: int):
    """Run generation with speculative decoding on a batch. Verify the engine
@@ -116,44 +118,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
    assert actual_tokens.strip() == expected_tokens.strip()


-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        # Note this is repeated in the test body; to initialize a tokenizer.
-        "model": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True,
-
-        # Use AsyncLLM engine
-        "use_async": True,
-    }])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-    },
-])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_e2e_with_async_engine(test_llm_generator,
-                                           baseline_llm_generator,
-                                           batch_size: int):
-    """Verify spec decode works well with async LLM engine.
-    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=32,
-                                         force_output_len=True)
-
-
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
@@ -172,10 +136,10 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
    # Try two different tiny base models.
    # Note that one is equal to the draft model, another isn't.
    {
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
    },
    {
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
    },
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -189,13 +153,15 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
    "output_len",
    [
        # Use long output len for the small model test.
-        1536,
+        10,
    ])
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
    """Verify greedy equality on a tiny model with batch size of one.

    Since this test is cheaper than other e2e correctness tests, we generate
@@ -204,14 +170,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
    When the draft model is the same as the target model, we further check
    whether all speculative tokens are accepted.
    """
-    ensure_all_accepted = test_llm_generator.same_draft_target_model
-    run_greedy_equality_correctness_test(
-        baseline_llm_generator,
-        test_llm_generator,
-        batch_size,
-        max_output_len=output_len,
-        force_output_len=True,
-        ensure_all_accepted=ensure_all_accepted)
+    ensure_all_accepted = per_test_common_llm_kwargs.get(
+        "model_name") == test_llm_kwargs.get("speculative_model")
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0,
+                                  ensure_all_accepted=ensure_all_accepted)


@pytest.mark.parametrize(
@@ -232,10 +202,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
    # Try two different tiny base models.
    # Note that one is equal to the draft model, another isn't.
    {
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
    },
    {
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
    },
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -253,16 +223,22 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
    ])
@pytest.mark.parametrize("batch_size", [64])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
    """Verify greedy equality on a tiny model and large batch size.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
@@ -280,10 +256,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
    # Try two different tiny base models.
    # Note that one is equal to the draft model, another isn't.
    {
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",
    },
    {
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
    },
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -298,24 +274,31 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
    ])
@pytest.mark.parametrize("batch_size", [32])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        max_output_len: int):
+        aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
+        max_output_len: int, seed: int):
    """Verify greedy equality on a tiny model, with a large batch size, and when
    sampling respects the EOS token.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len,
-                                         force_output_len=False)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len,
+                                  seed=seed,
+                                  temperature=0.0,
+                                  ignore_eos=False)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A "real" model (not tiny).
-        "model": "meta-llama/Llama-2-7b-chat-hf",
+        "model_name": "meta-llama/Llama-2-7b-chat-hf",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -342,24 +325,30 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
        256,
    ])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
    """Verify greedy equality on a "real" model and batch size of 1. This is
    separate from large BS tests to make identifying the source of bugs easier.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A "real" model (not tiny).
-        "model": "meta-llama/Llama-2-7b-chat-hf",
+        "model_name": "meta-llama/Llama-2-7b-chat-hf",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -386,17 +375,23 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
        64,
    ])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
    """Verify greedy equality with a "real" model on a nontrivial batch size.
    This is the closest test to a real production workload.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
@@ -415,7 +410,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",
    },
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -433,23 +428,29 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
+@fork_new_process_for_each_test
def test_spec_decode_e2e_greedy_correctness_with_preemption(
-        baseline_llm_generator, test_llm_generator, batch_size: int,
-        output_len: int):
+        aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
    """Verify greedy equality, even when some sequences are preempted mid-
    generation.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -460,7 +461,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
-        # As of this writing, aphrodite only compiles with these 3 block sizes
+        # As of this writing, Aphrodite only compiles with these 3 block sizes
        # by default.
        {
            "block_size": 8,
@@ -487,22 +488,29 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
        32,
    ])
@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_different_block_size(baseline_llm_generator,
-                                          test_llm_generator, batch_size: int,
-                                          output_len: int):
+@fork_new_process_for_each_test
+def test_spec_decode_different_block_size(aphrodite_runner, common_llm_kwargs,
+                                          per_test_common_llm_kwargs,
+                                          baseline_llm_kwargs, test_llm_kwargs,
+                                          batch_size: int, output_len: int,
+                                          seed: int):
    """Verify greedy equality over different block sizes.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -535,24 +543,31 @@ def test_spec_decode_different_block_size(baseline_llm_generator,
        64,
    ])
@pytest.mark.parametrize("seed", [1])
-def test_skip_speculation(baseline_llm_generator, test_llm_generator,
-                          batch_size: int, output_len: int):
+@fork_new_process_for_each_test
+def test_skip_speculation(aphrodite_runner, common_llm_kwargs,
+                          per_test_common_llm_kwargs, baseline_llm_kwargs,
+                          test_llm_kwargs, batch_size: int, output_len: int,
+                          seed: int):
    """Verify greedy equality when some (or all) sequences skip speculation.
    We do this by setting the max model len of the draft model to an
    artificially low value, such that when the sequences grow beyond it, they
    are skipped in speculative decoding.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -572,21 +587,28 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [10])
@pytest.mark.parametrize("seed", [1])
-def test_disable_speculation(baseline_llm_generator, test_llm_generator,
-                             batch_size: int, output_len: int):
+@fork_new_process_for_each_test
+def test_disable_speculation(aphrodite_runner, common_llm_kwargs,
+                             per_test_common_llm_kwargs, baseline_llm_kwargs,
+                             test_llm_kwargs, batch_size: int, output_len: int,
+                             seed: int):
    """Verify greedy equality when all sequences disable speculation.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model": "JackFram/llama-68m",
+        "model_name": "JackFram/llama-68m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -614,22 +636,28 @@ def test_disable_speculation(baseline_llm_generator, test_llm_generator,
        32,
    ])
@pytest.mark.parametrize("seed", [1])
-def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
-                output_len: int):
+@fork_new_process_for_each_test
+def test_many_k(aphrodite_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+                baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
+                output_len: int, seed: int):
    """Verify that speculative decoding produces exact equality to without spec
    decode with many different values of k.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model": "JackFram/llama-160m",
+        "model_name": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -658,15 +686,22 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
        32,
    ])
@pytest.mark.parametrize("seed", [1])
-def test_typical_acceptance_sampling(baseline_llm_generator,
-                                     test_llm_generator, batch_size: int,
-                                     output_len: int):
+@fork_new_process_for_each_test
+def test_typical_acceptance_sampling(aphrodite_runner, common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs, test_llm_kwargs,
+                                     batch_size: int, output_len: int,
+                                     seed: int):
    """Verify that speculative decoding produces exact equality to without spec
    decode with TypicalAcceptanceSampler as the draft token acceptance
    sampling method.
    """
-    run_greedy_equality_correctness_test(baseline_llm_generator,
-                                         test_llm_generator,
-                                         batch_size,
-                                         max_output_len=output_len,
-                                         force_output_len=True)
+    run_equality_correctness_test(aphrodite_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
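Unlike rejection sampling, typical acceptance does not reproduce the target
distribution exactly in general: it accepts a draft token whenever its
probability under the target model clears an entropy-dependent threshold,
trading exactness for a higher acceptance rate. Under temperature=0.0 the
target distribution is one-hot, so only the argmax token clears the threshold
and greedy equality still holds, which is what this test relies on. A sketch of
the Medusa-style criterion (constants illustrative; Aphrodite's
TypicalAcceptanceSampler is the source of truth):

    import numpy as np

    def typical_accept(p_target, draft_token,
                       posterior_threshold=0.09, posterior_alpha=0.3):
        # Accept if the draft token's target probability clears a
        # threshold tied to the entropy of the target distribution.
        entropy = -np.sum(p_target * np.log(np.clip(p_target, 1e-10, None)))
        threshold = min(posterior_threshold,
                        posterior_alpha * np.exp(-entropy))
        return p_target[draft_token] >= threshold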