- """This docstring details important information on the testing methodology.
- Most of the tests rely on "greedy equality", where we expect the output of
- speculative decoding on a sequence to exactly match the output of normal non-
- speculative decoding.
- Since speculative decoding with rejection sampling guarantees that the output
- distribution matches the target model's output distribution (up to hardware
- numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
- equality.
- However, we still need to verify below scenario could be passed:
- * Batch size 1 greedy equality
- * Batch size >1 greedy equality
- * Test greedy equality under preemption
- * Test greedy equality under various number of speculative tokens.
- With those tests, we can say at least, MLPSpeculator would not break the
- correctess for the target model outputs.
- """
from unittest.mock import patch

import pytest

from aphrodite.modeling.layers.vocab_parallel_embedding import pad_vocab_size

from .conftest import (run_equality_correctness_test,
                       run_greedy_equality_correctness_test)
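
# Illustrative sketch (not used by the tests below, which call the real
# helpers from conftest): "greedy equality" boils down to a token-for-token
# comparison of greedy (temperature=0) outputs from the baseline engine and
# the speculative engine.
def _illustrative_greedy_equality(baseline_token_ids, spec_token_ids):
    # Both arguments are lists of per-prompt token-id sequences.
    assert len(baseline_token_ids) == len(spec_token_ids)
    for baseline_seq, spec_seq in zip(baseline_token_ids, spec_token_ids):
        assert baseline_seq == spec_seq
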
# main model
MAIN_MODEL = "JackFram/llama-160m"

# speculative model
SPEC_MODEL = "ibm-fms/llama-160m-accelerator"

# max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
MAX_SPEC_TOKENS = 3
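# For reference, an illustrative excerpt of such a speculator config.json
# (the exact file contents are not reproduced here):
#
#     { ..., "n_predict": 3, ... }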
# precision
PRECISION = "float32"


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Print spec metrics.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": SPEC_MODEL,
    },
])
@pytest.mark.parametrize("output_len", [
    128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
                                    batch_size: int, output_len: int):
    """Verify greedy equality with different batch sizes."""
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Print spec metrics.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,

        # Speculative model
        "speculative_model": SPEC_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
- @pytest.mark.parametrize("output_len", [64])
- @pytest.mark.parametrize("batch_size", [1, 32])
- @pytest.mark.parametrize("temperature", [0.1, 1.0])
- @pytest.mark.parametrize("seed", [None])
- def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
- batch_size: int, output_len: int,
- temperature: float):
- """Verify seeded runs produce the same output."""
- run_equality_correctness_test(baseline_llm_generator,
- test_llm_generator,
- batch_size,
- max_output_len=output_len,
- temperature=temperature,
- seeded=True,
- force_output_len=True)
- # Ensure this same test does fail if we _don't_ include per-request seeds
- with pytest.raises(AssertionError):
- run_equality_correctness_test(baseline_llm_generator,
- test_llm_generator,
- batch_size,
- max_output_len=output_len,
- temperature=temperature,
- seeded=False,
- force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
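# Back-of-the-envelope for the override above (illustrative): with
# block_size=8, 2 + 256 // 8 = 34 GPU blocks give 34 * 8 = 272 token slots in
# total. Four sequences (batch_size=4 below) each growing toward 128 generated
# tokens need well over 272 slots, so the scheduler is forced to preempt and
# later resume sequences, which is exactly the behavior under test.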
- @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
- @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
- @pytest.mark.parametrize("test_llm_kwargs", [
- {
- "speculative_model": SPEC_MODEL,
- },
- ])
- @pytest.mark.parametrize(
- "output_len",
- [
- # Use small output len for fast test.
- 128,
- ])
- @pytest.mark.parametrize("batch_size", [4])
- @pytest.mark.parametrize("seed", [1])
- def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
- test_llm_generator,
- batch_size: int,
- output_len: int):
- """Verify greedy equality, even when some sequences are preempted mid-
- generation.
- """
- run_greedy_equality_correctness_test(baseline_llm_generator,
- test_llm_generator,
- batch_size,
- max_output_len=output_len,
- force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_model": SPEC_MODEL,
    },
])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use small output len for fast test.
        128,
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
                                                 test_llm_generator,
                                                 batch_size: int,
                                                 output_len: int):
    """Verify greedy equality when the vocab dimension is padded."""

    # Default pad_to is 64; the test model has a vocab_size of 32000.
    def patched_pad_vocab_size(vocab_size, pad_to=None):
        return pad_vocab_size(vocab_size, pad_to=32064)
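
    # Illustrative arithmetic: pad_vocab_size rounds vocab_size up to a
    # multiple of pad_to. With the default pad_to=64, 32000 is already a
    # multiple of 64 (64 * 500), so no padding would occur; forcing
    # pad_to=32064 rounds 32000 up to 32064, introducing 64 padded,
    # never-sampled vocab entries for this test to exercise.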
    with patch(
            "aphrodite.modeling.layers.vocab_parallel_embedding.pad_vocab_size",
            patched_pad_vocab_size):
        run_greedy_equality_correctness_test(baseline_llm_generator,
                                             test_llm_generator,
                                             batch_size,
                                             max_output_len=output_len,
                                             force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            "speculative_model": SPEC_MODEL,
            "num_speculative_tokens": k,
        }
        # Try a range of num. speculative tokens
        for k in range(1, 1 + MAX_SPEC_TOKENS)
    ])
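# With MAX_SPEC_TOKENS = 3, the comprehension above expands to three
# parametrized cases: num_speculative_tokens in {1, 2, 3}.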
- @pytest.mark.parametrize("batch_size", [2])
- @pytest.mark.parametrize(
- "output_len",
- [
- # Use smaller output len for fast test.
- 32,
- ])
- @pytest.mark.parametrize("seed", [1])
- def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
- batch_size: int, output_len: int):
- """Verify that mlp speculative decoding produces exact equality
- to without spec decode with different values of num_speculative_tokens.
- """
- run_greedy_equality_correctness_test(baseline_llm_generator,
- test_llm_generator,
- batch_size,
- max_output_len=output_len,
- force_output_len=True)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_model": SPEC_MODEL,
                             "speculative_disable_by_batch_size": 4
                         }])
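# Illustrative reading of the kwargs above: once the running batch reaches
# speculative_disable_by_batch_size=4 sequences, speculation is skipped, so
# batch_size=5 below exercises the fallback to normal decoding while
# batch_size=1 keeps speculation active.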
- @pytest.mark.parametrize("batch_size", [1, 5])
- @pytest.mark.parametrize(
- "output_len",
- [
- # Use smaller output len for fast test.
- 32,
- ])
- @pytest.mark.parametrize("seed", [1])
- def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
- batch_size: int, output_len: int):
- """Verify that mlp speculative decoding produces exact equality
- to without spec decode when speculation is disabled for large
- batch sizes.
- """
- run_greedy_equality_correctness_test(baseline_llm_generator,
- test_llm_generator,
- batch_size,
- max_output_len=output_len,
- force_output_len=True)
|