test_ngram_correctness.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. """This docstring details important information on the testing methodology.
  2. Most of the tests rely on "greedy equality", where we expect the output of
  3. speculative decoding on a sequence to exactly match the output of normal non-
  4. speculative decoding.
  5. Since speculative decoding with rejection sampling guarantees that the output
  6. distribution matches the target model's output distribution (up to hardware
  7. numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
  8. equality.
  9. The idea of ngram lookup comes from https://github.com/apoorvumang/prompt-lookup-decoding,
  10. and was merged into the transformers code base: https://github.com/huggingface/transformers/pull/27775.
  11. Since no model is needed to generate the proposals, we can make
  12. the test cases much simpler than the drafter multi-step ones.
  13. However, we still need to verify that the scenarios below pass:
  14. * Batch size 1 greedy equality
  15. * Batch size >1 greedy equality
  16. * Test greedy equality under preemption
  17. * Test greedy equality under various ngram sizes / speculative sizes
  18. With those tests, we can at least say that ngram spec does not break the
  19. correctness of the target model outputs.
  20. """
  21. import pytest
  22. from .conftest import run_greedy_equality_correctness_test
  23. @pytest.mark.parametrize(
  24. "common_llm_kwargs",
  25. [{
  26. # Skip cuda graph recording for fast test.
  27. "enforce_eager": True,
  28. # Required for spec decode.
  29. "use_v2_block_manager": True,
  30. # Print spec metrics.
  31. "disable_log_stats": False,
  32. }])
  33. @pytest.mark.parametrize("per_test_common_llm_kwargs", [
  34. {
  35. "model": "JackFram/llama-68m",
  36. },
  37. ])
  38. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  39. @pytest.mark.parametrize("test_llm_kwargs", [
  40. {
  41. "speculative_model": "[ngram]",
  42. "num_speculative_tokens": 5,
  43. "ngram_prompt_lookup_max": 3,
  44. },
  45. ])
  46. @pytest.mark.parametrize("output_len", [
  47. 256,
  48. ])
  49. @pytest.mark.parametrize("batch_size", [1, 32])
  50. @pytest.mark.parametrize("seed", [1])
  51. def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
  52. test_llm_generator, batch_size: int,
  53. output_len: int):
  54. """Verify greedy equality on a tiny model with different batch size."""
  55. run_greedy_equality_correctness_test(baseline_llm_generator,
  56. test_llm_generator,
  57. batch_size,
  58. max_output_len=output_len,
  59. force_output_len=True)
  60. @pytest.mark.parametrize(
  61. "common_llm_kwargs",
  62. [{
  63. "block_size": 8,
  64. # 2 for small prompt, 256//8 for generated.
  65. "num_gpu_blocks_override": 2 + 256 // 8,
  66. "max_model_len": (2 + 256 // 8) * 8,
  67. # Skip cuda graph recording for fast test.
  68. "enforce_eager": True,
  69. # Required for spec decode.
  70. "use_v2_block_manager": True
  71. }])
  72. @pytest.mark.parametrize("per_test_common_llm_kwargs", [
  73. {
  74. "model": "JackFram/llama-160m",
  75. },
  76. ])
  77. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  78. @pytest.mark.parametrize("test_llm_kwargs", [
  79. {
  80. "speculative_model": "[ngram]",
  81. "num_speculative_tokens": 5,
  82. "ngram_prompt_lookup_max": 3,
  83. },
  84. ])
  85. @pytest.mark.parametrize(
  86. "output_len",
  87. [
  88. # Use small output len for fast test.
  89. 256,
  90. ])
  91. @pytest.mark.parametrize("batch_size", [4])
  92. @pytest.mark.parametrize("seed", [1])
  93. def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
  94. test_llm_generator,
  95. batch_size: int,
  96. output_len: int):
  97. """Verify greedy equality, even when some sequences are preempted mid-
  98. generation.
  99. """
  100. run_greedy_equality_correctness_test(baseline_llm_generator,
  101. test_llm_generator,
  102. batch_size,
  103. max_output_len=output_len,
  104. force_output_len=True)
  105. @pytest.mark.parametrize(
  106. "common_llm_kwargs",
  107. [{
  108. "model": "JackFram/llama-68m",
  109. # Skip cuda graph recording for fast test.
  110. "enforce_eager": True,
  111. # Required for spec decode.
  112. "use_v2_block_manager": True
  113. }])
  114. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  115. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  116. @pytest.mark.parametrize(
  117. "test_llm_kwargs",
  118. [
  119. {
  120. "speculative_model": "[ngram]",
  121. "num_speculative_tokens": k,
  122. "ngram_prompt_lookup_max": 3,
  123. }
  124. # Try a range of common k, as well as large speculation.
  125. for k in [1, 3, 5]
  126. ] + [
  127. {
  128. "speculative_model": "[ngram]",
  129. "num_speculative_tokens": k,
  130. "ngram_prompt_lookup_max": 1,
  131. }
  132. # Try a range of common k, as well as large speculation.
  133. for k in [1, 3, 5]
  134. ])
  135. @pytest.mark.parametrize("batch_size", [2])
  136. @pytest.mark.parametrize(
  137. "output_len",
  138. [
  139. # Use smaller output len for fast test.
  140. 32,
  141. ])
  142. @pytest.mark.parametrize("seed", [1])
  143. def test_ngram_different_k(baseline_llm_generator, test_llm_generator,
  144. batch_size: int, output_len: int):
  145. """Verify that ngram speculative decoding produces exact equality
  146. to without spec decode with many different values of k and
  147. different ngram_prompt_lookup_max.
  148. """
  149. run_greedy_equality_correctness_test(baseline_llm_generator,
  150. test_llm_generator,
  151. batch_size,
  152. max_output_len=output_len,
  153. force_output_len=True)
  154. @pytest.mark.parametrize(
  155. "common_llm_kwargs",
  156. [{
  157. "model": "JackFram/llama-68m",
  158. # Skip cuda graph recording for fast test.
  159. "enforce_eager": True,
  160. # Required for spec decode.
  161. "use_v2_block_manager": True
  162. }])
  163. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  164. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  165. @pytest.mark.parametrize("test_llm_kwargs",
  166. [{
  167. "speculative_model": "[ngram]",
  168. "num_speculative_tokens": 5,
  169. "ngram_prompt_lookup_max": 3,
  170. "speculative_disable_by_batch_size": 4
  171. }])
  172. @pytest.mark.parametrize("batch_size", [1, 5])
  173. @pytest.mark.parametrize(
  174. "output_len",
  175. [
  176. # Use smaller output len for fast test.
  177. 32,
  178. ])
  179. @pytest.mark.parametrize("seed", [1])
  180. def test_ngram_disable_queue(baseline_llm_generator, test_llm_generator,
  181. batch_size: int, output_len: int):
  182. """Verify that ngram speculative decoding produces exact equality
  183. to without spec decode with many different values of k and
  184. different ngram_prompt_lookup_max.
  185. """
  186. run_greedy_equality_correctness_test(baseline_llm_generator,
  187. test_llm_generator,
  188. batch_size,
  189. max_output_len=output_len,
  190. force_output_len=True)