# test_eagle_correctness.py
"""This docstring details important information on the testing methodology.

Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal
non-speculative decoding.

Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.

However, we still need to verify that the following scenarios pass:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Test greedy equality under preemption
* Test greedy equality under various numbers of speculative tokens.

With those tests, we can say at least, EAGLE would not break the
correctness for the target model outputs.
"""
  17. import pytest
  18. from .conftest import run_greedy_equality_correctness_test
  19. # main model
  20. MAIN_MODEL = "JackFram/llama-68m"
  21. # speculative model
  22. SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"
  23. # max. number of speculative tokens: this corresponds to
  24. # num_heads in the config.json of the speculator model.
  25. MAX_SPEC_TOKENS = 4
  26. # precision
  27. PRECISION = "float32"
  28. @pytest.mark.parametrize(
  29. "common_llm_kwargs",
  30. [
  31. {
  32. # Skip cuda graph recording for fast test.
  33. "enforce_eager": True,
  34. # Required for spec decode.
  35. "use_v2_block_manager": True,
  36. # Print spec metrics.
  37. "disable_log_stats": False,
  38. # Precision
  39. "dtype": PRECISION,
  40. # Main model
  41. "model": MAIN_MODEL,
  42. # Get the safetensors model
  43. "revision": "refs/pr/9",
  44. }
  45. ],
  46. )
  47. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  48. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  49. @pytest.mark.parametrize(
  50. "test_llm_kwargs",
  51. [
  52. {
  53. "speculative_model": SPEC_MODEL,
  54. "num_speculative_tokens": MAX_SPEC_TOKENS,
  55. },
  56. ],
  57. )
  58. @pytest.mark.parametrize(
  59. "output_len",
  60. [
  61. 128,
  62. ],
  63. )
  64. @pytest.mark.parametrize("batch_size", [1, 32])
  65. @pytest.mark.parametrize("seed", [1])
  66. def test_eagle_e2e_greedy_correctness(
  67. baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int
  68. ):
  69. """Verify greedy equality with different batch size."""
  70. run_greedy_equality_correctness_test(
  71. baseline_llm_generator,
  72. test_llm_generator,
  73. batch_size,
  74. max_output_len=output_len,
  75. force_output_len=True,
  76. )
  77. @pytest.mark.parametrize(
  78. "common_llm_kwargs",
  79. [
  80. {
  81. "enforce_eager": False,
  82. # Required for spec decode.
  83. "use_v2_block_manager": True,
  84. # Print spec metrics.
  85. "disable_log_stats": False,
  86. # Precision
  87. "dtype": PRECISION,
  88. # Main model
  89. "model": MAIN_MODEL,
  90. # Get the safetensors model
  91. "revision": "refs/pr/9",
  92. }
  93. ],
  94. )
  95. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  96. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  97. @pytest.mark.parametrize(
  98. "test_llm_kwargs",
  99. [
  100. {
  101. "speculative_model": SPEC_MODEL,
  102. "num_speculative_tokens": MAX_SPEC_TOKENS,
  103. },
  104. ],
  105. )
  106. @pytest.mark.parametrize(
  107. "output_len",
  108. [
  109. 128,
  110. ],
  111. )
  112. @pytest.mark.parametrize("batch_size", [1, 32])
  113. @pytest.mark.parametrize("seed", [1])
  114. def test_eagle_e2e_greedy_correctness_cuda_graph(
  115. baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int
  116. ):
  117. """Verify greedy equality with cuda graph enabled and different
  118. batch sizes."""
  119. run_greedy_equality_correctness_test(
  120. baseline_llm_generator,
  121. test_llm_generator,
  122. batch_size,
  123. max_output_len=output_len,
  124. force_output_len=True,
  125. )
  126. @pytest.mark.parametrize(
  127. "common_llm_kwargs",
  128. [
  129. {
  130. "block_size": 8,
  131. # 2 for small prompt, 256//8 for generated.
  132. "num_gpu_blocks_override": 2 + 256 // 8,
  133. "max_model_len": (2 + 256 // 8) * 8,
  134. # Skip cuda graph recording for fast test.
  135. "enforce_eager": True,
  136. # Required for spec decode.
  137. "use_v2_block_manager": True,
  138. # Precision
  139. "dtype": PRECISION,
  140. # Main model
  141. "model": MAIN_MODEL,
  142. # Get the safetensors model
  143. "revision": "refs/pr/9",
  144. }
  145. ],
  146. )
  147. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  148. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  149. @pytest.mark.parametrize(
  150. "test_llm_kwargs",
  151. [
  152. {
  153. "speculative_model": SPEC_MODEL,
  154. "num_speculative_tokens": MAX_SPEC_TOKENS,
  155. },
  156. ],
  157. )
  158. @pytest.mark.parametrize(
  159. "output_len",
  160. [
  161. # Use small output len for fast test.
  162. 128,
  163. ],
  164. )
  165. @pytest.mark.parametrize("batch_size", [4])
  166. @pytest.mark.parametrize("seed", [1])
  167. def test_eagle_e2e_greedy_correctness_with_preemption(
  168. baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int
  169. ):
  170. """Verify greedy equality, even when some sequences are preempted mid-
  171. generation.
  172. """
  173. run_greedy_equality_correctness_test(
  174. baseline_llm_generator,
  175. test_llm_generator,
  176. batch_size,
  177. max_output_len=output_len,
  178. force_output_len=True,
  179. )
  180. @pytest.mark.parametrize(
  181. "common_llm_kwargs",
  182. [
  183. {
  184. # Skip cuda graph recording for fast test.
  185. "enforce_eager": True,
  186. # Required for spec decode.
  187. "use_v2_block_manager": True,
  188. # Precision
  189. "dtype": PRECISION,
  190. # Main model
  191. "model": MAIN_MODEL,
  192. # Get the safetensors model
  193. "revision": "refs/pr/9",
  194. }
  195. ],
  196. )
  197. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  198. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  199. @pytest.mark.parametrize(
  200. "test_llm_kwargs",
  201. [
  202. {
  203. "speculative_model": SPEC_MODEL,
  204. "num_speculative_tokens": k,
  205. }
  206. # Try a range of num. speculative tokens
  207. for k in range(1, 1 + MAX_SPEC_TOKENS)
  208. ],
  209. )
  210. @pytest.mark.parametrize("batch_size", [2])
  211. @pytest.mark.parametrize(
  212. "output_len",
  213. [
  214. # Use smaller output len for fast test.
  215. 32,
  216. ],
  217. )
  218. @pytest.mark.parametrize("seed", [1])
  219. def test_eagle_different_k(
  220. baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int
  221. ):
  222. """Verify that eagle speculative decoding produces exact equality
  223. to without spec decode with different values of num_speculative_tokens.
  224. """
  225. run_greedy_equality_correctness_test(
  226. baseline_llm_generator,
  227. test_llm_generator,
  228. batch_size,
  229. max_output_len=output_len,
  230. force_output_len=True,
  231. )
  232. @pytest.mark.parametrize(
  233. "common_llm_kwargs",
  234. [
  235. {
  236. # Skip cuda graph recording for fast test.
  237. "enforce_eager": True,
  238. # Required for spec decode.
  239. "use_v2_block_manager": True,
  240. # Precision
  241. "dtype": PRECISION,
  242. # Main model
  243. "model": MAIN_MODEL,
  244. # Get the safetensors model
  245. "revision": "refs/pr/9",
  246. }
  247. ],
  248. )
  249. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  250. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  251. @pytest.mark.parametrize(
  252. "test_llm_kwargs",
  253. [
  254. {
  255. "speculative_model": SPEC_MODEL,
  256. "num_speculative_tokens": MAX_SPEC_TOKENS,
  257. "speculative_disable_by_batch_size": 4,
  258. }
  259. ],
  260. )
  261. @pytest.mark.parametrize("batch_size", [1, 5])
  262. @pytest.mark.parametrize(
  263. "output_len",
  264. [
  265. # Use smaller output len for fast test.
  266. 32,
  267. ],
  268. )
  269. @pytest.mark.parametrize("seed", [1])
  270. def test_eagle_disable_queue(
  271. baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int
  272. ):
  273. """Verify that eagle speculative decoding produces exact equality
  274. to without spec decode when speculation is disabled for large
  275. batch sizes.
  276. """
  277. run_greedy_equality_correctness_test(
  278. baseline_llm_generator,
  279. test_llm_generator,
  280. batch_size,
  281. max_output_len=output_len,
  282. force_output_len=True,
  283. )
  284. if __name__ == "__main__":
  285. import pytest
  286. pytest.main([__file__])