test_mlp_correctness.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. """This docstring details important information on the testing methodology.
  2. Most of the tests rely on "greedy equality", where we expect the output of
  3. speculative decoding on a sequence to exactly match the output of normal non-
  4. speculative decoding.
  5. Since speculative decoding with rejection sampling guarantees that the output
  6. distribution matches the target model's output distribution (up to hardware
  7. numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
  8. equality.
  9. However, we still need to verify that the following scenarios pass:
  10. * Batch size 1 greedy equality
  11. * Batch size >1 greedy equality
  12. * Test greedy equality under preemption
  13. * Test greedy equality under various number of speculative tokens.
  14. With those tests, we can at least say that MLPSpeculator does not break the
  15. correctness of the target model outputs.
  16. """
  17. from unittest.mock import patch
  18. import pytest
  19. from aphrodite.modeling.layers.vocab_parallel_embedding import pad_vocab_size
  20. from .conftest import (run_equality_correctness_test,
  21. run_greedy_equality_correctness_test)
# Main (target) model: passed as the "model" engine argument by every test
# in this file.
MAIN_MODEL = "JackFram/llama-160m"

# Speculative model: passed as the "speculative_model" engine argument.
SPEC_MODEL = "ibm-fms/llama-160m-accelerator"

# Max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
MAX_SPEC_TOKENS = 3

# Precision: passed as the "dtype" engine argument by every test in this file.
PRECISION = "float32"
  31. @pytest.mark.parametrize(
  32. "common_llm_kwargs",
  33. [{
  34. # Skip cuda graph recording for fast test.
  35. "enforce_eager": True,
  36. # Required for spec decode.
  37. "use_v2_block_manager": True,
  38. # Print spec metrics.
  39. "disable_log_stats": False,
  40. # Precision
  41. "dtype": PRECISION,
  42. # Main model
  43. "model": MAIN_MODEL,
  44. }])
  45. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  46. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  47. @pytest.mark.parametrize("test_llm_kwargs", [
  48. {
  49. "speculative_model": SPEC_MODEL,
  50. },
  51. ])
  52. @pytest.mark.parametrize("output_len", [
  53. 128,
  54. ])
  55. @pytest.mark.parametrize("batch_size", [1, 32])
  56. @pytest.mark.parametrize("seed", [1])
  57. def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
  58. batch_size: int, output_len: int):
  59. """Verify greedy equality with different batch size."""
  60. run_greedy_equality_correctness_test(baseline_llm_generator,
  61. test_llm_generator,
  62. batch_size,
  63. max_output_len=output_len,
  64. force_output_len=True)
  65. @pytest.mark.parametrize(
  66. "common_llm_kwargs",
  67. [{
  68. # Skip cuda graph recording for fast test.
  69. "enforce_eager": True,
  70. # Required for spec decode.
  71. "use_v2_block_manager": True,
  72. # Print spec metrics.
  73. "disable_log_stats": False,
  74. # Precision
  75. "dtype": PRECISION,
  76. # Main model
  77. "model": MAIN_MODEL,
  78. # Speculative model
  79. "speculative_model": SPEC_MODEL,
  80. }])
  81. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  82. @pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
  83. @pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
  84. @pytest.mark.parametrize("output_len", [64])
  85. @pytest.mark.parametrize("batch_size", [1, 32])
  86. @pytest.mark.parametrize("temperature", [0.1, 1.0])
  87. @pytest.mark.parametrize("seed", [None])
  88. def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
  89. batch_size: int, output_len: int,
  90. temperature: float):
  91. """Verify seeded runs produce the same output."""
  92. run_equality_correctness_test(baseline_llm_generator,
  93. test_llm_generator,
  94. batch_size,
  95. max_output_len=output_len,
  96. temperature=temperature,
  97. seeded=True,
  98. force_output_len=True)
  99. # Ensure this same test does fail if we _don't_ include per-request seeds
  100. with pytest.raises(AssertionError):
  101. run_equality_correctness_test(baseline_llm_generator,
  102. test_llm_generator,
  103. batch_size,
  104. max_output_len=output_len,
  105. temperature=temperature,
  106. seeded=False,
  107. force_output_len=True)
  108. @pytest.mark.parametrize(
  109. "common_llm_kwargs",
  110. [{
  111. "block_size": 8,
  112. # 2 for small prompt, 256//8 for generated.
  113. "num_gpu_blocks_override": 2 + 256 // 8,
  114. "max_model_len": (2 + 256 // 8) * 8,
  115. # Skip cuda graph recording for fast test.
  116. "enforce_eager": True,
  117. # Required for spec decode.
  118. "use_v2_block_manager": True,
  119. # Precision
  120. "dtype": PRECISION,
  121. # Main model
  122. "model": MAIN_MODEL,
  123. }])
  124. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  125. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  126. @pytest.mark.parametrize("test_llm_kwargs", [
  127. {
  128. "speculative_model": SPEC_MODEL,
  129. },
  130. ])
  131. @pytest.mark.parametrize(
  132. "output_len",
  133. [
  134. # Use small output len for fast test.
  135. 128,
  136. ])
  137. @pytest.mark.parametrize("batch_size", [4])
  138. @pytest.mark.parametrize("seed", [1])
  139. def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
  140. test_llm_generator,
  141. batch_size: int,
  142. output_len: int):
  143. """Verify greedy equality, even when some sequences are preempted mid-
  144. generation.
  145. """
  146. run_greedy_equality_correctness_test(baseline_llm_generator,
  147. test_llm_generator,
  148. batch_size,
  149. max_output_len=output_len,
  150. force_output_len=True)
  151. @pytest.mark.parametrize(
  152. "common_llm_kwargs",
  153. [{
  154. "block_size": 8,
  155. # 2 for small prompt, 256//8 for generated.
  156. "num_gpu_blocks_override": 2 + 256 // 8,
  157. "max_model_len": (2 + 256 // 8) * 8,
  158. # Skip cuda graph recording for fast test.
  159. "enforce_eager": True,
  160. # Required for spec decode.
  161. "use_v2_block_manager": True,
  162. # Precision
  163. "dtype": PRECISION,
  164. # Main model
  165. "model": MAIN_MODEL,
  166. }])
  167. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  168. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  169. @pytest.mark.parametrize("test_llm_kwargs", [
  170. {
  171. "speculative_model": SPEC_MODEL,
  172. },
  173. ])
  174. @pytest.mark.parametrize(
  175. "output_len",
  176. [
  177. # Use small output len for fast test.
  178. 128,
  179. ])
  180. @pytest.mark.parametrize("batch_size", [4])
  181. @pytest.mark.parametrize("seed", [1])
  182. def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
  183. test_llm_generator,
  184. batch_size: int,
  185. output_len: int):
  186. """Verify greedy equality when the vocab dimension is padded
  187. """
  188. # Default pad_to is 64, test model has vocab_size of 32000
  189. def patched_pad_vocab_size(vocab_size, pad_to=None):
  190. return pad_vocab_size(vocab_size, pad_to=32064)
  191. with patch(
  192. "aphrodite.modeling.layers.vocab_parallel_embedding.pad_vocab_size",
  193. patched_pad_vocab_size):
  194. run_greedy_equality_correctness_test(baseline_llm_generator,
  195. test_llm_generator,
  196. batch_size,
  197. max_output_len=output_len,
  198. force_output_len=True)
  199. @pytest.mark.parametrize(
  200. "common_llm_kwargs",
  201. [{
  202. # Skip cuda graph recording for fast test.
  203. "enforce_eager": True,
  204. # Required for spec decode.
  205. "use_v2_block_manager": True,
  206. # Precision
  207. "dtype": PRECISION,
  208. # Main model
  209. "model": MAIN_MODEL,
  210. }])
  211. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  212. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  213. @pytest.mark.parametrize(
  214. "test_llm_kwargs",
  215. [
  216. {
  217. "speculative_model": SPEC_MODEL,
  218. "num_speculative_tokens": k,
  219. }
  220. # Try a range of num. speculative tokens
  221. for k in range(1, 1 + MAX_SPEC_TOKENS)
  222. ])
  223. @pytest.mark.parametrize("batch_size", [2])
  224. @pytest.mark.parametrize(
  225. "output_len",
  226. [
  227. # Use smaller output len for fast test.
  228. 32,
  229. ])
  230. @pytest.mark.parametrize("seed", [1])
  231. def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
  232. batch_size: int, output_len: int):
  233. """Verify that mlp speculative decoding produces exact equality
  234. to without spec decode with different values of num_speculative_tokens.
  235. """
  236. run_greedy_equality_correctness_test(baseline_llm_generator,
  237. test_llm_generator,
  238. batch_size,
  239. max_output_len=output_len,
  240. force_output_len=True)
  241. @pytest.mark.parametrize(
  242. "common_llm_kwargs",
  243. [{
  244. # Skip cuda graph recording for fast test.
  245. "enforce_eager": True,
  246. # Required for spec decode.
  247. "use_v2_block_manager": True,
  248. # Precision
  249. "dtype": PRECISION,
  250. # Main model
  251. "model": MAIN_MODEL,
  252. }])
  253. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  254. @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
  255. @pytest.mark.parametrize("test_llm_kwargs",
  256. [{
  257. "speculative_model": SPEC_MODEL,
  258. "speculative_disable_by_batch_size": 4
  259. }])
  260. @pytest.mark.parametrize("batch_size", [1, 5])
  261. @pytest.mark.parametrize(
  262. "output_len",
  263. [
  264. # Use smaller output len for fast test.
  265. 32,
  266. ])
  267. @pytest.mark.parametrize("seed", [1])
  268. def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
  269. batch_size: int, output_len: int):
  270. """Verify that mlp speculative decoding produces exact equality
  271. to without spec decode when speculation is disabled for large
  272. batch sizes.
  273. """
  274. run_greedy_equality_correctness_test(baseline_llm_generator,
  275. test_llm_generator,
  276. batch_size,
  277. max_output_len=output_len,
  278. force_output_len=True)