test_correctness.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. from itertools import cycle
  2. import pytest
  3. from aphrodite import SamplingParams
  4. from .conftest import get_token_ids_from_llm_generator
  5. @pytest.mark.parametrize(
  6. "common_llm_kwargs",
  7. [{
  8. # Use a small model for a fast test.
  9. "model": "facebook/opt-125m",
  10. # skip cuda graph creation for fast test.
  11. "enforce_eager": True,
  12. # Allow only 5 sequences of ~1024 tokens in worst case.
  13. "block_size": 16,
  14. "num_gpu_blocks_override": 5 * (64 + 1),
  15. }])
  16. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  17. @pytest.mark.parametrize("baseline_llm_kwargs", [{
  18. "use_v2_block_manager": False
  19. }])
  20. @pytest.mark.parametrize("test_llm_kwargs", [{
  21. "use_v2_block_manager": True,
  22. "preemption_mode": "swap"
  23. }, {
  24. "use_v2_block_manager": True,
  25. "preemption_mode": "recompute"
  26. }])
  27. @pytest.mark.parametrize("batch_size", [10])
  28. @pytest.mark.parametrize("seed", [1])
  29. def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
  30. test_llm_generator, batch_size):
  31. """Verify block manager v2 produces same outputs as block manager v1, even
  32. when there is preemption.
  33. This constructs two LLM, each with limited number of GPU blocks. The limit
  34. is decided such that as the sequences in the batch grow, sequences must be
  35. preempted and removed from cache.
  36. If the output token ids are equivalent, then we have confidence that the KV
  37. cache is not corrupted in the v2 block manager.
  38. NOTE: We want a significant number of generated tokens so that any incorrect
  39. KV mapping has time to build up error.
  40. """
  41. output_len = 1024
  42. temperature = 0.0
  43. # We want to ensure equality even with preemption.
  44. # We force the total block size to be 1 + cdiv(output_len, block_size)
  45. # so that only one sequence can fit at a time (once the sequences grow).
  46. prompts = [
  47. "Hello, my name is",
  48. "The president of the United States is",
  49. "The capital of France is",
  50. "The future of AI is",
  51. ]
  52. prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
  53. sampling_params = SamplingParams(
  54. max_tokens=output_len,
  55. ignore_eos=True,
  56. temperature=temperature,
  57. )
  58. print('Getting token ids from block manager v1')
  59. baseline_token_ids = get_token_ids_from_llm_generator(
  60. baseline_llm_generator, prompts, sampling_params)
  61. print('Getting token ids from block manager v2')
  62. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  63. prompts, sampling_params)
  64. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  65. test_token_ids):
  66. assert expected_token_ids == actual_token_ids
  67. assert baseline_token_ids == test_token_ids
  68. @pytest.mark.parametrize(
  69. "common_llm_kwargs",
  70. [{
  71. # Use a small model for a fast test.
  72. "model": "facebook/opt-125m",
  73. # skip cuda graph creation for fast test.
  74. "enforce_eager": True,
  75. # Use a large block size to trigger more copy-on-writes.
  76. "block_size": 32,
  77. }])
  78. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  79. @pytest.mark.parametrize("baseline_llm_kwargs", [{
  80. "use_v2_block_manager": False
  81. }])
  82. @pytest.mark.parametrize("test_llm_kwargs", [{
  83. "use_v2_block_manager": True,
  84. "preemption_mode": "swap"
  85. }, {
  86. "use_v2_block_manager": True,
  87. "preemption_mode": "recompute"
  88. }])
  89. @pytest.mark.parametrize("batch_size", [10])
  90. @pytest.mark.parametrize("seed", [1])
  91. def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
  92. test_llm_generator, batch_size):
  93. """Verify beam search equality with block manager v1 and v2.
  94. This requires copy-on-writes; if the v1 and v2 output is the same, then
  95. we have some confidence cow is working.
  96. """
  97. output_len = 128
  98. temperature = 0.0
  99. prompts = [
  100. "Hello, my name is",
  101. "The president of the United States is",
  102. "The capital of France is",
  103. "The future of AI is",
  104. ]
  105. prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
  106. sampling_params = SamplingParams(
  107. max_tokens=output_len,
  108. ignore_eos=True,
  109. temperature=temperature,
  110. use_beam_search=True,
  111. best_of=2,
  112. )
  113. print('Getting token ids from block manager v1')
  114. baseline_token_ids = get_token_ids_from_llm_generator(
  115. baseline_llm_generator, prompts, sampling_params)
  116. print('Getting token ids from block manager v2')
  117. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  118. prompts, sampling_params)
  119. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  120. test_token_ids):
  121. assert expected_token_ids == actual_token_ids
  122. assert baseline_token_ids == test_token_ids
  123. @pytest.mark.parametrize(
  124. "common_llm_kwargs",
  125. [{
  126. # Use a small model for a fast test.
  127. "model": "facebook/opt-125m",
  128. # Our prompts will generate 128 tokens; since the prompts themselves are
  129. # small, we don't need much KV space beyond 128.
  130. "max_model_len": 160,
  131. # skip cuda graph creation for fast test.
  132. "enforce_eager": True,
  133. # Lookahead scheduling only supported in v2 block manager.
  134. "use_v2_block_manager": True,
  135. }])
  136. @pytest.mark.parametrize(
  137. "per_test_common_llm_kwargs",
  138. [
  139. {
  140. "block_size": 16,
  141. # Allow only 2 sequences of ~128 tokens in worst case.
  142. # Note 8 = 128/block_size
  143. "num_gpu_blocks_override": 2 * (8 + 1),
  144. },
  145. {
  146. "block_size": 8,
  147. # Allow only 2 sequences of ~128 tokens in worst case.
  148. # Note 16 = 128/block_size
  149. "num_gpu_blocks_override": 2 * (16 + 2),
  150. }
  151. ])
  152. @pytest.mark.parametrize("baseline_llm_kwargs", [{
  153. "num_lookahead_slots": 0,
  154. }])
  155. @pytest.mark.parametrize(
  156. "test_llm_kwargs",
  157. [
  158. {
  159. # We run one test with block_size < lookahead_slots, one test with
  160. # block_size > lookahead_slots
  161. "num_lookahead_slots": 10,
  162. "preemption_mode": "swap",
  163. },
  164. {
  165. "num_lookahead_slots": 10,
  166. "preemption_mode": "recompute",
  167. }
  168. ])
  169. @pytest.mark.parametrize("batch_size", [4])
  170. @pytest.mark.parametrize("seed", [1])
  171. def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
  172. test_llm_generator,
  173. batch_size):
  174. """Verify Aphrodite produces the same output with greedy sampling, when
  175. lookahead scheduling is used vs. not.
  176. Lookahead scheduling is not expected to modify the output, as it simply
  177. allocates empty slots ahead of the known token ids in a sliding fashion.
  178. This test constrains the total number of blocks to force preemption. It also
  179. varies the block size so that the lookahead size is less than and greater
  180. than the block size.
  181. """
  182. output_len = 128
  183. temperature = 0.0
  184. prompts = [
  185. "Hello, my name is",
  186. "The president of the United States is",
  187. "The capital of France is",
  188. "The future of AI is",
  189. ]
  190. prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
  191. sampling_params = SamplingParams(
  192. max_tokens=output_len,
  193. ignore_eos=True,
  194. temperature=temperature,
  195. )
  196. print('Getting token ids without lookahead scheduling')
  197. baseline_token_ids = get_token_ids_from_llm_generator(
  198. baseline_llm_generator, prompts, sampling_params)
  199. print('Getting token ids with lookahead scheduling')
  200. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  201. prompts, sampling_params)
  202. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  203. test_token_ids):
  204. assert expected_token_ids == actual_token_ids
  205. assert baseline_token_ids == test_token_ids
  206. @pytest.mark.parametrize(
  207. "common_llm_kwargs",
  208. [
  209. {
  210. # Use a small model for a fast test.
  211. "model": "facebook/opt-125m",
  212. # skip cuda graph creation for fast test.
  213. "enforce_eager": True,
  214. "enable_chunked_prefill": True,
  215. },
  216. ])
  217. @pytest.mark.parametrize("per_test_common_llm_kwargs",
  218. [{
  219. "block_size": 8,
  220. "max_num_batched_tokens": 2,
  221. "max_num_seqs": 2,
  222. }, {
  223. "block_size": 8,
  224. "max_num_batched_tokens": 3,
  225. "max_num_seqs": 2,
  226. }, {
  227. "block_size": 8,
  228. "max_num_batched_tokens": 256,
  229. "max_num_seqs": 10,
  230. }])
  231. @pytest.mark.parametrize("baseline_llm_kwargs", [
  232. {
  233. "use_v2_block_manager": False,
  234. },
  235. ])
  236. @pytest.mark.parametrize("test_llm_kwargs", [
  237. {
  238. "use_v2_block_manager": True,
  239. "num_lookahead_slots": 0,
  240. },
  241. {
  242. "use_v2_block_manager": True,
  243. "num_lookahead_slots": 5,
  244. },
  245. ])
  246. @pytest.mark.parametrize("batch_size", [4])
  247. @pytest.mark.parametrize("seed", [1])
  248. def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
  249. test_llm_generator, batch_size):
  250. """Verify that chunked prefill works with BlockManagerV2, with and without
  251. lookahead scheduling.
  252. """
  253. output_len = 32
  254. temperature = 0.0
  255. prompts = [
  256. "Hello, my name is",
  257. "The president of the United States is",
  258. ("1 + " * 50) + " 1 = ", # Longer prompt.
  259. "The capital of France is",
  260. "The future of AI is",
  261. ]
  262. prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
  263. sampling_params = SamplingParams(
  264. max_tokens=output_len,
  265. ignore_eos=True,
  266. temperature=temperature,
  267. )
  268. print('Getting token ids with BlockManagerV1')
  269. baseline_token_ids = get_token_ids_from_llm_generator(
  270. baseline_llm_generator, prompts, sampling_params)
  271. print('Getting token ids with BlockManagerV2')
  272. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  273. prompts, sampling_params)
  274. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  275. test_token_ids):
  276. assert expected_token_ids == actual_token_ids
  277. assert baseline_token_ids == test_token_ids
  278. @pytest.mark.parametrize(
  279. "common_llm_kwargs",
  280. [{
  281. # Use a small model for a fast test.
  282. "model": "facebook/opt-125m",
  283. # skip cuda graph creation for fast test.
  284. "enforce_eager": True,
  285. # Allow only 5 sequences of ~1024 tokens in worst case.
  286. "block_size": 16,
  287. "num_gpu_blocks_override": 5 * (64 + 1),
  288. # Enable prefill cache
  289. "enable_prefix_caching": True,
  290. }])
  291. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  292. @pytest.mark.parametrize("baseline_llm_kwargs", [{
  293. "use_v2_block_manager": False
  294. }])
  295. @pytest.mark.parametrize("test_llm_kwargs", [{
  296. "use_v2_block_manager": True,
  297. "preemption_mode": "swap"
  298. }, {
  299. "use_v2_block_manager": True,
  300. "preemption_mode": "recompute"
  301. }])
  302. @pytest.mark.parametrize("batch_size", [10])
  303. @pytest.mark.parametrize("seed", [1])
  304. def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
  305. baseline_llm_generator, test_llm_generator, batch_size):
  306. """Verify block manager v2 produces same outputs as block manager v1, even
  307. when there is preemption.
  308. This constructs two LLM, each with limited number of GPU blocks. The limit
  309. is decided such that as the sequences in the batch grow, sequences must be
  310. preempted and removed from cache.
  311. If the output token ids are equivalent, then we have confidence that the KV
  312. cache is not corrupted in the v2 block manager.
  313. NOTE: We want a significant number of generated tokens so that any incorrect
  314. KV mapping has time to build up error.
  315. """
  316. output_len = 1024
  317. temperature = 0.0
  318. # We want to ensure equality even with preemption.
  319. # We force the total block size to be 1 + cdiv(output_len, block_size)
  320. # so that only one sequence can fit at a time (once the sequences grow).
  321. prompts = [
  322. "Hello, my name is",
  323. "The president of the United States is",
  324. "The capital of France is",
  325. "The future of AI is",
  326. ]
  327. prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
  328. sampling_params = SamplingParams(
  329. max_tokens=output_len,
  330. ignore_eos=True,
  331. temperature=temperature,
  332. )
  333. print('Getting token ids from block manager v1')
  334. baseline_token_ids = get_token_ids_from_llm_generator(
  335. baseline_llm_generator, prompts, sampling_params)
  336. print('Getting token ids from block manager v2')
  337. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  338. prompts, sampling_params)
  339. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  340. test_token_ids):
  341. assert expected_token_ids == actual_token_ids
  342. assert baseline_token_ids == test_token_ids
  343. @pytest.mark.parametrize(
  344. "common_llm_kwargs",
  345. [{
  346. # Use a small model for a fast test.
  347. "model": "facebook/opt-125m",
  348. # skip cuda graph creation for fast test.
  349. "enforce_eager": True,
  350. # Allow only 5 sequences of ~1024 tokens in worst case.
  351. "block_size": 16,
  352. "num_gpu_blocks_override": 5 * (64 + 1),
  353. # Test APC in v2 block
  354. "use_v2_block_manager": True,
  355. }])
  356. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  357. @pytest.mark.parametrize("baseline_llm_kwargs", [{
  358. "enable_prefix_caching": False
  359. }])
  360. @pytest.mark.parametrize("test_llm_kwargs", [{
  361. "enable_prefix_caching": True,
  362. "preemption_mode": "swap"
  363. }, {
  364. "enable_prefix_caching": True,
  365. "preemption_mode": "recompute"
  366. }])
  367. @pytest.mark.parametrize("batch_size", [10])
  368. @pytest.mark.parametrize("seed", [1])
  369. def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
  370. test_llm_generator, batch_size):
  371. """Verify block manager v2 with auto prefix caching enabled produces same
  372. outputs as auto prefix caching disabled, even when there is preemption.
  373. This constructs two LLM, each with limited number of GPU blocks. The limit
  374. is decided such that as the sequences in the batch grow, sequences must be
  375. preempted and removed from cache.
  376. If the output token ids are equivalent, then we have confidence that auto
  377. prefix caching itself at least don't cause result error.
  378. """
  379. output_len = 1024
  380. temperature = 0.0
  381. # We want to ensure equality even with preemption.
  382. # We force the total block size to be 1 + cdiv(output_len, block_size)
  383. # so that only one sequence can fit at a time (once the sequences grow).
  384. prompts = [
  385. "Hello, my name is",
  386. "The president of the United States is",
  387. "The capital of France is",
  388. "The future of AI is",
  389. ]
  390. prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
  391. sampling_params = SamplingParams(
  392. max_tokens=output_len,
  393. ignore_eos=True,
  394. temperature=temperature,
  395. )
  396. print('Getting token ids with APC disabled')
  397. baseline_token_ids = get_token_ids_from_llm_generator(
  398. baseline_llm_generator, prompts, sampling_params)
  399. print('Getting token ids with APC enabled')
  400. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  401. prompts, sampling_params)
  402. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  403. test_token_ids):
  404. assert expected_token_ids == actual_token_ids
  405. assert baseline_token_ids == test_token_ids
  406. @pytest.mark.parametrize(
  407. "common_llm_kwargs",
  408. [{
  409. # Use a small model for a fast test.
  410. "model": "facebook/opt-125m",
  411. # skip cuda graph creation for fast test.
  412. "enforce_eager": True,
  413. # we keep the blocks small, so that hit eviction quickly
  414. "max_model_len": 48,
  415. "block_size": 16,
  416. "num_gpu_blocks_override": 3,
  417. # Test APC in v2 block
  418. "use_v2_block_manager": True,
  419. }])
  420. @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
  421. @pytest.mark.parametrize("baseline_llm_kwargs", [{
  422. "enable_prefix_caching": False
  423. }])
  424. @pytest.mark.parametrize("test_llm_kwargs", [{
  425. "enable_prefix_caching": True,
  426. }])
  427. @pytest.mark.parametrize("seed", [1])
  428. def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
  429. test_llm_generator):
  430. """Verify block manager v2 with auto prefix caching could works normal
  431. even when eviction started.
  432. With APC enabled, all blocks are held by native block at the beginning.
  433. Then blocks are managed by evictor instead. If cache hit at the evitor's
  434. block, then it could be reused, or we need to recompute its kv cache.
  435. """
  436. output_len = 10
  437. temperature = 0.0
  438. prompts = [
  439. "You are a helpful assistant. Please answer truthfully and write "
  440. "out your thinking step by step to be sure you get the right answer. "
  441. "If you make a mistake, attempt to correct it. who are you?",
  442. "You are a helpful assistant. Please answer truthfully and write out "
  443. "your thinking step by step to be sure you get the right answer. You "
  444. "are helpful and harmless and you follow ethical guidelines. "
  445. "who are you?"
  446. ]
  447. sampling_params = SamplingParams(
  448. max_tokens=output_len,
  449. ignore_eos=True,
  450. temperature=temperature,
  451. )
  452. print('Getting token ids with APC disabled')
  453. baseline_token_ids = get_token_ids_from_llm_generator(
  454. baseline_llm_generator, prompts, sampling_params)
  455. print('Getting token ids with APC enabled')
  456. test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
  457. prompts, sampling_params)
  458. for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
  459. test_token_ids):
  460. assert expected_token_ids == actual_token_ids
  461. assert baseline_token_ids == test_token_ids