test_block_table.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577
  1. from typing import List
  2. import pytest
  3. from aphrodite.common.utils import Device, cdiv, chunk_list
  4. from aphrodite.processing.block.block_table import BlockTable
  5. from aphrodite.processing.block.cpu_gpu_block_allocator import (
  6. CpuGpuBlockAllocator)
  7. @pytest.mark.parametrize("block_size", [16])
  8. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  9. def test_allocate_naive(block_size: int, sequence_len: int):
  10. """Test the allocation of blocks using the naive allocator.
  11. This test creates a CpuGpuBlockAllocator with the specified block size and
  12. number of blocks. It then allocates multiple BlockTables with varying
  13. sequence lengths and verifies that the number of free blocks decreases as
  14. expected after each allocation.
  15. """
  16. assert block_size > 1
  17. num_gpu_blocks = 1024
  18. allocator = CpuGpuBlockAllocator.create(
  19. allocator_type="naive",
  20. num_gpu_blocks=num_gpu_blocks,
  21. num_cpu_blocks=1024,
  22. block_size=block_size,
  23. )
  24. token_ids = list(range(sequence_len))
  25. num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
  26. block_tables: List[BlockTable] = []
  27. for i in range(5):
  28. assert allocator.get_num_free_blocks(
  29. device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
  30. block_tables.append(
  31. BlockTable(
  32. block_size=block_size,
  33. block_allocator=allocator,
  34. ))
  35. block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
  36. @pytest.mark.parametrize("block_size", [16])
  37. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  38. def test_allocate_prefix_caching(block_size: int, sequence_len: int):
  39. """Test the allocation of blocks using the prefix caching allocator.
  40. This test creates a CpuGpuBlockAllocator with the specified block size and
  41. number of blocks, using the prefix caching allocator. It then allocates
  42. multiple BlockTables with varying sequence lengths and verifies that the
  43. number of free blocks decreases as expected after each allocation.
  44. The test expects all sequences to share allocations, except for their last
  45. block, which may be mutable. It calculates the expected number of immutable
  46. and mutable blocks per allocation based on the sequence length and block
  47. size.
  48. """
  49. assert block_size > 1
  50. num_gpu_blocks = 1024
  51. allocator = CpuGpuBlockAllocator.create(
  52. allocator_type="prefix_caching",
  53. num_gpu_blocks=num_gpu_blocks,
  54. num_cpu_blocks=1024,
  55. block_size=block_size,
  56. )
  57. token_ids = list(range(sequence_len))
  58. chunked_tokens = list(chunk_list(token_ids, block_size))
  59. num_mutable_blocks_per_alloc = 0 if len(
  60. chunked_tokens[-1]) == block_size else 1
  61. num_immutable_blocks_per_alloc = len(
  62. chunked_tokens) - num_mutable_blocks_per_alloc
  63. block_tables: List[BlockTable] = []
  64. for alloc_i in range(1, 6):
  65. block_tables.append(
  66. BlockTable(
  67. block_size=block_size,
  68. block_allocator=allocator,
  69. ))
  70. block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
  71. # Expect all sequences to share allocations, except for their last block
  72. # (which may be mutable).
  73. assert allocator.get_num_free_blocks(
  74. device=Device.GPU) == num_gpu_blocks - (
  75. num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
  76. (alloc_i))
  77. @pytest.mark.parametrize("block_size", [16])
  78. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  79. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  80. @pytest.mark.parametrize("device", ["cpu", "gpu"])
  81. def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
  82. device: str):
  83. """Test the allocation and freeing of blocks using different allocators and
  84. devices.
  85. This test creates a CpuGpuBlockAllocator with the specified block size,
  86. number of blocks, allocator type, and device. It then allocates a BlockTable
  87. multiple times with the same sequence and verifies that the number of free
  88. blocks remains consistent after each allocation and freeing.
  89. """
  90. device = Device[device.upper()]
  91. num_device_blocks = 1024
  92. allocator = CpuGpuBlockAllocator.create(
  93. allocator_type=allocator_type,
  94. num_gpu_blocks=num_device_blocks,
  95. num_cpu_blocks=num_device_blocks,
  96. block_size=block_size,
  97. )
  98. token_ids = list(range(sequence_len))
  99. num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
  100. block_table = BlockTable(
  101. block_size=block_size,
  102. block_allocator=allocator,
  103. )
  104. for i in range(5):
  105. block_table.allocate(token_ids=token_ids, device=device)
  106. assert allocator.get_num_free_blocks(
  107. device) == num_device_blocks - num_blocks_per_alloc
  108. assert all(block_id is not None
  109. for block_id in block_table.physical_block_ids)
  110. block_table.free()
  111. assert allocator.get_num_free_blocks(device) == num_device_blocks
  112. @pytest.mark.parametrize("block_size", [1, 8])
  113. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  114. @pytest.mark.parametrize("append_len", [1, 16, 129])
  115. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  116. def test_append_token_ids_allocation(block_size: int, sequence_len: int,
  117. append_len: int, allocator_type: str):
  118. """Test the allocation behavior when appending token IDs to a BlockTable.
  119. This test creates a CpuGpuBlockAllocator with the specified block size,
  120. number of blocks, and allocator type. It then allocates a BlockTable with an
  121. initial sequence and appends additional token IDs to it. The test verifies
  122. that the number of allocated blocks before and after appending matches the
  123. expected values.
  124. """
  125. num_gpu_blocks = 1024
  126. allocator = CpuGpuBlockAllocator.create(
  127. allocator_type=allocator_type,
  128. num_gpu_blocks=num_gpu_blocks,
  129. num_cpu_blocks=1024,
  130. block_size=block_size,
  131. )
  132. token_ids = list(range(sequence_len))
  133. token_ids_to_append = list(range(append_len))
  134. block_table = BlockTable(
  135. block_size=block_size,
  136. block_allocator=allocator,
  137. )
  138. num_expected_blocks_before_append = len(
  139. list(chunk_list(token_ids, block_size)))
  140. num_expected_appended_blocks = len(
  141. list(chunk_list(token_ids + token_ids_to_append,
  142. block_size))) - num_expected_blocks_before_append
  143. block_table.allocate(token_ids=token_ids, device=Device.GPU)
  144. assert len(
  145. block_table.physical_block_ids) == num_expected_blocks_before_append
  146. block_table.append_token_ids(token_ids_to_append)
  147. assert len(
  148. block_table.physical_block_ids
  149. ) == num_expected_blocks_before_append + num_expected_appended_blocks
  150. @pytest.mark.parametrize("block_size", [1, 8])
  151. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  152. @pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
  153. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  154. def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
  155. num_empty_slots: int,
  156. allocator_type: str):
  157. """Test the allocation behavior when ensuring a certain number of empty
  158. slots in a BlockTable.
  159. This test creates a CpuGpuBlockAllocator with the specified block size,
  160. number of blocks, and allocator type. It then allocates a BlockTable with an
  161. initial sequence and ensures a certain number of empty slots. The test
  162. verifies that the number of allocated blocks before and after ensuring empty
  163. slots matches the expected values. It also checks that filling up the empty
  164. slots does not consume additional blocks.
  165. """
  166. num_gpu_blocks = 1024
  167. allocator = CpuGpuBlockAllocator.create(
  168. allocator_type=allocator_type,
  169. num_gpu_blocks=num_gpu_blocks,
  170. num_cpu_blocks=1024,
  171. block_size=block_size,
  172. )
  173. token_ids = list(range(sequence_len))
  174. block_table = BlockTable(
  175. block_size=block_size,
  176. block_allocator=allocator,
  177. )
  178. num_expected_blocks_before_append = len(
  179. list(chunk_list(token_ids, block_size)))
  180. num_expected_appended_blocks = len(
  181. list(chunk_list(token_ids + [-1] * num_empty_slots,
  182. block_size))) - num_expected_blocks_before_append
  183. block_table.allocate(token_ids=token_ids, device=Device.GPU)
  184. # Assert that the empty slots consume the expected number of additional
  185. # blocks.
  186. assert len(
  187. block_table.physical_block_ids) == num_expected_blocks_before_append
  188. block_table.ensure_num_empty_slots(num_empty_slots)
  189. assert len(
  190. block_table.physical_block_ids
  191. ) == num_expected_blocks_before_append + num_expected_appended_blocks
  192. # Now, ensure no additional blocks consumed as we fill up the empty slots.
  193. num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
  194. block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
  195. assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
  196. @pytest.mark.parametrize("block_size", [1, 8])
  197. @pytest.mark.parametrize("sequence_len", [1, 9])
  198. @pytest.mark.parametrize("append_len", [1, 16, 129])
  199. @pytest.mark.parametrize("append_size", [1, 4, 129])
  200. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  201. def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
  202. append_len: int, allocator_type: str,
  203. append_size: int):
  204. """Verify token ids are correctly appended. Appends various amounts of
  205. token ids in various append sizes, and verifies the final sequence is
  206. correct.
  207. """
  208. num_gpu_blocks = 1024
  209. allocator = CpuGpuBlockAllocator.create(
  210. allocator_type=allocator_type,
  211. num_gpu_blocks=num_gpu_blocks,
  212. num_cpu_blocks=1024,
  213. block_size=block_size,
  214. )
  215. token_ids = list(range(sequence_len))
  216. token_ids_to_append = list(range(append_len))
  217. block_table = BlockTable(
  218. block_size=block_size,
  219. block_allocator=allocator,
  220. )
  221. block_table.allocate(token_ids=token_ids, device=Device.GPU)
  222. appended_so_far: List[int] = []
  223. for append in chunk_list(token_ids_to_append, append_size):
  224. block_table.append_token_ids(append)
  225. appended_so_far.extend(append)
  226. assert block_table._get_all_token_ids() == token_ids + appended_so_far
  227. assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
  228. @pytest.mark.parametrize("seq_len", [1, 9, 129])
  229. @pytest.mark.parametrize("block_size", [1, 8])
  230. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  231. def test_fork(seq_len: int, block_size: int, allocator_type: str):
  232. """Create a sequence using the specified allocator.
  233. 1. Assert that after forking the sequence, the free block count is the
  234. same.
  235. 2. Assert that the forked sequence has the same physical mappings.
  236. 3. Then free the original sequence; verify that the free block count is
  237. the same.
  238. 4. Finally, free the forked sequence and verify that the free block
  239. count drops to zero.
  240. """
  241. num_gpu_blocks = 1024
  242. allocator = CpuGpuBlockAllocator.create(
  243. allocator_type=allocator_type,
  244. num_gpu_blocks=num_gpu_blocks,
  245. num_cpu_blocks=0,
  246. block_size=block_size,
  247. )
  248. token_ids = list(range(seq_len))
  249. block_table = BlockTable(
  250. block_size=block_size,
  251. block_allocator=allocator,
  252. )
  253. block_table.allocate(token_ids)
  254. num_free_blocks_before_fork = allocator.get_num_free_blocks(
  255. device=Device.GPU)
  256. forked_block_table = block_table.fork()
  257. # Expect physical_block_ids and token_ids to match.
  258. assert (block_table.physical_block_ids ==
  259. forked_block_table.physical_block_ids)
  260. assert block_table._get_all_token_ids(
  261. ) == forked_block_table._get_all_token_ids()
  262. # Do not expect any additional allocations.
  263. assert allocator.get_num_free_blocks(
  264. device=Device.GPU) == num_free_blocks_before_fork
  265. # Free the original blocks. Assert num free blocks does not change, since
  266. # refcount is nonzero.
  267. block_table.free()
  268. assert allocator.get_num_free_blocks(
  269. device=Device.GPU) == num_free_blocks_before_fork
  270. # Expect the forked block table to be unaffected by the free.
  271. assert all(block_id is not None
  272. for block_id in forked_block_table.physical_block_ids)
  273. # Free the forked blocks. Assert num free blocks does change, since
  274. # refcount is now zero.
  275. forked_block_table.free()
  276. assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
  277. @pytest.mark.parametrize("block_size", [8])
  278. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  279. @pytest.mark.parametrize("append_len", [1, 16, 129])
  280. @pytest.mark.parametrize("appender", ["forked", "original"])
  281. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  282. def test_cow(block_size: int, sequence_len: int, append_len: int,
  283. allocator_type: str, appender: str):
  284. """Fork a sequence; append to the forked sequence; verify there's a CoW.
  285. """
  286. num_gpu_blocks = 1024
  287. allocator = CpuGpuBlockAllocator.create(
  288. allocator_type=allocator_type,
  289. num_gpu_blocks=num_gpu_blocks,
  290. num_cpu_blocks=0,
  291. block_size=block_size,
  292. )
  293. token_ids = list(range(sequence_len))
  294. token_ids_to_append = list(range(append_len))
  295. original_block_table = BlockTable(
  296. block_size=block_size,
  297. block_allocator=allocator,
  298. )
  299. num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
  300. num_expected_cow_blocks = cdiv(sequence_len + append_len,
  301. block_size) - (sequence_len // block_size)
  302. original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
  303. original_block_ids = original_block_table.physical_block_ids[:]
  304. print("original_block_ids = {}".format(original_block_ids))
  305. forked_block_table = original_block_table.fork()
  306. # Expect no additional allocation (copy on _write_).
  307. assert allocator.get_num_free_blocks(
  308. Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
  309. if appender == "forked":
  310. appender_block_table = forked_block_table
  311. static_block_table = original_block_table
  312. elif appender == "original":
  313. appender_block_table = original_block_table
  314. static_block_table = forked_block_table
  315. else:
  316. raise ValueError(f"unknown test config {appender=}")
  317. # Write tokens.
  318. appender_block_table.append_token_ids(token_ids_to_append)
  319. # Expect the non-appending block table to have no change.
  320. assert static_block_table.physical_block_ids == original_block_ids
  321. assert appender_block_table.physical_block_ids != original_block_ids
  322. # Expect the blocks changed during append to have a CoW.
  323. assert allocator.get_num_free_blocks(
  324. Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
  325. num_expected_cow_blocks)
  326. cows = allocator.clear_copy_on_writes()
  327. if sequence_len % block_size > 0:
  328. # If the last block in the sequence is not full, then when appending we
  329. # expect a CoW.
  330. assert cows
  331. cow_block_id = sequence_len // block_size
  332. expected_src = static_block_table.physical_block_ids[cow_block_id]
  333. expected_dst = appender_block_table.physical_block_ids[cow_block_id]
  334. assert (expected_src, expected_dst) in cows
  335. else:
  336. # Otherwise, there should be no copy-on-write.
  337. assert not cows
  338. static_block_table.free()
  339. appender_block_table.free()
  340. # After free, expect all blocks to be freed.
  341. assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
  342. @pytest.mark.parametrize("block_size", [8])
  343. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  344. @pytest.mark.parametrize("append_len", [1, 16, 129])
  345. @pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
  346. @pytest.mark.parametrize("appender", ["forked", "original"])
  347. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  348. def test_cow_lookahead_simple(block_size: int, sequence_len: int,
  349. append_len: int, lookahead_slots: int,
  350. allocator_type: str, appender: str):
  351. """Similar to test_cow, except with lookahead allocation. The assertions are
  352. less rigorous due to the complexity of the property under test.
  353. """
  354. num_gpu_blocks = 1024
  355. allocator = CpuGpuBlockAllocator.create(
  356. allocator_type=allocator_type,
  357. num_gpu_blocks=num_gpu_blocks,
  358. num_cpu_blocks=0,
  359. block_size=block_size,
  360. )
  361. token_ids = list(range(sequence_len))
  362. token_ids_to_append = list(range(append_len))
  363. original_block_table = BlockTable(
  364. block_size=block_size,
  365. block_allocator=allocator,
  366. )
  367. original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
  368. # Allocate lookahead slots.
  369. original_block_table.ensure_num_empty_slots(lookahead_slots)
  370. original_block_ids = original_block_table.physical_block_ids[:]
  371. forked_block_table = original_block_table.fork()
  372. if appender == "forked":
  373. appender_block_table = forked_block_table
  374. static_block_table = original_block_table
  375. elif appender == "original":
  376. appender_block_table = original_block_table
  377. static_block_table = forked_block_table
  378. else:
  379. raise ValueError(f"unknown test config {appender=}")
  380. # Write tokens.
  381. appender_block_table.append_token_ids(token_ids_to_append)
  382. # Expect the non-appending block table to have no change.
  383. assert static_block_table.physical_block_ids == original_block_ids
  384. assert appender_block_table.physical_block_ids != original_block_ids
  385. cows = allocator.clear_copy_on_writes()
  386. # Always expect copy-on-write
  387. assert cows
  388. if sequence_len % block_size > 0:
  389. # If the last block in the sequence is not full, then when appending we
  390. # expect a CoW.
  391. assert cows
  392. cow_block_id = sequence_len // block_size
  393. expected_src = static_block_table.physical_block_ids[cow_block_id]
  394. expected_dst = appender_block_table.physical_block_ids[cow_block_id]
  395. assert (expected_src, expected_dst) in cows
  396. static_block_table.free()
  397. appender_block_table.free()
  398. # After free, expect all blocks to be freed.
  399. assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
  400. @pytest.mark.parametrize("block_size", [1, 8])
  401. @pytest.mark.parametrize("sequence_len", [1, 16, 129])
  402. @pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
  403. @pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
  404. @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
  405. def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
  406. num_new_tokens: int,
  407. num_lookahead_slots: int,
  408. allocator_type: str):
  409. """Verify correct calculation of get_num_blocks_touched_by_append_slots.
  410. This is done by using copy-on-write, which requires any modified block to
  411. be copied before write if the refcount > 1. We set the refcount>1 by forking
  412. a sequence, then measure the free blocks before and after an append. If the
  413. number of consumed blocks equals what `get_num_blocks_touched_by_append_
  414. slots` returns, then the calculation is correct.
  415. """
  416. num_gpu_blocks = 1024
  417. allocator = CpuGpuBlockAllocator.create(
  418. allocator_type=allocator_type,
  419. num_gpu_blocks=num_gpu_blocks,
  420. num_cpu_blocks=0,
  421. block_size=block_size,
  422. )
  423. token_ids = list(range(sequence_len))
  424. token_ids_to_append = list(range(num_new_tokens))
  425. block_table = BlockTable(
  426. block_size=block_size,
  427. block_allocator=allocator,
  428. )
  429. block_table.allocate(token_ids=token_ids, device=Device.GPU)
  430. # Add lookahead before fork so both sequences have the same lookahead
  431. # blocks.
  432. block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
  433. # Fork sequence so that every block has refcount > 1.
  434. _ = block_table.fork()
  435. # Determine how many blocks should be touched.
  436. expected_num_touched_blocks = (
  437. block_table.get_num_blocks_touched_by_append_slots(
  438. token_ids=token_ids_to_append,
  439. num_lookahead_slots=num_lookahead_slots))
  440. # Measure how many blocks are touched by measuring num_free_blocks before
  441. # and after the append.
  442. #
  443. # We expect append_token_ids to CoW all mutated blocks that have refcount>1.
  444. num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
  445. block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
  446. num_consumed_blocks = (num_free_blocks_before_append -
  447. allocator.get_num_free_blocks(Device.GPU))
  448. # TODO(cade) ensure equality when num_lookahead_slots > 0.
  449. # The reason we have < is because lookahead blocks are not copied eagerly;
  450. # they are copied on first write. This will cause issues for beam search +
  451. # speculative decoding. This is acceptable for now as it is a large effort
  452. # to combine the two. To fix this, we can ensure single sequence ownership
  453. # of lookahead blocks by appending empty slots to each block, which will
  454. # trigger the CoW.
  455. #
  456. # Until then, we can accept that the consumed tokens are <= the expected
  457. # tokens when appending with lookahead.
  458. if num_lookahead_slots > 0:
  459. assert num_consumed_blocks <= expected_num_touched_blocks
  460. else:
  461. assert num_consumed_blocks == expected_num_touched_blocks