test_prefix_caching_block.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759
  1. import math
  2. import random
  3. from typing import List, Optional
  4. from unittest.mock import MagicMock
  5. import pytest
  6. from aphrodite.processing.block.interfaces import Block, BlockAllocator
  7. from aphrodite.processing.block.prefix_caching_block import (
  8. PrefixCachingBlock, PrefixCachingBlockAllocator)
  9. class TestPrefixCachingBlock:
  10. @staticmethod
  11. @pytest.mark.parametrize("seed", list(range(10)))
  12. @pytest.mark.parametrize("block_size", [1, 16])
  13. @pytest.mark.parametrize("is_curr_block_full", [True, False])
  14. def test_first_block_has_correct_content_hash(seed: int, block_size: int,
  15. is_curr_block_full: bool):
  16. """Verify a block which is first in the sequence has the correct hash.
  17. """
  18. random.seed(seed)
  19. num_to_fill = block_size if is_curr_block_full else random.randint(
  20. 0, block_size - 1)
  21. token_ids = list(range(num_to_fill))
  22. mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
  23. block_with_prev = PrefixCachingBlock(prev_block=None,
  24. token_ids=token_ids,
  25. block_size=block_size,
  26. allocator=mock_allocator)
  27. if is_curr_block_full:
  28. # Expect hash since block is full.
  29. assert block_with_prev.content_hash == (
  30. PrefixCachingBlock.hash_block_tokens(
  31. is_first_block=True,
  32. prev_block_hash=None,
  33. cur_block_token_ids=token_ids))
  34. else:
  35. # Do not expect hash since block is not full.
  36. assert block_with_prev.content_hash is None
  37. @staticmethod
  38. @pytest.mark.parametrize("seed", list(range(10)))
  39. @pytest.mark.parametrize("block_size", [1, 16])
  40. @pytest.mark.parametrize("is_curr_block_full", [True, False])
  41. @pytest.mark.parametrize("prev_block_has_hash", [True, False])
  42. def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
  43. is_curr_block_full: bool,
  44. prev_block_has_hash: bool):
  45. """Verify a block which is not first in the sequence has the correct
  46. hash.
  47. """
  48. random.seed(seed)
  49. previous_block = MagicMock(spec=PrefixCachingBlock)
  50. prev_block_hash = random.randint(0, 1000)
  51. previous_block.content_hash = (prev_block_hash
  52. if prev_block_has_hash else None)
  53. num_to_fill = block_size if is_curr_block_full else random.randint(
  54. 0, block_size - 1)
  55. token_ids = list(range(num_to_fill))
  56. mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
  57. block_with_prev = PrefixCachingBlock(
  58. prev_block=previous_block,
  59. token_ids=token_ids,
  60. block_size=block_size,
  61. allocator=mock_allocator,
  62. )
  63. if is_curr_block_full and prev_block_has_hash:
  64. # Expect hash since block is full and previous block has hash.
  65. assert (block_with_prev.content_hash ==
  66. PrefixCachingBlock.hash_block_tokens(
  67. is_first_block=False,
  68. prev_block_hash=prev_block_hash,
  69. cur_block_token_ids=token_ids))
  70. else:
  71. # Do not expect hash since block is not full or the previous block
  72. # does not have a hash.
  73. assert block_with_prev.content_hash is None
  74. @staticmethod
  75. @pytest.mark.parametrize("block_size", [1, 2, 16])
  76. @pytest.mark.parametrize("num_tokens", list(range(3)))
  77. @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
  78. def test_blocks_have_correct_hash_in_chain(block_size: int,
  79. num_tokens: int,
  80. num_empty_trailing_blocks: int):
  81. """Create two chains of logical blocks with the same contents.
  82. Assert the hashes are equal.
  83. """
  84. random.seed(0)
  85. token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
  86. first_chain, second_chain = [
  87. TestPrefixCachingBlock.create_chain(
  88. block_size=block_size,
  89. token_ids=token_ids,
  90. num_empty_trailing_blocks=num_empty_trailing_blocks)
  91. for _ in range(2)
  92. ]
  93. for first_chain_block, second_chain_block in zip(
  94. first_chain, second_chain):
  95. assert (first_chain_block.content_hash ==
  96. second_chain_block.content_hash)
  97. if not first_chain or not second_chain:
  98. assert first_chain == second_chain
  99. assert num_tokens == 0
  100. @staticmethod
  101. def create_chain(block_size: int,
  102. token_ids: List[int],
  103. num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
  104. """Helper method which creates a chain of blocks.
  105. """
  106. blocks: List[PrefixCachingBlock] = []
  107. num_blocks = math.ceil(
  108. len(token_ids) / block_size) + num_empty_trailing_blocks
  109. if num_blocks == 0:
  110. return []
  111. allocator = MagicMock(spec=PrefixCachingBlockAllocator)
  112. prev_block = None
  113. for block_number in range(0, num_blocks):
  114. prev_block = PrefixCachingBlock(
  115. prev_block=prev_block,
  116. token_ids=[],
  117. block_size=block_size,
  118. allocator=allocator,
  119. )
  120. tokens_to_append = token_ids[block_number *
  121. block_size:(block_number + 1) *
  122. block_size]
  123. if tokens_to_append:
  124. prev_block.append_token_ids(tokens_to_append)
  125. blocks.append(prev_block)
  126. return blocks
  127. class TestPrefixCachingBlockAllocator:
  128. @staticmethod
  129. def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
  130. prev_block: Optional[Block],
  131. token_ids: List[int]):
  132. if allocate_type == "immutable":
  133. allocate_block = lambda: allocator.allocate_immutable_block(
  134. prev_block=prev_block, token_ids=token_ids)
  135. elif allocate_type == "mutable":
  136. allocate_block = lambda: allocator.allocate_mutable_block(
  137. prev_block=prev_block)
  138. else:
  139. raise ValueError()
  140. return allocate_block
  141. @staticmethod
  142. @pytest.mark.parametrize("num_blocks", [1, 1024])
  143. @pytest.mark.parametrize("block_size", [1, 16])
  144. def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
  145. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  146. block_size=block_size)
  147. allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
  148. allocate_type="mutable",
  149. allocator=allocator,
  150. prev_block=None,
  151. token_ids=list(range(block_size)),
  152. )
  153. [allocate_block() for _ in range(num_blocks)]
  154. with pytest.raises(BlockAllocator.NoFreeBlocksError):
  155. allocate_block()
  156. @staticmethod
  157. @pytest.mark.parametrize("num_blocks", [1, 1024])
  158. @pytest.mark.parametrize("block_size", [1, 16])
  159. def test_allocate_immutable_does_not_oom_single_hash(
  160. num_blocks: int, block_size: int):
  161. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  162. block_size=block_size)
  163. allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
  164. allocate_type="immutable",
  165. allocator=allocator,
  166. prev_block=None,
  167. token_ids=list(range(block_size)),
  168. )
  169. blocks = [allocate_block() for _ in range(num_blocks)]
  170. # Expect no OOM. If these were mutable blocks, this would OOM.
  171. non_oom_block = allocate_block()
  172. # Expect all blocks to have same physical block index.
  173. for block in blocks:
  174. assert (block.block_id == non_oom_block.block_id)
  175. @staticmethod
  176. @pytest.mark.parametrize("num_blocks", [1, 1024])
  177. @pytest.mark.parametrize("block_size", [1, 16])
  178. def test_allocate_immutable_ooms_many_hash(num_blocks: int,
  179. block_size: int):
  180. """Consume all blocks using many different hashes/block content.
  181. Do this by creating a sequence that is very long.
  182. Expect next block to OOM.
  183. """
  184. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  185. block_size=block_size)
  186. # Create token ids that will exhaust all blocks.
  187. token_ids = list(range(num_blocks * block_size))
  188. chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  189. block_size=block_size,
  190. token_ids=token_ids,
  191. allocator=allocator,
  192. )
  193. # Expect allocation with unseen hash to fail.
  194. with pytest.raises(BlockAllocator.NoFreeBlocksError):
  195. allocator.allocate_immutable_block(prev_block=chain[-1],
  196. token_ids=list(
  197. range(block_size)))
  198. # Expect mutable allocation to fail.
  199. with pytest.raises(BlockAllocator.NoFreeBlocksError):
  200. allocator.allocate_mutable_block(prev_block=chain[-1])
  201. # Expect allocation of exact same chain to pass.
  202. second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  203. block_size=block_size,
  204. token_ids=token_ids,
  205. allocator=allocator,
  206. )
  207. # Expect physical block indices to be the same in both chains.
  208. assert chain and second_chain
  209. for first_chain_block, second_chain_block in zip(chain, second_chain):
  210. assert (first_chain_block.block_id == second_chain_block.block_id)
  211. @staticmethod
  212. @pytest.mark.parametrize("num_blocks", [1, 1024])
  213. @pytest.mark.parametrize("block_size", [1, 16])
  214. def test_free_prevents_oom(num_blocks: int, block_size: int):
  215. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  216. block_size=block_size)
  217. # Create token ids that will exhaust all blocks.
  218. token_ids = list(range(num_blocks * block_size))
  219. chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  220. block_size=block_size,
  221. token_ids=token_ids,
  222. allocator=allocator,
  223. )
  224. # Expect mutable allocation to fail.
  225. with pytest.raises(BlockAllocator.NoFreeBlocksError):
  226. allocator.allocate_mutable_block(prev_block=None)
  227. block_to_free = chain[-1]
  228. # Expect free/allocate loop to succeed many times.
  229. for i in range(100):
  230. block_id = block_to_free.block_id
  231. allocator.free(block_to_free)
  232. assert block_to_free.block_id is None, i
  233. new_block = allocator.allocate_mutable_block(prev_block=None)
  234. assert new_block.block_id == block_id, i
  235. with pytest.raises(BlockAllocator.NoFreeBlocksError):
  236. allocator.allocate_mutable_block(prev_block=None)
  237. block_to_free = new_block
  238. @staticmethod
  239. @pytest.mark.parametrize("num_blocks", [1024])
  240. @pytest.mark.parametrize("block_size", [16])
  241. @pytest.mark.parametrize("seed", list(range(20)))
  242. def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
  243. random.seed(seed)
  244. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  245. block_size=block_size)
  246. num_blocks_to_consume = random.randint(1, num_blocks - 1)
  247. # Create token ids that will exhaust all blocks.
  248. token_ids = list(range(num_blocks_to_consume * block_size))
  249. chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  250. block_size=block_size,
  251. token_ids=token_ids,
  252. allocator=allocator,
  253. )
  254. # Free each block in chain, assert num free blocks includes new free
  255. # block.
  256. for i, block in enumerate(chain):
  257. assert allocator.get_num_free_blocks() == (num_blocks -
  258. num_blocks_to_consume +
  259. i)
  260. allocator.free(block)
  261. @staticmethod
  262. @pytest.mark.parametrize("num_blocks", [4])
  263. @pytest.mark.parametrize("block_size", [8])
  264. def test_prefix_caching_block_get_num_blocks_touched(
  265. num_blocks, block_size):
  266. """ Verify the allocator can correctly return the number of
  267. blocks touched, when there are cached prefixes and different
  268. lookahead slots.
  269. """
  270. allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  271. block_size=block_size)
  272. allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  273. block_size=block_size)
  274. # Create token ids that will exhaust all blocks except the last
  275. token_ids = list(range((num_blocks - 1) * block_size))
  276. # Create a chain of cacheable blocks in the dst
  277. cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
  278. block_size=block_size,
  279. token_ids=token_ids,
  280. allocator=allocator_dst,
  281. )
  282. # Create a chain of the same blocks in the src
  283. blocks_to_swap_in = \
  284. TestPrefixCachingBlockAllocator.create_immutable_chain(
  285. block_size=block_size,
  286. token_ids=token_ids,
  287. allocator=allocator_src,
  288. )
  289. # All blocks are cached
  290. assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 0
  291. # Free the first block in the dst
  292. allocator_dst.free(cached_blocks[0])
  293. # Now the first block becomes dangling, the swapped blocks need
  294. # to reclaim the first block in the dst
  295. assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 1
  296. # Insert one non-full block in the src
  297. non_full_block = allocator_src.allocate_mutable_block(
  298. blocks_to_swap_in[-1])
  299. non_full_block.append_token_ids([0])
  300. blocks_to_swap_in.append(non_full_block)
  301. assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in,
  302. num_lookahead_slots=1) == 2
  303. assert allocator_dst.get_num_blocks_touched(
  304. blocks_to_swap_in, num_lookahead_slots=block_size - 1) == 2
  305. assert allocator_dst.get_num_blocks_touched(
  306. blocks_to_swap_in, num_lookahead_slots=block_size) == 3
  307. @staticmethod
  308. @pytest.mark.parametrize("num_blocks", [1024])
  309. @pytest.mark.parametrize("block_size", [16])
  310. @pytest.mark.parametrize("seed", list(range(20)))
  311. def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
  312. seed: int):
  313. """Verify sharing occurs by allocating two sequences that share prefixes
  314. and incrementally freeing blocks.
  315. """
  316. random.seed(seed)
  317. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  318. block_size=block_size)
  319. num_blocks_to_consume = random.randint(1, num_blocks - 1)
  320. # Create token ids that will exhaust all blocks.
  321. token_ids = list(range(num_blocks_to_consume * block_size))
  322. first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  323. block_size=block_size,
  324. token_ids=token_ids,
  325. allocator=allocator,
  326. )
  327. second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  328. block_size=block_size,
  329. token_ids=token_ids,
  330. allocator=allocator,
  331. )
  332. # Free each block in the first chain. Since all blocks are shared, the
  333. # free count should stay constant.
  334. for i, block in enumerate(first_chain):
  335. assert allocator.get_num_free_blocks() == (num_blocks -
  336. num_blocks_to_consume)
  337. allocator.free(block)
  338. # Free each block in the second chain. Since the refcount is now zero,
  339. # the free count should increment with each free.
  340. for i, block in enumerate(second_chain):
  341. assert allocator.get_num_free_blocks() == (num_blocks -
  342. num_blocks_to_consume +
  343. i)
  344. allocator.free(block)
  345. @staticmethod
  346. @pytest.mark.parametrize("num_blocks", [1024])
  347. @pytest.mark.parametrize("block_size", [16])
  348. @pytest.mark.parametrize("seed", list(range(20)))
  349. def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
  350. seed: int):
  351. """Verify get_common_computed_block_ids could get correct result
  352. by create two immutable chain sharing prefix at specified pos,
  353. and compare whether we also could get right result
  354. from get_common_computed_block_ids.
  355. """
  356. random.seed(seed)
  357. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
  358. block_size=block_size)
  359. num_blocks_to_consume = random.randint(1, num_blocks - 1)
  360. # Create token ids that will exhaust all blocks.
  361. token_ids = list(range(num_blocks_to_consume * block_size))
  362. first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  363. block_size=block_size,
  364. token_ids=token_ids,
  365. allocator=allocator,
  366. )
  367. # After zero_point, second_chain's token_ids would be set -1, which
  368. # make it different from here comparing with first_chain
  369. zero_point = random.randint(1, len(token_ids) - 1)
  370. zero_point_blocks = zero_point // block_size
  371. token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
  372. second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  373. block_size=block_size,
  374. token_ids=token_ids,
  375. allocator=allocator,
  376. )
  377. first_computed_ids = [
  378. first_chain[i].block_id for i in range(num_blocks_to_consume)
  379. ]
  380. second_computed_ids = [
  381. second_chain[i].block_id for i in range(num_blocks_to_consume)
  382. ]
  383. res = allocator.get_common_computed_block_ids(
  384. [first_computed_ids, second_computed_ids])
  385. assert (len(res) == zero_point_blocks)
  386. # Test case that assume those prompted block after first immutable would
  387. # be freed into hashless allocator, while first immutable block get ref
  388. # increased.
  389. @staticmethod
  390. @pytest.mark.parametrize("num_blocks", [3])
  391. @pytest.mark.parametrize("block_size", [16])
  392. @pytest.mark.parametrize("seed", list(range(10)))
  393. def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
  394. random.seed(seed)
  395. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  396. block_size=block_size)
  397. token_ids = list(range(block_size))
  398. block = allocator.allocate_immutable_block(prev_block=None,
  399. token_ids=token_ids)
  400. assert allocator._refcounter.get(block.block_id) == 1
  401. m = allocator.allocate_mutable_block(prev_block=None)
  402. block_id = m.block_id
  403. for i in range(block_size):
  404. m.append_token_ids([i])
  405. # After block get promoted to immutable from mutable, if there is
  406. # already same content hash block, then it shall be released into
  407. # hashless_allocator
  408. # And first immutable block's ref get increased by 1
  409. assert m.block_id == block.block_id
  410. assert block_id in allocator._hashless_allocator._free_block_indices
  411. assert allocator._refcounter.get(block.block_id) == 2
  412. # Test case when eviction and allocation are mixed,
  413. # make sure they work as expected
  414. @staticmethod
  415. @pytest.mark.parametrize("num_blocks", [3])
  416. @pytest.mark.parametrize("block_size", [16])
  417. @pytest.mark.parametrize("seed", list(range(10)))
  418. def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
  419. random.seed(seed)
  420. all_blocks_list = [i for i in range(num_blocks)]
  421. zero_ref = {i: 0 for i in range(num_blocks)}
  422. one_ref = {i: 1 for i in range(num_blocks)}
  423. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  424. block_size=block_size)
  425. token_ids = list(range(num_blocks * block_size))
  426. # Verify initial/pre-alloc state
  427. # Ensure all blocks are free inside hashless allocator
  428. assert list(allocator._hashless_allocator._free_block_indices
  429. ) == all_blocks_list
  430. # Ensure no tracked blocks
  431. assert len(allocator._block_tracker.keys()) == num_blocks
  432. for block_id in range(num_blocks):
  433. assert not allocator._block_tracker[block_id].active
  434. # Ensure no cached blocks
  435. assert len(allocator._cached_blocks.values()) == 0
  436. # Ensure no evicted blocks
  437. assert len(allocator.evictor.free_table.keys()) == 0
  438. # Ensure 0s ref counts for all blocks
  439. assert allocator._refcounter._refcounts == zero_ref
  440. # Allocate immutable chains with only one block residuled in
  441. new_block = []
  442. for i in range(num_blocks):
  443. block = allocator.allocate_immutable_block(
  444. prev_block=None,
  445. token_ids=token_ids[block_size * i:block_size * (i + 1)])
  446. new_block.append(block)
  447. # Verify post-alloc state
  448. # Ensure no blocks are free inside hashless allocator
  449. assert (len(allocator._hashless_allocator._free_block_indices) == 0)
  450. # Ensure all blocks are tracked
  451. assert len(allocator._block_tracker.keys()) == num_blocks
  452. for block_id in range(num_blocks):
  453. assert allocator._block_tracker[block_id].active
  454. # Ensure all blocks are cached (all promoted)
  455. assert len(allocator._cached_blocks.values()) == num_blocks
  456. # Ensure no evicted blocks
  457. assert len(allocator.evictor.free_table.keys()) == 0
  458. # Ensure 1s ref counts for all blocks
  459. assert allocator._refcounter._refcounts == one_ref
  460. # Free all blocks, and now all blocks shall be in the evictor
  461. # there shall be no tracking data left in _block_tracker
  462. # all blocks shall be tracked in _cached_blocks
  463. # all blocks' ref shall be zero
  464. for block in new_block:
  465. allocator.free(block)
  466. # Verify post-free state
  467. # Ensure no tracked blocks
  468. assert len(allocator._block_tracker.keys()) == num_blocks
  469. for block_id in range(num_blocks):
  470. assert not allocator._block_tracker[block_id].active
  471. # Ensure no blocks in hashless allocator (all promoted)
  472. assert len(allocator._hashless_allocator._free_block_indices) == 0
  473. # Ensure all blocks are cached
  474. assert list(allocator._cached_blocks.values()) == all_blocks_list
  475. # Ensure all blocks are inside the evictor
  476. assert list(allocator.evictor.free_table.keys()) == all_blocks_list
  477. # Ensure 0s refcounts
  478. assert allocator._refcounter._refcounts == zero_ref
  479. # Allocate a mutable block, and the first block shall be evicted
  480. # and set its content hash into None, ref to 1
  481. mutable = allocator.allocate_mutable_block(prev_block=None)
  482. assert mutable.block_id == 0
  483. assert mutable.content_hash is None
  484. assert allocator._block_tracker[0].active
  485. assert allocator._refcounter.get(0) == 1
  486. assert 0 not in allocator._cached_blocks
  487. assert 0 not in allocator.evictor
  488. # Since this mutable block has no hash yet, it shall be released into
  489. # hashless allocator
  490. allocator.free(mutable)
  491. assert not allocator._block_tracker[0].active
  492. assert allocator._refcounter._refcounts == zero_ref
  493. assert 0 not in allocator._cached_blocks
  494. assert 0 not in allocator.evictor
  495. assert 0 in allocator._hashless_allocator._free_block_indices
  496. # When allocate immutable with first block_size tokens, we
  497. # shall get free block from hashless allocator, thus no block left
  498. # in hashless
  499. block = allocator.allocate_immutable_block(
  500. prev_block=None, token_ids=token_ids[:block_size])
  501. assert block.block_id == 0
  502. assert len(allocator._hashless_allocator._free_block_indices) == 0
  503. assert allocator._block_tracker[0].active
  504. assert 0 in allocator._cached_blocks.values()
  505. assert allocator._refcounter.get(0) == 1
  506. assert 0 not in allocator.evictor
  507. # allocate mutable block again, it shall be popped from evictor
  508. mutable = allocator.allocate_mutable_block(prev_block=None)
  509. assert len(allocator._hashless_allocator._free_block_indices) == 0
  510. assert mutable.block_id not in allocator.evictor.free_table
  511. assert allocator._refcounter.get(mutable.block_id) == 1
  512. # Test case where two last accessed times are equal
  513. @staticmethod
  514. @pytest.mark.parametrize("num_blocks", [1024])
  515. @pytest.mark.parametrize("block_size", [16])
  516. @pytest.mark.parametrize("seed", list(range(20)))
  517. def test_eviction_order(num_blocks: int, block_size: int, seed: int):
  518. """This test case simulate the two chain created and free in order,
  519. and together they would exhaust the initial freed blocks.
  520. So the next block created after those two chain shall use the block
  521. from the first chain as that block has long access time.
  522. While first chain has two blocks, it shall pick up the last one, as
  523. it has larger token number.
  524. """
  525. random.seed(seed)
  526. allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
  527. block_size=block_size)
  528. num_blocks_to_consume = num_blocks + 1
  529. token_ids = list(range(num_blocks_to_consume * block_size))
  530. num_blocks_in_first_chain = 2
  531. num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
  532. # First chain takes the first block
  533. first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  534. block_size=block_size,
  535. token_ids=token_ids[:num_tokens_in_first_chain],
  536. allocator=allocator,
  537. )
  538. # There should only be one block allocated at this point
  539. assert allocator.get_num_free_blocks() == (num_blocks -
  540. num_blocks_in_first_chain)
  541. # Set the last accessed time of the first block to 1
  542. blocks_ids = [block.block_id for block in first_chain]
  543. allocator.mark_blocks_as_accessed(blocks_ids, 1)
  544. # Second chain takes the rest of the blocks
  545. second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
  546. block_size=block_size,
  547. token_ids=token_ids[num_tokens_in_first_chain:-block_size],
  548. allocator=allocator,
  549. )
  550. # There shouldn't be any blocks left at this point
  551. assert allocator.get_num_free_blocks() == (0)
  552. assert len(first_chain) == num_blocks_in_first_chain
  553. last_block_id = first_chain[-1].block_id
  554. # Free each block in the first chain.
  555. for i, block in enumerate(first_chain):
  556. allocator.free(block)
  557. # Set the last accessed time on all of the blocks in the second chain
  558. # to 2
  559. blocks_ids = [block.block_id for block in second_chain]
  560. allocator.mark_blocks_as_accessed(blocks_ids, 2)
  561. # Free each block in the second chain.
  562. for i, block in enumerate(second_chain):
  563. allocator.free(block)
  564. # Allocate a new block and check that it's the least recently used block
  565. # from the first chain.
  566. new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
  567. block_size=block_size,
  568. token_ids=token_ids[-block_size:],
  569. allocator=allocator,
  570. )
  571. assert new_block[0].block_id == last_block_id
  572. # Test case for cache mertics
  573. @staticmethod
  574. def test_metric():
  575. block_size = 16
  576. allocator = PrefixCachingBlockAllocator(num_blocks=4,
  577. block_size=block_size)
  578. # Test when no query (0/0)
  579. assert allocator.get_prefix_cache_hit_rate() == 0.0
  580. token_ids = list(range(block_size))
  581. allocator.allocate_immutable_block(prev_block=None,
  582. token_ids=token_ids)
  583. # Test 0/1 hit rate
  584. assert allocator.get_prefix_cache_hit_rate() == 0.0
  585. allocator.allocate_immutable_block(prev_block=None,
  586. token_ids=token_ids)
  587. # Test 1/2 hit rate
  588. assert allocator.get_prefix_cache_hit_rate() == 0.5
  589. # Test more than one block
  590. for _ in range(2, 1005):
  591. allocator.allocate_immutable_block(prev_block=None,
  592. token_ids=token_ids)
  593. assert allocator.get_prefix_cache_hit_rate() > 0.99
  594. # Test case for marking cache hit blocks as computed right after
  595. # a batch of prefill sequences are scheduled.
  596. @staticmethod
  597. def test_touch_block():
  598. block_size = 16
  599. common_blocks = 4
  600. allocator = PrefixCachingBlockAllocator(num_blocks=8,
  601. block_size=block_size)
  602. common_token_ids = list(range(block_size * common_blocks))
  603. # Mimic the behavior of allocating the same block chain
  604. # (i.e., common prefix) for a batch of 3 different prefill sequences.
  605. for _ in range(3):
  606. blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
  607. block_size=block_size,
  608. token_ids=common_token_ids,
  609. allocator=allocator,
  610. )
  611. block_ids = [block.block_id for block in blocks]
  612. # The allocated blocks should be marked as touched
  613. # but not computed.
  614. computed_block_ids = allocator.get_computed_block_ids(
  615. [], block_ids, skip_last_block_id=False)
  616. assert len(computed_block_ids) == 0
  617. allocator.mark_blocks_as_computed([])
  618. computed_block_ids = allocator.get_computed_block_ids(
  619. [], block_ids, skip_last_block_id=False)
  620. assert len(computed_block_ids) == common_blocks
  621. @staticmethod
  622. def create_immutable_chain(
  623. block_size: int,
  624. token_ids: List[int],
  625. allocator: PrefixCachingBlockAllocator,
  626. ) -> List[PrefixCachingBlock]:
  627. """Helper method which creates a chain of blocks.
  628. """
  629. blocks: List[Block] = []
  630. num_blocks = math.ceil(len(token_ids) / block_size)
  631. if num_blocks == 0:
  632. return []
  633. prev_block = None
  634. for block_number in range(0, num_blocks):
  635. block_token_ids = token_ids[block_number *
  636. block_size:(block_number + 1) *
  637. block_size]
  638. prev_block = allocator.allocate_immutable_block(
  639. prev_block=prev_block, token_ids=block_token_ids)
  640. blocks.append(prev_block)
  641. return blocks