import time
from collections import defaultdict
from typing import List

import pytest

from aphrodite import SamplingParams
from aphrodite.common.block import PhysicalTokenBlock
from aphrodite.common.sequence import (Logprob, Sequence, SequenceGroup,
                                       SequenceStatus)
from aphrodite.common.utils import Device
from aphrodite.processing.block.utils import (
    STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, STR_NOT_IMPL_ENC_DEC_SWA)
from aphrodite.processing.block_manager_v1 import (BlockSpaceManagerV1,
                                                   UncachedBlockAllocator)
from aphrodite.processing.interfaces import AllocStatus

from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder
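
# The prompt-building helpers come from the local .utils module, which is not
# shown here. A rough sketch of what they are assumed to provide (signatures
# inferred from how the tests below call them, not the actual .utils code):
#
#   create_dummy_prompt(request_id, prompt_length, block_size=None)
#       -> (Sequence, SequenceGroup) with prompt_length dummy token ids.
#   create_dummy_prompt_encoder_decoder(request_id, decoder_prompt_length,
#                                       encoder_prompt_length)
#       -> (decoder Sequence, encoder Sequence, SequenceGroup).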


def test_block_allocator_allocate():
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # Allocate all available cpu blocks.
    num_free = num_cpu_blocks
    assert cpu_allocator.get_num_free_blocks() == num_free
    for _ in range(num_cpu_blocks):
        block = cpu_allocator.allocate()
        num_free -= 1

        assert block not in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == num_free

    # No free blocks left; further allocation raises.
    with pytest.raises(ValueError):
        cpu_allocator.allocate()


def test_block_allocator_free():
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # Allocate all available cpu blocks.
    blocks: List[PhysicalTokenBlock] = []
    for _ in range(num_cpu_blocks):
        block = cpu_allocator.allocate()
        blocks.append(block)

        assert block not in cpu_allocator.free_blocks

    # Free all allocated cpu blocks.
    num_free = 0
    assert cpu_allocator.get_num_free_blocks() == num_free
    for block in blocks:
        cpu_allocator.free(block)
        num_free += 1

        assert block in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == num_free

    # Freeing an already-free block raises.
    with pytest.raises(ValueError):
        cpu_allocator.free(block)


def test_allocate():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate a distinct sequence group for each available gpu block.
    for i in range(num_gpu_blocks):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK

    # Repeat, using the watermark to keep one gpu block in reserve.
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=1 / num_gpu_blocks)
    for i in range(num_gpu_blocks - 1):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
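

# The watermark expresses a fraction of total GPU blocks to hold back; with
# watermark=1 / num_gpu_blocks the manager keeps roughly
# watermark * num_gpu_blocks = 1 block in reserve, so only num_gpu_blocks - 1
# single-block groups fit before can_allocate() stops returning OK.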


def test_allocate_encoder_decoder():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_req_per_seq_group = 2
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate a distinct encoder/decoder sequence group for each pair of
    # available gpu blocks (each group needs one decoder and one cross block).
    for i in range(num_gpu_blocks // block_req_per_seq_group):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            str(i),
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK

    # Repeat, using the watermark to keep one gpu block in reserve.
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=1 / num_gpu_blocks)
    for i in range((num_gpu_blocks - 1) // block_req_per_seq_group):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            str(i),
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK


def test_allocate_encoder_decoder_fails_with_swa():
    # SWA is short for sliding window attention.
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        sliding_window=5)  # SWA

    # Build an encoder/decoder sequence group.
    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # Assert that can_allocate() fails due to SWA.
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.can_allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA

    # Assert that allocate() fails due to SWA.
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA


def test_allocate_encoder_decoder_fails_with_prefix_caching():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        enable_caching=True)  # Prefix cache

    # Build an encoder/decoder sequence group.
    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # Assert that can_allocate() fails due to prefix caching.
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.can_allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE

    # Assert that allocate() fails due to prefix caching.
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE


def test_append_slot_single_seq():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate single seq to gpu block.
    prompt, seq_group = create_dummy_prompt("1", block_size)
    block_manager.allocate(seq_group)

    # Nothing to append. Sequence has no new logical blocks.
    assert block_manager.can_append_slots(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()
    assert not block_manager.append_slots(prompt)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_blocks == after_blocks

    # Add block_size number of new tokens and append slot.
    for i in range(block_size):
        token_id = i + 5
        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    assert block_manager.can_append_slots(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()
    assert not block_manager.append_slots(prompt)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_blocks - after_blocks == 1
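

# Note: append_slots() returns the list of copy-on-write (src, dst) pairs, so
# `assert not block_manager.append_slots(prompt)` checks that no COW was
# scheduled, not that nothing happened; the second call above still allocates
# one fresh block for the new tokens, as the free-block delta shows.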


def test_append_slot_cow():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_cpu_blocks=num_cpu_blocks,
                                        num_gpu_blocks=num_gpu_blocks,
                                        watermark=0)

    # Allocate prompt to gpu block. There is one slot left in the block.
    prompt = Sequence(seq_id=1,
                      inputs={
                          "prompt": "one two three",
                          "prompt_token_ids": [1, 2, 3],
                      },
                      block_size=block_size)

    # Fork the sequence, such that a COW will be required when we append a new
    # token id.
    child = prompt.fork(new_seq_id=2)

    # Allocate space for the sequence group.
    seq_group = SequenceGroup(request_id="1",
                              seqs=[prompt, child],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams())
    block_manager.allocate(seq_group)

    # Fork and append a new token id. We expect a COW to be scheduled.
    token_id = 4
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.fork(prompt, child)

    assert block_manager.can_append_slots(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()

    cows = block_manager.append_slots(child)
    assert cows
    dict_cows = defaultdict(list)
    for src_block, dst_block in cows:
        dict_cows[src_block].append(dst_block)
    for src_block, dst_blocks in dict_cows.items():
        assert src_block not in dst_blocks

    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_blocks - after_blocks == 1
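

# A sketch of how a caller might consume the COW pairs returned by
# append_slots(); `cache_engine.copy` is an assumed consumer, shown for
# illustration only, not the actual scheduler code:
#
#   blocks_to_copy = block_manager.append_slots(child)
#   if blocks_to_copy:
#       cache_engine.copy(blocks_to_copy)  # duplicate src blocks into dst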


def test_fork():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1",
                                            block_size - 1,
                                            block_size=block_size)
    block_manager.allocate(seq_group)

    # Fork prompt and copy block tables.
    child = prompt.fork(2)
    block_manager.fork(prompt, child)
    assert block_manager.get_block_table(
        prompt) == block_manager.get_block_table(child)

    token_id = 4
    # Append token to child. Block is shared so copy on write occurs.
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slots(child)
    assert block_manager.get_block_table(
        prompt) != block_manager.get_block_table(child)


def test_swap():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap seq group from GPU -> CPU.
    gpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_out(seq_group)
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_out(seq_group)
    assert [x[0] for x in mapping] == gpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
    prompt.status = SequenceStatus.SWAPPED

    # Swap seq group from CPU -> GPU.
    cpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_in(seq_group)
    assert [x[0] for x in mapping] == cpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
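

# The mapping returned by swap_out()/swap_in() is a list of
# (src_block_number, dst_block_number) pairs: GPU -> CPU block numbers on
# swap-out and CPU -> GPU on swap-in. That is why the asserts above compare
# the first element of each pair against the pre-swap block table.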


def test_swap_encoder_decoder():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    decoder_prompt, encoder_prompt, seq_group = \
        create_dummy_prompt_encoder_decoder(
            "1",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
    decoder_prompt.status = SequenceStatus.WAITING
    encoder_prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    decoder_prompt.status = SequenceStatus.RUNNING
    decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap encoder/decoder seq group from GPU -> CPU.
    decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt)
    cross_gpu_blocks = block_manager.get_cross_block_table(seq_group)
    gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks
    assert block_manager.can_swap_out(seq_group)
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_out(seq_group)
    assert [x[0] for x in mapping] == gpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
    decoder_prompt.status = SequenceStatus.SWAPPED

    # Swap encoder/decoder seq group from CPU -> GPU.
    decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt)
    cross_cpu_blocks = block_manager.get_cross_block_table(seq_group)
    cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_in(seq_group)
    assert [x[0] for x in mapping] == cpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
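

# Unlike decoder-only groups, encoder/decoder groups also carry a cross block
# table (the encoder prompt's KV blocks), looked up by sequence group rather
# than by individual sequence. It is swapped alongside the decoder blocks,
# which is why gpu_blocks and cpu_blocks above concatenate both tables.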


def test_free():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1", block_size)
    block_manager.allocate(seq_group)

    # Free allocated seq.
    prompt_blocks = len(block_manager.get_block_table(prompt))
    before_blocks = block_manager.get_num_free_gpu_blocks()
    block_manager.free(prompt)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert after_blocks == before_blocks + prompt_blocks

    # Block table for freed seq is deleted.
    with pytest.raises(KeyError):
        block_manager.get_block_table(prompt)


def test_free_encoder_decoder():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    decoder_prompt, encoder_prompt, seq_group = \
        create_dummy_prompt_encoder_decoder(
            "1",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
    block_manager.allocate(seq_group)

    # Free allocated seq.
    decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt))
    encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group))
    prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks
    before_blocks = block_manager.get_num_free_gpu_blocks()
    block_manager.free(decoder_prompt)
    block_manager.free_cross(seq_group)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert after_blocks == before_blocks + prompt_blocks

    # Block tables for the freed decoder and encoder seqs are both deleted.
    with pytest.raises(KeyError):
        block_manager.get_block_table(decoder_prompt)
    with pytest.raises(KeyError):
        block_manager.get_block_table(encoder_prompt)


def test_reset():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate a distinct seq group for each available gpu block.
    original_blocks = block_manager.get_num_free_gpu_blocks()
    for i in range(num_gpu_blocks):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        block_manager.allocate(seq_group)
    assert block_manager.get_num_free_gpu_blocks() == 0

    # Resetting block manager frees all allocated blocks.
    block_manager.reset()
    assert block_manager.get_num_free_gpu_blocks() == original_blocks


def test_reset_encoder_decoder():
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_req_per_seq_group = 2
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate a distinct encoder/decoder seq group for each pair of
    # available gpu blocks.
    original_blocks = block_manager.get_num_free_gpu_blocks()
    for i in range(num_gpu_blocks // block_req_per_seq_group):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            f"{i}",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        block_manager.allocate(seq_group)
    assert block_manager.get_num_free_gpu_blocks() == 0

    # Resetting block manager frees all allocated blocks.
    block_manager.reset()
    assert block_manager.get_num_free_gpu_blocks() == original_blocks


def test_sliding_window_multi_seq():
    """
    Tests that memory allocation and deallocation are handled
    correctly with multiple sequences that exceed the sliding
    window's capacity.
    """
    block_size = 1
    num_cpu_blocks = 8
    num_gpu_blocks = 8
    sliding_window = 2
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        sliding_window=sliding_window,
                                        watermark=0)

    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks

    parent = Sequence(seq_id=1,
                      inputs={
                          "prompt": "one two three",
                          "prompt_token_ids": [0, 1, 2],
                      },
                      block_size=block_size)
    seq_group = SequenceGroup(request_id="1",
                              seqs=[parent],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams(),
                              lora_request=None)
    block_manager.allocate(seq_group)

    # Assert the number of blocks allocated is correct:
    # the parent seq has len 3, but since sliding_window is 2,
    # we will use at most 2 blocks.
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window

    # Fork prompt and copy block tables.
    child = parent.fork(2)
    block_manager.fork(parent, child)

    # Assert the number of blocks allocated is correct:
    # forking does not increase memory consumption.
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window

    # Assert both parent and child share all blocks.
    assert block_manager.get_block_table(
        parent) == block_manager.get_block_table(child)

    token_id = 4
    # Append token to child. Block is shared so copy on write occurs.
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slots(child)

    # Assert the number of blocks allocated is correct:
    # we now use one more block. Each seq uses 2 blocks,
    # but only one can be shared.
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window - 1

    token_id = 5
    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slots(parent)

    # Assert the number of blocks allocated is correct:
    # no change, because both sequences are still just sharing one block.
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window - 1

    block_table_parent = block_manager.get_block_table(parent)
    block_table_child = block_manager.get_block_table(child)
    assert block_table_parent != block_table_child

    # Assert both seqs share the second-to-last block.
    assert block_table_parent[-2] == block_table_child[-2]

    # Now let's clean up...
    block_manager.free(parent)

    # Assert the number of blocks allocated is correct:
    # we have freed one seq, reducing the ref count of two blocks by one.
    # One of the two was only used by the parent seq, so it is now free.
    # The child seq still consumes sliding_window blocks.
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window

    # Free all blocks.
    block_manager.free(child)

    # Assert all blocks are free now.
    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
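

# Worked block accounting for the test above (block_size=1, sliding_window=2,
# derived directly from the asserts):
#   allocate(parent, 3 tokens) -> 2 blocks used (window caps usage at 2)
#   fork + block_manager.fork  -> +0 blocks (tables fully shared)
#   child appends a token      -> +1 block (COW splits one shared block)
#   parent appends a token     -> +0 blocks (one block still shared)
#   free(parent)               -> -1 block (its unshared block is released)
#   free(child)                -> -2 blocks (everything free again)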


def test_mark_blocks_as_computed_with_prefix_cache_and_chunked_prefill():
    """When prefix cache and chunked prefill are enabled, the block manager
    should only mark a chunk of blocks as computed instead of all blocks.
    """
    block_size = 4
    num_cpu_blocks = 0
    num_gpu_blocks = 16
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_gpu_blocks,
                                        num_cpu_blocks,
                                        watermark=0,
                                        enable_caching=True)

    # Set prompt size to have num_gpu_blocks - 1 full blocks.
    prompt_length = block_size * num_gpu_blocks - 1

    # Allocate (reserve) all blocks.
    _, seq_group = create_dummy_prompt("0",
                                       prompt_length,
                                       block_size=block_size)
    block_manager.allocate(seq_group)
    assert seq_group.seqs[0].n_blocks == num_gpu_blocks

    # 1st chunk: compute 2.5 blocks. Should mark 2 blocks as computed.
    token_chunk_size = int(block_size * 2.5)
    block_manager.mark_blocks_as_computed(seq_group, token_chunk_size)
    computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0])
    assert len(computed_blocks) == 2

    # Actual computed tokens.
    seq_group.seqs[0].data.update_num_computed_tokens(token_chunk_size)

    # 2nd chunk: complete the 3rd block and 4 additional blocks.
    token_chunk_size = int(block_size * 4.5)
    block_manager.mark_blocks_as_computed(seq_group, token_chunk_size)
    computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0])
    assert len(computed_blocks) == 7
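

# Chunk arithmetic for the test above: chunk 1 covers 2.5 * block_size = 10
# tokens, and only full blocks count as computed, so floor(2.5) = 2 blocks.
# Chunk 2 adds 4.5 * block_size = 18 tokens for a cumulative 28 tokens,
# i.e. exactly 7 full blocks, hence 7 computed blocks.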