test_encoder_decoder_model_runner.py

from array import array
from typing import List

import pytest
import torch

from aphrodite.common.sequence import (SamplingParams, SequenceData,
                                        SequenceGroupMetadata)
from aphrodite.common.utils import is_cpu
from aphrodite.constants import APHRODITE_TOKEN_ID_ARRAY_TYPE
from aphrodite.engine.args_tools import EngineArgs
from aphrodite.task_handler.enc_dec_model_runner import (
    EncoderDecoderModelRunner)

# CUDA graph scenarios to test
#
# Currently CUDA graph is not supported
ENFORCE_EAGER = [True]

BATCH_SIZES = [1, 4, 16, 64, 256]


def _create_model_runner(model: str, *args,
                         **kwargs) -> EncoderDecoderModelRunner:
    engine_args = EngineArgs(model, *args, **kwargs)
    engine_config = engine_args.create_engine_config()
    model_runner = EncoderDecoderModelRunner(
        model_config=engine_config.model_config,
        parallel_config=engine_config.parallel_config,
        scheduler_config=engine_config.scheduler_config,
        device_config=engine_config.device_config,
        cache_config=engine_config.cache_config,
        load_config=engine_config.load_config,
        lora_config=engine_config.lora_config,
        prompt_adapter_config=engine_config.prompt_adapter_config,
        is_driver_worker=True,
    )
    return model_runner


@pytest.mark.skipif(condition=is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/"
                    "decoder models")
@pytest.mark.parametrize("enforce_eager", ENFORCE_EAGER)
def test_empty_seq_group(enforce_eager):
    """Verify that prepare prompt and decode return empty output
    for an empty seq group list."""
    model_runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=enforce_eager,
    )
    seq_group_metadata_list: List[SequenceGroupMetadata] = []
    model_input = model_runner._prepare_model_input_tensors(
        seq_group_metadata_list)
    (
        input_tokens,
        input_positions,
        encoder_input_tokens,
        encoder_input_positions,
        attn_metadata,
        return_seq_lens,
    ) = (
        model_input.input_tokens,
        model_input.input_positions,
        model_input.encoder_input_tokens,
        model_input.encoder_input_positions,
        model_input.attn_metadata,
        model_input.seq_lens,
    )

    assert input_tokens is None
    assert input_positions is None
    assert encoder_input_tokens is None
    assert encoder_input_positions is None
    assert attn_metadata is None
    assert return_seq_lens is None


@pytest.mark.skipif(condition=is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/"
                    "decoder models")
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("enforce_eager", ENFORCE_EAGER)
def test_prepare_prompt(
    batch_size,
    enforce_eager,
):
    '''
    Test the ability of the encoder/decoder model runner subclass to
    produce prefill-phase model inputs & attention metadata.

    Test behavior:

    * Instantiate BART base model & enc/dec model runner
    * Construct sequence-group metadata for dummy prompts
    * Test that encoder attention, decoder self-attention,
      and encoder/decoder cross-attention inputs are correct

    Arguments:

    * batch_size
    * enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph)
    '''
    model_runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=enforce_eager,
    )

    seq_lens: List[int] = []
    encoder_seq_lens: List[int] = []
    seq_group_metadata_list: List[SequenceGroupMetadata] = []
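    # These dummy block assignments back the KV caches: block_tables maps each
    # sequence id to the physical blocks used for decoder self-attention,
    # while cross_block_table holds the blocks used for encoder/decoder
    # cross-attention.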
    block_tables = {0: [1]}
    cross_block_table = [2]
    for i in range(batch_size):
        # make sure all tokens fit into one block
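        # (e.g. with a block size of 16, seq_len cycles through 1..15; the
        # exact block size comes from the engine/cache config and is not
        # asserted here)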
        seq_len = i % (model_runner.block_size - 1) + 1
        seq_lens.append(seq_len)
        seq_data = SequenceData(array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
                                      range(seq_len)))
        encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
        encoder_seq_lens.append(encoder_seq_len)
        encoder_seq_data = SequenceData(
            array(APHRODITE_TOKEN_ID_ARRAY_TYPE, range(encoder_seq_len)))
        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=True,
            seq_data={0: seq_data},
            sampling_params=SamplingParams(temperature=0),
            block_tables=block_tables,
            encoder_seq_data=encoder_seq_data,
            cross_block_table=cross_block_table,
        )
        assert seq_group_metadata.token_chunk_size == seq_data.get_len()
        seq_group_metadata_list.append(seq_group_metadata)
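
    # NOTE: each prompt's token ids are simply range(seq_len), so a token's id
    # equals its position; the torch.equal checks on tokens vs. positions
    # further below rely on this.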

    # Build
    # * Decoder model inputs
    # * Decoder self-attention KV caching data structures
    # * Encoder model inputs
    # * Encoder/decoder cross-attention KV caching data structures
    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
    input_tokens = model_input.input_tokens
    input_positions = model_input.input_positions
    attn_metadata = model_input.attn_metadata
    return_seq_lens = model_input.seq_lens
    slot_mapping = attn_metadata.slot_mapping
    encoder_input_tokens = model_input.encoder_input_tokens
    encoder_input_positions = model_input.encoder_input_positions
    cross_slot_mapping = attn_metadata.cross_slot_mapping
    assert return_seq_lens == seq_lens
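    # slot_mapping assigns one KV-cache slot to each decoder token and
    # cross_slot_mapping one slot to each encoder token, so each mapping's
    # length should match the corresponding flattened token count.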
    assert len(slot_mapping) == len(input_tokens)
    assert len(cross_slot_mapping) == len(encoder_input_tokens)

    # Verify input metadata is correct for prompts.
    # - Decoder attention metadata
    device = model_runner.device
    assert attn_metadata.num_prefills > 0
    assert attn_metadata.num_decode_tokens == 0
    assert torch.equal(attn_metadata.seq_lens_tensor,
                       torch.tensor(seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.seq_lens == seq_lens
    assert attn_metadata.max_prefill_seq_len == max(seq_lens)
    assert attn_metadata.max_decode_seq_len == 0

    # - Encoder attention metadata
    assert attn_metadata.encoder_seq_lens == encoder_seq_lens
    assert torch.equal(
        attn_metadata.encoder_seq_lens_tensor,
        torch.tensor(encoder_seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens)
    assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens)

    # Test decoder subquery start locs.
    start_idx = 0
    start_loc = [start_idx]
    for seq_len in seq_lens:
        start_idx += seq_len
        start_loc.append(start_idx)
    assert torch.equal(
        attn_metadata.query_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device),
    )

    # Test decoder seq start locs & context lengths
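    # (during prefill every prompt token is part of the query, so the seq
    # start locations coincide with the query start locations computed above)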
    assert torch.equal(
        attn_metadata.seq_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device),
    )
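    # Context lengths should all be zero during prefill: nothing has been
    # computed and cached for these sequences yet.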
    assert torch.equal(
        attn_metadata.context_lens_tensor,
        torch.zeros(attn_metadata.context_lens_tensor.shape[0],
                    dtype=torch.int,
                    device=device),
    )

    # Verify block tables are correct for prompts
    # - Decoder self-attention
    expected = torch.tensor(
        [[] for _ in range(len(seq_group_metadata_list))],
        dtype=torch.int32,
        device=model_runner.device,
    )
    assert torch.equal(
        attn_metadata.block_tables,
        expected,
    )
    # - Encoder/decoder cross-attention
    assert torch.equal(
        attn_metadata.cross_block_tables,
        expected,
    )

    # Cuda graph should not be used for prefill.
    assert attn_metadata.use_cuda_graph is False

    # Verify the lengths of input tokens & positions
    # - Decoder
    assert len(input_tokens) == sum(seq_lens)
    assert len(input_positions) == sum(seq_lens)
    # -- An indirect check that model_input.input_tokens
    #    and model_input.input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        input_tokens,
        input_positions,
    )
    # - Encoder
    assert len(encoder_input_tokens) == sum(encoder_seq_lens)
    # -- An indirect check that model_input.encoder_input_tokens
    #    and model_input.encoder_input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        encoder_input_tokens,
        encoder_input_positions,
    )

    # Test that vLLM sampling infrastructure chooses the correct
    # sequence positions at which to sample (i.e. the end of
    # each sequence) in the prefill phase
    expected_selected_token_indices = []
    selected_token_start_idx = 0
    for seq_len in seq_lens:
        # Compute the index offset of the final token in each
        # prompt (recall that the prompts are concatenated)
        expected_selected_token_indices.append(selected_token_start_idx +
                                               seq_len - 1)
        selected_token_start_idx += seq_len

    sampling_metadata = model_input.sampling_metadata
    actual = sampling_metadata.selected_token_indices
    expected = torch.tensor(
        expected_selected_token_indices,
        device=actual.device,
        dtype=actual.dtype,
    )
    assert torch.equal(actual, expected)


@pytest.mark.skipif(condition=is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/"
                    "decoder models")
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("enforce_eager", ENFORCE_EAGER)
def test_prepare_decode(
    batch_size,
    enforce_eager,
):
    '''
    Test the ability of the encoder/decoder model runner subclass to
    produce decode-phase model inputs & attention metadata.

    Test behavior:

    * Instantiate BART base model & enc/dec model runner
    * Construct sequence-group metadata for dummy prompts
    * Test that encoder attention, decoder self-attention,
      and encoder/decoder cross-attention inputs are correct

    Arguments:

    * batch_size
    * enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph)
    '''
    model_runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=enforce_eager,
    )

    seq_lens: List[int] = []
    encoder_seq_lens: List[int] = []
    seq_group_metadata_list: List[SequenceGroupMetadata] = []
    block_tables = {0: [1]}
    cross_block_table = [2]
    for i in range(batch_size):
        # make sure all tokens fit into one block
        seq_len = i % (model_runner.block_size - 1) + 1
        seq_lens.append(seq_len)
        seq_data = SequenceData(
            array(APHRODITE_TOKEN_ID_ARRAY_TYPE, range(seq_len)))
        encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
        encoder_seq_lens.append(encoder_seq_len)
        encoder_seq_data = SequenceData(
            array(APHRODITE_TOKEN_ID_ARRAY_TYPE, range(encoder_seq_len)))
        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=False,
            seq_data={0: seq_data},
            sampling_params=SamplingParams(temperature=0),
            block_tables=block_tables,
            encoder_seq_data=encoder_seq_data,
            cross_block_table=cross_block_table,
        )
        assert seq_group_metadata.token_chunk_size == 1
        seq_group_metadata_list.append(seq_group_metadata)

    # Build
    # * Decoder model inputs
    # * Decoder self-attention KV caching data structures
    # * Encoder model inputs
    # * Encoder/decoder cross-attention KV caching data structures
    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
    input_tokens = model_input.input_tokens
    input_positions = model_input.input_positions
    attn_metadata = model_input.attn_metadata
    return_seq_lens = model_input.seq_lens
    slot_mapping = attn_metadata.slot_mapping
    encoder_input_tokens = model_input.encoder_input_tokens
    encoder_input_positions = model_input.encoder_input_positions
    cross_slot_mapping = attn_metadata.cross_slot_mapping
    assert return_seq_lens == seq_lens
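    # During decode each sequence contributes a single new decoder token (one
    # new KV-cache slot) and no new encoder tokens, so cross_slot_mapping is
    # expected to be empty here.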
    assert len(slot_mapping) == len(input_tokens)
    assert len(cross_slot_mapping) == len(encoder_input_tokens)

    # Verify input metadata is correct for decode phase.
    # - Decoder attention metadata
    device = model_runner.device
    assert attn_metadata.num_prefills == 0
    assert attn_metadata.num_decode_tokens > 0
    assert torch.equal(attn_metadata.seq_lens_tensor,
                       torch.tensor(seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.seq_lens == seq_lens
    assert attn_metadata.max_prefill_seq_len == 0
    assert attn_metadata.max_decode_seq_len == max(seq_lens)

    # - Encoder attention metadata
    assert attn_metadata.encoder_seq_lens == encoder_seq_lens
    assert torch.equal(
        attn_metadata.encoder_seq_lens_tensor,
        torch.tensor(encoder_seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens)
    assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens)

    # Test decoder subquery start locs.
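    # (each sequence contributes exactly one new query token per decode step,
    # so the query start locations advance by one per sequence)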
    start_idx = 0
    start_loc = [start_idx]
    for seq_len in seq_lens:
        start_idx += 1
        start_loc.append(start_idx)
    assert torch.equal(
        attn_metadata.query_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device),
    )

    # Test decoder seq start locs. Note that for normal prefill it is
    # equivalent to query_start_loc.
    start_idx = 0
    seq_start_loc = [start_idx]
    for seq_len in seq_lens:
        start_idx += seq_len
        seq_start_loc.append(start_idx)

    # Test seq_start_loc and context lengths
    assert torch.equal(
        attn_metadata.seq_start_loc,
        torch.tensor(seq_start_loc, dtype=torch.int32, device=device),
    )
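    # In the decode phase each sequence's context length is seq_len - 1: all
    # previously processed tokens are already cached, and only the newest
    # token is computed this step.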
    assert torch.equal(
        attn_metadata.context_lens_tensor,
        torch.tensor([seq_len - 1 for seq_len in seq_lens],
                     dtype=torch.int,
                     device=device))

    # Verify block tables are correct for the decode phase
    # - Decoder self-attention
    expected = torch.tensor(
        [block_tables[0] for _ in range(len(seq_group_metadata_list))],
        dtype=torch.int32,
        device=model_runner.device)
    assert torch.equal(
        attn_metadata.block_tables,
        expected,
    )
    # - Encoder/decoder cross-attention
    expected = torch.tensor(
        [cross_block_table for _ in range(len(seq_group_metadata_list))],
        dtype=torch.int32,
        device=model_runner.device)
    assert torch.equal(
        attn_metadata.cross_block_tables,
        expected,
    )

    # CUDA graph is currently not supported for encoder/decoder models.
    assert attn_metadata.use_cuda_graph is False

    # Verify the lengths of input tokens & positions
    # - Decoder
    assert len(input_tokens) == len(seq_lens)
    assert len(input_positions) == len(seq_lens)
    # -- An indirect check that model_input.input_tokens
    #    and model_input.input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        input_tokens,
        input_positions,
    )
    # - Encoder
    assert len(encoder_input_tokens) == 0
    assert len(encoder_input_positions) == 0
    # -- An indirect check that model_input.encoder_input_tokens
    #    and model_input.encoder_input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        encoder_input_tokens,
        encoder_input_positions,
    )

    # Test that vLLM sampling infrastructure chooses the correct
    # sequence positions at which to sample (i.e. the end of
    # each sequence) in the decode phase
    expected_selected_token_indices = []
    selected_token_start_idx = 0
    for seq_len in seq_lens:
        # Compute the index offset of the final token in each
        # sequence's decoded outputs; since a single token is
        # decoded per iteration per sequence, then the length
        # of the decoded tokens for a given sequence is 1 and
        # the final index offset into a given sequence's
        # generated tokens is 0 (i.e. the expected sampling index
        # for a given sequence is just `selected_token_start_idx`)
        expected_selected_token_indices.append(selected_token_start_idx)
        selected_token_start_idx += 1

    sampling_metadata = model_input.sampling_metadata
    actual = sampling_metadata.selected_token_indices
    expected = torch.tensor(
        expected_selected_token_indices,
        device=actual.device,
        dtype=actual.dtype,
    )
    assert torch.equal(actual, expected)
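
# These tests can be run directly with pytest (assuming a CUDA-capable GPU and
# an installed Aphrodite build); the path below is an assumption about where
# this file lives in the repository:
#
#   pytest tests/worker/test_encoder_decoder_model_runner.py -v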