import pathlib
from typing import Dict, List, Optional, Tuple, Type, Union

import pytest
import torch
from PIL.Image import Image

from aphrodite.inputs import InputContext, LLMInputs
from aphrodite.multimodal.base import MultiModalInputs
from aphrodite.multimodal.utils import (cached_get_tokenizer,
                                        rescale_image_size)

from ....conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner, ImageAsset,
                          PromptImageInput, _ImageAssets)
from ...utils import build_model_context, check_logprobs_close

text_only_models = [
    "Qwen/Qwen-7B-Chat"  # Has no visual component
]

multimodal_models = ["Qwen/Qwen-VL"]

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "Picture 1: <img></img>\nWhat's the content of the image?: ",
    "cherry_blossom":
    "Picture 1: <img></img>\nWhat is the season?: ",
})

HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nCan you compare these images?\n"  # noqa: E501
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n"  # noqa: E501

### Multimodal preprocessing tests
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
# These values are specific to Qwen-VL/Chat; we can get these from the model
# config also, but they are hardcoded here to keep the parametrize/fixtures
# easy to read.
IMG_START_ID = 151857
IMG_END_ID = 151858
IMG_PAD_ID = 151859
TOKS_PER_IMG = 256
VIS_ENC_DIM = 4096
IMG_SIZE = 448
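# For reference, a hedged sketch of reading the same values from the HF config
# instead of hardcoding them; the key names under `visual` below are
# assumptions and have not been verified against the checkpoint:
#
#   from transformers import AutoConfig
#   hf_config = AutoConfig.from_pretrained("Qwen/Qwen-VL",
#                                          trust_remote_code=True)
#   IMG_START_ID = hf_config.visual["image_start_id"]  # assumed config key
#   IMG_SIZE = hf_config.visual["image_size"]          # assumed config key
#   VIS_ENC_DIM = hf_config.visual["output_dim"]       # assumed config key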


@pytest.fixture()
def input_mapper_for_qwen():
    # Lazy import to avoid initializing CUDA during test collection
    from aphrodite.modeling.models.qwen import input_mapper_for_qwen
    return input_mapper_for_qwen


@pytest.fixture()
def input_processor_for_qwen():
    # Lazy import to avoid initializing CUDA during test collection
    from aphrodite.modeling.models.qwen import input_processor_for_qwen
    return input_processor_for_qwen


@pytest.fixture()
def qwen_vl_context() -> InputContext:
    """Get an InputContext for Qwen-VL."""
    return build_model_context(model_name="Qwen/Qwen-VL",
                               trust_remote_code=True)


# Happy path tests for single/multi-image scenarios for the multimodal
# input processor and mapper, respectively
@pytest.mark.parametrize("num_images", [1, 2])
def test_input_processor_valid_mm_data(input_processor_for_qwen,
                                       qwen_vl_context: InputContext,
                                       num_images: int):
    """Happy cases for image inputs to Qwen's multimodal input processor."""
    prompt = "".join(
        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
    inputs = LLMInputs(
        prompt=prompt,
        # When processing multimodal data for a multimodal model, the qwen
        # input processor will overwrite the provided prompt_token_ids with
        # the image prompts
        prompt_token_ids=None,
        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
    )
    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
    assert isinstance(proc_inputs, dict)

    # Each image should have one start / stop and a fixed context of 256
    proc_tokens = proc_inputs["prompt_token_ids"]
    assert proc_tokens.count(IMG_START_ID) == num_images
    assert proc_tokens.count(IMG_END_ID) == num_images
    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
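

# NOTE: the counts asserted above reflect the expected expansion of each
# "<img></img>" placeholder into a fixed-length block of image tokens,
# roughly: [IMG_START_ID] + [IMG_PAD_ID] * TOKS_PER_IMG + [IMG_END_ID].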


@pytest.mark.parametrize(
    "img_data,expected_shape",
    [
        # single / multi-image
        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
        # single / multi-image embeddings
        (torch.rand(
            (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
        (torch.rand(
            (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
        (torch.rand(
            (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
    ])
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
                                    qwen_vl_context: InputContext,
                                    img_data: Union[torch.Tensor, List[Image],
                                                    Image],
                                    expected_shape: List[int]):
    """Happy cases for image inputs to Qwen's multimodal input mapper."""
    mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
    # Ensure that we get the appropriately shaped pixel_values
    # for images and image embeddings, respectively.
    assert isinstance(mapped_img_data, MultiModalInputs)
    assert "pixel_values" in mapped_img_data
    assert mapped_img_data["pixel_values"].shape == expected_shape


# Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
    {
        "image": torch.rand((5))
    },
    {
        "image": torch.rand((5, 5, 5, 5, 5))
    },
])
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
                                         qwen_vl_context: InputContext,
                                         mm_data: Dict[str, torch.Tensor]):
    """Test sad cases validated in Qwen's multimodal input processor."""
    tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
                                     trust_remote_code=True)
    prompt = "Picture 1: <img></img>\n"
    prompt_token_ids = tokenizer.encode(prompt)
    inputs = LLMInputs(prompt=prompt,
                       prompt_token_ids=prompt_token_ids,
                       multi_modal_data=mm_data)
    # Should fail since we have too many or too few dimensions for embeddings
    with pytest.raises(ValueError):
        input_processor_for_qwen(qwen_vl_context, inputs)


@pytest.mark.parametrize(
    "img_data",
    [
        # Wrong context length
        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
        # Wrong visual encoder output size
        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
    ])
def test_input_mapper_invalid_mm_data(
        input_mapper_for_qwen,
        qwen_vl_context: InputContext,
        img_data: Union[torch.Tensor, List[Image], Image],
):
    """Sad cases validated in Qwen VL's multimodal input mapper."""
    with pytest.raises(ValueError):
        input_mapper_for_qwen(qwen_vl_context, img_data)


### End-to-end generation tests
def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
                         assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
    """Given a temporary dir path, export one or more image assets into the
    tempdir & replace each image placeholder in the prompt with the local path
    to the exported image so that the HF version of Qwen-VL can resolve the
    path and load the image in its forward() call.

    Args:
        tmp_path: Tempdir for the test under consideration.
        prompt: Prompt with image placeholders.
        assets: List of image assets whose len equals the num placeholders.
    """
    # Ensure that the number of placeholders matches the number of assets;
    # if this is not true, the test is probably written incorrectly.
    assert prompt.count("<img></img>") == len(assets)

    # Replace the placeholders with local paths to the exported assets
    for asset in assets:
        image_tmp_path = tmp_path / f"{asset.name}.jpg"
        asset.pil_image.save(image_tmp_path)
        prompt = prompt.replace(
            "<img></img>",
            f"<img>{image_tmp_path}</img>",
            1,
        )
    return prompt
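

# For illustration, get_prompt_with_path rewrites a single-image prompt roughly
# as follows (the exact tmp path is generated by pytest and differs per run):
#   "Picture 1: <img></img>\n"
#     -> "Picture 1: <img>/tmp/.../stop_sign.jpg</img>\n"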


def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and aphrodite.

    All the image fixtures for the test are under tests/images.
    For the huggingface runner, we provide the PIL images as input.
    For the aphrodite runner, we provide MultiModalDataDict objects
    and the corresponding MultiModalConfig as input.
    Note: the text input is also adjusted to abide by the aphrodite contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. Run Aphrodite first, and then run HF.
    # Aphrodite needs a fresh new process without CUDA initialization;
    # if we run HF first, CUDA will already be initialized, which hurts the
    # multiprocessing backend with the fork method (the default method).

    # max_model_len should be greater than image_feature_size
    # Qwen encodes each image into a fixed content size of 256
    with aphrodite_runner(model,
                          max_model_len=1024,
                          max_num_seqs=1,
                          dtype=dtype,
                          limit_mm_per_prompt={"image": mm_limit},
                          tensor_parallel_size=tensor_parallel_size,
                          distributed_executor_backend=distributed_executor_backend,
                          enforce_eager=True) as aphrodite_model:
        aphrodite_outputs_per_image = [
            aphrodite_model.generate_greedy_logprobs(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=images)
            for prompts, images in inputs
        ]

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
            for prompts, images in inputs
        ]

    for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_image,
                                             aphrodite_outputs_per_image):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=aphrodite_outputs,
            name_0="hf",
            name_1="aphrodite",
        )
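

# For reference, each entry of `inputs` passed to run_test above is one
# (prompts, images) batch; the tests below build these batches from the
# size_factors parametrization (one rescaled copy of each image per factor).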


@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                        hf_runner: Type[HfRunner],
                                        aphrodite_runner: Type[AphroditeRunner],
                                        image_assets: _ImageAssets, model: str,
                                        size_factors: List[float], dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Tests multimodal models with single image prompts."""
    images = [asset.pil_image for asset in image_assets]

    prompts = [
        get_prompt_with_path(tmp_path, prompt, [asset])
        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]

    inputs = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, prompts)]

    run_test(
        hf_runner,
        aphrodite_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )


@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
                                       hf_runner: Type[HfRunner],
                                       aphrodite_runner: Type[AphroditeRunner],
                                       image_assets: _ImageAssets, model: str,
                                       size_factors: List[float], dtype: str,
                                       max_tokens: int,
                                       num_logprobs: int) -> None:
    """Tests multimodal models with multi-image prompts."""
    images = [asset.pil_image for asset in image_assets]
    # Put all of the images into one prompt.
    prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
                                  image_assets)
    inputs = [([prompt for _ in size_factors],
               [[rescale_image_size(image, factor) for image in images]
                for factor in size_factors])]

    run_test(
        hf_runner,
        aphrodite_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )


# Ensure that a text-only Qwen model can still be loaded and
# used for inference in Aphrodite without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
    aphrodite_runner: Type[AphroditeRunner],
    example_prompts: List[str],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    with aphrodite_runner(model, dtype=dtype) as aphrodite_model:
        aphrodite_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs=num_logprobs,
        )