# test_chat_utils.py
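"""Tests for chat message parsing in aphrodite.endpoints.chat_utils.

Exercises parse_chat_messages and parse_chat_messages_futures with
OpenAI-style multimodal user content ("image_url" + "text" parts) against a
Phi-3.5-vision configuration, checking both the rendered placeholder prompts
and the returned multimodal image data.
"""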

import warnings
from typing import Optional

import pytest
from PIL import Image

from aphrodite.assets.image import ImageAsset
from aphrodite.common.config import ModelConfig
from aphrodite.endpoints.chat_utils import (parse_chat_messages,
                                            parse_chat_messages_futures)
from aphrodite.multimodal import MultiModalDataDict
from aphrodite.multimodal.utils import encode_image_base64
from aphrodite.transformers_utils.tokenizer_group import TokenizerGroup

PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"


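# Module-scoped fixtures: a ModelConfig and TokenizerGroup for Phi-3.5-vision,
# plus a base64 data URL built from the 'cherry_blossom' image asset. Note
# that limit_mm_per_prompt caps requests at 2 images, which the rejection
# tests at the bottom of this file rely on.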
@pytest.fixture(scope="module")
def phi3v_model_config():
    return ModelConfig(PHI3V_MODEL_ID,
                       PHI3V_MODEL_ID,
                       tokenizer_mode="auto",
                       trust_remote_code=True,
                       dtype="bfloat16",
                       seed=0,
                       limit_mm_per_prompt={
                           "image": 2,
                       })


@pytest.fixture(scope="module")
def phi3v_tokenizer():
    return TokenizerGroup(
        tokenizer_id=PHI3V_MODEL_ID,
        enable_lora=False,
        max_num_seqs=5,
        max_input_length=None,
    )


@pytest.fixture(scope="module")
def image_url():
    image = ImageAsset('cherry_blossom')
    base64 = encode_image_base64(image.pil_image)
    return f"data:image/jpeg;base64,{base64}"


def _assert_mm_data_is_image_input(
    mm_data: Optional[MultiModalDataDict],
    image_count: int,
) -> None:
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}

    image_data = mm_data.get("image")
    assert image_data is not None

    if image_count == 1:
        assert isinstance(image_data, Image.Image)
    else:
        assert isinstance(image_data, list) and len(image_data) == image_count


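# A single image with no explicit placeholder in the text: the parser is
# expected to prepend "<|image_1|>\n" to the user text and return the image
# as multimodal data.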
def test_parse_chat_messages_single_image(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "text",
            "text": "What's in the image?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(mm_data, 1)


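# Async variant: parse_chat_messages_futures returns the conversation
# immediately together with an awaitable that resolves to the multimodal data
# (hence the `await mm_future` below).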
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_future = parse_chat_messages_futures([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "text",
            "text": "What's in the image?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(await mm_future, 1)


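# Multiple images in one message: placeholders are numbered in order of
# appearance ("<|image_1|>", "<|image_2|>") and prepended to the text.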
def test_parse_chat_messages_multiple_images(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "text",
            "text": "What's in these images?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [{
        "role":
        "user",
        "content":
        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
    }]
    _assert_mm_data_is_image_input(mm_data, 2)


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_future = parse_chat_messages_futures([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "text",
            "text": "What's in these images?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [{
        "role":
        "user",
        "content":
        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
    }]
    _assert_mm_data_is_image_input(await mm_future, 2)


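# If the user text already references every image via "<|image_N|>"
# placeholders, the content is expected to pass through unchanged.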
def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type":
            "text",
            "text":
            "What's in <|image_1|> and how does it compare to <|image_2|>?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [{
        "role":
        "user",
        "content":
        "What's in <|image_1|> and how does it compare to <|image_2|>?"
    }]
    _assert_mm_data_is_image_input(mm_data, 2)


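# Mixed case: only the image without an explicit placeholder in the text gets
# one prepended ("<|image_2|>" here), while "<|image_1|>" stays where the user
# wrote it.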
def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type":
            "text",
            "text":
            "What's in <|image_1|> and how does it compare to the other one?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [{
        "role":
        "user",
        "content":
        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
        "other one?"
    }]
    _assert_mm_data_is_image_input(mm_data, 2)


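# Images spread across multiple user turns in the same request: placeholder
# numbering continues across messages, so the second user message gets
# "<|image_2|>".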
def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "text",
            "text": "What's in this image?"
        }]
    }, {
        "role": "assistant",
        "content": "Some stuff."
    }, {
        "role":
        "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        }, {
            "type": "text",
            "text": "What about this one?"
        }]
    }], phi3v_model_config, phi3v_tokenizer)

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?"
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content": "<|image_2|>\nWhat about this one?"
        },
    ]
    _assert_mm_data_is_image_input(mm_data, 2)


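# Rejection tests: more images than limit_mm_per_prompt allows should raise a
# ValueError. The warnings filter presumably silences the "coroutine ... was
# never awaited" RuntimeWarning from the internal async image fetch that is
# created before the limit check fails.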
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(
                ValueError,
                match="At most 2 image\\(s\\) may be provided in one request\\."
        ):
            parse_chat_messages([{
                "role":
                "user",
                "content": [{
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    }
                }, {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    }
                }, {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    }
                }, {
                    "type": "text",
                    "text": "What's in these images?"
                }]
            }], phi3v_model_config, phi3v_tokenizer)


def test_parse_chat_messages_rejects_too_many_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(
                ValueError,
                match="At most 2 image\\(s\\) may be provided in one request\\."
        ):
            parse_chat_messages([{
                "role":
                "user",
                "content": [{
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    }
                }, {
                    "type": "text",
                    "text": "What's in this image?"
                }]
            }, {
                "role": "assistant",
                "content": "Some stuff."
            }, {
                "role":
                "user",
                "content": [{
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    }
                }, {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    }
                }, {
                    "type": "text",
                    "text": "What about these two?"
                }]
            }], phi3v_model_config, phi3v_tokenizer)