from typing import Dict, List

import openai
import pytest

from aphrodite.multimodal.utils import encode_image_base64, fetch_image

from ...utils import RemoteOpenAIServer

MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
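

# One server instance is shared by every test in this module. The
# --limit-mm-per-prompt flag caps how many images a single request may
# reference; test_multi_image_input below deliberately exceeds it.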
@pytest.fixture(scope="module")
def server():
    args = [
        "--dtype", "bfloat16", "--max-model-len", "4096", "--max-num-seqs",
        "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt",
        f"image={MAXIMUM_IMAGES}"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
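

# All tests talk to the shared server through one async OpenAI client.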
@pytest.fixture(scope="module")
def client(server):
    return server.get_async_client()
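

# Session-scoped cache: each test image is downloaded and base64-encoded
# once, then reused by the data-URL test below.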
@pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]:
    return {
        image_url: encode_image_base64(fetch_image(image_url))
        for image_url in TEST_IMAGE_URLS
    }
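

# Each chat test below is parametrized over every entry in TEST_IMAGE_URLS,
# so the JPEG, grayscale-PNG, RGB-PNG, and RGBA-PNG cases are all exercised.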
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    # test single chat completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=772, total_tokens=782)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
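

# Same scenario as above, but the image is passed inline as a base64 data
# URL. Note that the media type is declared as image/jpeg even for the PNG
# images; the test relies on the server inspecting the decoded bytes rather
# than trusting the declared type.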
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, image_url: str,
        base64_encoded_image: Dict[str, str]):
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url":
                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    # test single chat completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=772, total_tokens=782)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
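

# A streamed request must reproduce the exact text and finish_reason of the
# equivalent non-streaming request; temperature=0.0 keeps the comparison
# deterministic.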
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(client: openai.AsyncOpenAI,
                                    model_name: str, image_url: str):
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    # get a reference output from a non-streaming request
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1

    # finish_reason should only be returned in the last chunk
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
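

# Prompts with up to MAXIMUM_IMAGES images should succeed; anything beyond
# that should be rejected with a 400 without taking the server down.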
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: List[str]):
    messages = [{
        "role":
        "user",
        "content": [
            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            } for image_url in image_urls),
            {
                "type": "text",
                "text": "What's in these images?"
            },
        ],
    }]

    if len(image_urls) > MAXIMUM_IMAGES:
        # requests over the per-prompt image limit should be rejected
        with pytest.raises(openai.BadRequestError):
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0