# Tests for chat-message parsing with image inputs
# (parse_chat_messages / parse_chat_messages_futures).
- import warnings
- from typing import Optional
- import pytest
- from PIL import Image
- from aphrodite.assets.image import ImageAsset
- from aphrodite.common.config import ModelConfig
- from aphrodite.endpoints.chat_utils import (parse_chat_messages,
- parse_chat_messages_futures)
- from aphrodite.multimodal import MultiModalDataDict
- from aphrodite.multimodal.utils import encode_image_base64
- from aphrodite.transformers_utils.tokenizer_group import TokenizerGroup
# Model identifier shared by the model-config and tokenizer fixtures in this
# module.
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
@pytest.fixture(scope="module")
def phi3v_model_config():
    """Build the module-scoped ``ModelConfig`` used by every test here.

    The config is created for ``PHI3V_MODEL_ID`` with
    ``limit_mm_per_prompt`` set to two images, which is the limit the
    "rejects too many images" tests rely on.
    """
    mm_limits = {
        "image": 2,
    }
    config = ModelConfig(
        PHI3V_MODEL_ID,
        PHI3V_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="bfloat16",
        seed=0,
        limit_mm_per_prompt=mm_limits,
    )
    return config
@pytest.fixture(scope="module")
def phi3v_tokenizer():
    """Build the module-scoped ``TokenizerGroup`` for ``PHI3V_MODEL_ID``.

    LoRA is disabled and no maximum input length is set.
    """
    tokenizer_group = TokenizerGroup(
        tokenizer_id=PHI3V_MODEL_ID,
        enable_lora=False,
        max_num_seqs=5,
        max_input_length=None,
    )
    return tokenizer_group
@pytest.fixture(scope="module")
def image_url():
    """Return a ``data:`` URL embedding the base64-encoded test image.

    The image comes from the ``cherry_blossom`` ``ImageAsset``; encoding it
    inline lets the tests pass the URL to the parser directly.
    """
    asset = ImageAsset('cherry_blossom')
    encoded = encode_image_base64(asset.pil_image)
    return f"data:image/jpeg;base64,{encoded}"
def _assert_mm_data_is_image_input(
    mm_data: Optional[MultiModalDataDict],
    image_count: int,
) -> None:
    """Assert that ``mm_data`` holds exactly the expected image payload.

    Args:
        mm_data: Multi-modal data returned by the parser; must not be None
            and must contain only the ``"image"`` key.
        image_count: Number of images expected. When it is 1 the payload
            must be a single ``Image.Image``; otherwise it must be a list
            of exactly ``image_count`` items.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}

    images = mm_data.get("image")
    assert images is not None

    # Multi-image payloads are lists; a lone image is passed unwrapped.
    if image_count != 1:
        assert isinstance(images, list) and len(images) == image_count
        return

    assert isinstance(images, Image.Image)
def test_parse_chat_messages_single_image(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """One image part followed by text yields an ``<|image_1|>`` prefix."""
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    text_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    conversation, mm_data = parse_chat_messages(
        messages, phi3v_model_config, phi3v_tokenizer)

    expected = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?",
    }]
    assert conversation == expected
    _assert_mm_data_is_image_input(mm_data, 1)
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: one image part yields an ``<|image_1|>`` prefix.

    The multi-modal data is returned as an awaitable and is checked after
    awaiting it.
    """
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    text_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    conversation, mm_future = parse_chat_messages_futures(
        messages, phi3v_model_config, phi3v_tokenizer)

    expected = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?",
    }]
    assert conversation == expected

    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 1)
def test_parse_chat_messages_multiple_images(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two image parts yield numbered placeholders ahead of the text."""
    # Separate dict objects per image part, matching two distinct literals.
    image_parts = [{
        "type": "image_url",
        "image_url": {"url": image_url},
    } for _ in range(2)]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    conversation, mm_data = parse_chat_messages(
        messages, phi3v_model_config, phi3v_tokenizer)

    expected = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    assert conversation == expected
    _assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: two image parts yield two numbered placeholders.

    The multi-modal data is returned as an awaitable and is checked after
    awaiting it.
    """
    # Separate dict objects per image part, matching two distinct literals.
    image_parts = [{
        "type": "image_url",
        "image_url": {"url": image_url},
    } for _ in range(2)]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    conversation, mm_future = parse_chat_messages_futures(
        messages, phi3v_model_config, phi3v_tokenizer)

    expected = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    assert conversation == expected

    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Text that already names both placeholders is passed through as-is."""
    prompt = "What's in <|image_1|> and how does it compare to <|image_2|>?"
    # Separate dict objects per image part, matching two distinct literals.
    image_parts = [{
        "type": "image_url",
        "image_url": {"url": image_url},
    } for _ in range(2)]
    text_part = {"type": "text", "text": prompt}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    conversation, mm_data = parse_chat_messages(
        messages, phi3v_model_config, phi3v_tokenizer)

    # No extra placeholders are expected: the content equals the prompt.
    expected = [{"role": "user", "content": prompt}]
    assert conversation == expected
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Only the placeholder missing from the text is expected as a prefix.

    The prompt mentions ``<|image_1|>`` but not ``<|image_2|>``, so the
    expected content starts with ``<|image_2|>`` followed by the prompt.
    """
    prompt = (
        "What's in <|image_1|> and how does it compare to the other one?")
    # Separate dict objects per image part, matching two distinct literals.
    image_parts = [{
        "type": "image_url",
        "image_url": {"url": image_url},
    } for _ in range(2)]
    text_part = {"type": "text", "text": prompt}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    conversation, mm_data = parse_chat_messages(
        messages, phi3v_model_config, phi3v_tokenizer)

    expected = [{
        "role": "user",
        "content":
        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
        "other one?",
    }]
    assert conversation == expected
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Placeholder numbering is expected to continue across user turns.

    The first user turn gets ``<|image_1|>`` and the second user turn gets
    ``<|image_2|>``; the assistant turn in between is unchanged.
    """
    first_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    second_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What about this one?"},
        ],
    }
    messages = [first_turn, assistant_turn, second_turn]

    conversation, mm_data = parse_chat_messages(
        messages, phi3v_model_config, phi3v_tokenizer)

    expected = [
        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
    ]
    assert conversation == expected
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images in one user message must raise ``ValueError``.

    The expected error text is "At most 2 image(s) may be provided in one
    request.".
    """
    # Separate dict objects per image part, matching three distinct literals.
    image_parts = [{
        "type": "image_url",
        "image_url": {"url": image_url},
    } for _ in range(3)]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    with warnings.catch_warnings():
        # Ignore the "never awaited" warning for the duration of the call.
        # NOTE(review): presumably raised because parsing aborts before the
        # pending image coroutines are awaited - confirm.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(
                ValueError,
                match="At most 2 image\\(s\\) may be provided in one request\\."
        ):
            parse_chat_messages(messages, phi3v_model_config,
                                phi3v_tokenizer)
def test_parse_chat_messages_rejects_too_many_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images spread over two user turns must raise ``ValueError``.

    One image is in the first user turn and two in the second. The expected
    error text is "At most 2 image(s) may be provided in one request.".
    """
    first_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    second_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What about these two?"},
        ],
    }
    messages = [first_turn, assistant_turn, second_turn]

    with warnings.catch_warnings():
        # Ignore the "never awaited" warning for the duration of the call.
        # NOTE(review): presumably raised because parsing aborts before the
        # pending image coroutines are awaited - confirm.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(
                ValueError,
                match="At most 2 image\\(s\\) may be provided in one request\\."
        ):
            parse_chat_messages(messages, phi3v_model_config,
                                phi3v_tokenizer)
|