@@ -10,11 +10,9 @@ from aphrodite.inputs import InputContext, LLMInputs
 from aphrodite.multimodal.base import MultiModalInputs
 from aphrodite.multimodal.utils import cached_get_tokenizer, rescale_image_size
 
-from ..conftest import (IMAGE_ASSETS, AphroditeRunner, HfRunner, ImageAsset,
-                        PromptImageInput, _ImageAssets)
-from .utils import check_logprobs_close
-
-pytestmark = pytest.mark.vlm
+from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
+                          AphroditeRunner, _ImageAssets)
+from ...utils import check_logprobs_close
 
 text_only_models = [
     "Qwen/Qwen-7B-Chat"  # Has no visual component
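# Editor's note (illustrative, not part of the patch): the deleted
# module-level `pytestmark = pytest.mark.vlm` used to apply the `vlm`
# mark to every test in this file; after this change the mark has to
# come from somewhere else (e.g. a per-directory conftest). The two
# equivalent spellings, for reference:
import pytest

pytestmark = pytest.mark.vlm  # module-level: marks every test in the file


@pytest.mark.vlm  # per-test alternative to the module-level mark
def test_marked_example():
    assert True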
@@ -42,6 +40,8 @@ IMG_PAD_ID = 151859
 TOKS_PER_IMG = 256
 VIS_ENC_DIM = 4096
 IMG_SIZE = 448
+
+
 def build_model_context(model_name: str,
                         tokenizer_name: Optional[str] = None,
                         trust_remote_code: bool = False):
@@ -51,6 +51,7 @@ def build_model_context(model_name: str,
         model_name: Name of the model being considered.
         tokenizer_name: Name of the tokenizer being considered.
         trust_remote_code: Whether or not to allow loading remote code.
+
     Returns:
         InputContext for the model being considered.
     """
@@ -65,21 +66,29 @@ def build_model_context(model_name: str,
         seed=0,
     )
     return InputContext(model_config)
+
+
 @pytest.fixture()
 def input_mapper_for_qwen():
     # Lazy import to avoid initializing CUDA during test collection
     from aphrodite.modeling.models.qwen import input_mapper_for_qwen
     return input_mapper_for_qwen
+
+
 @pytest.fixture()
 def input_processor_for_qwen():
     # Lazy import to avoid initializing CUDA during test collection
     from aphrodite.modeling.models.qwen import input_processor_for_qwen
     return input_processor_for_qwen
+
+
 @pytest.fixture()
 def qwen_vl_context() -> InputContext:
     """Get an InputContext for Qwen-VL."""
     return build_model_context(model_name="Qwen/Qwen-VL",
                                trust_remote_code=True)
+
+
 # Happy path tests for single/multi-image scenarios for the multimodal
 # input processor and mapper, respectively
 @pytest.mark.parametrize("num_images", [1, 2])
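# Editor's note (illustrative, not part of the patch): all three fixtures
# above defer the aphrodite.modeling import to fixture runtime, so merely
# collecting this module never initializes CUDA. A minimal sketch of the
# same pattern with a hypothetical heavyweight module:
import pytest


@pytest.fixture()
def heavy_helper():
    # Importing inside the fixture keeps collection cheap; the cost is
    # paid only by tests that actually request this fixture.
    from hypothetical_cuda_module import helper  # hypothetical import
    return helper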
@@ -99,11 +108,14 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
     )
     proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
     assert isinstance(proc_inputs, dict)
+
     # Each image should have one start / stop and a fixed context of 256
     proc_tokens = proc_inputs["prompt_token_ids"]
     assert proc_tokens.count(IMG_START_ID) == num_images
     assert proc_tokens.count(IMG_END_ID) == num_images
     assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
+
+
 @pytest.mark.parametrize(
     "img_data,expected_shape",
     [
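# Editor's note (illustrative, not part of the patch): the assertions
# above pin down the Qwen-VL prompt layout: each image contributes one
# start token, one end token, and a fixed visual context of exactly
# TOKS_PER_IMG pad tokens. The arithmetic, using the constants defined
# earlier in this file:
IMG_PAD_ID = 151859  # from the file above
TOKS_PER_IMG = 256   # fixed visual context per image


def expected_pad_count(num_images: int) -> int:
    # n images -> n * 256 IMG_PAD_ID tokens in prompt_token_ids.
    return num_images * TOKS_PER_IMG


assert expected_pad_count(1) == 256
assert expected_pad_count(2) == 512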
@@ -130,6 +142,8 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
     assert isinstance(mapped_img_data, MultiModalInputs)
     assert "pixel_values" in mapped_img_data
     assert mapped_img_data["pixel_values"].shape == expected_shape
+
+
 # Sad path tests for the multimodal input processor and mapper, respectively
 @pytest.mark.parametrize("mm_data", [
     {
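# Editor's note (illustrative, not part of the patch): IMG_SIZE = 448 is
# the side length the mapper is expected to resize inputs to, so for n
# images the parametrized expected_shape values presumably look like
# (n, 3, IMG_SIZE, IMG_SIZE). A sketch of that shape check with torch
# (the tensor here is random stand-in data, not real mapper output):
import torch

IMG_SIZE = 448
num_images = 2
pixel_values = torch.rand(num_images, 3, IMG_SIZE, IMG_SIZE)
assert pixel_values.shape == (num_images, 3, IMG_SIZE, IMG_SIZE)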
@@ -153,6 +167,8 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
     # Should fail since we have too many or too few dimensions for embeddings
     with pytest.raises(ValueError):
         input_processor_for_qwen(qwen_vl_context, inputs)
+
+
 @pytest.mark.parametrize(
     "img_data",
     [
@@ -169,6 +185,8 @@ def test_input_mapper_invalid_mm_data(
     """Sad cases validated in Qwen VL's multimodal input mapper."""
     with pytest.raises(ValueError):
         input_mapper_for_qwen(qwen_vl_context, img_data)
+
+
 ### End-to-end generation tests
 def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
                          assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
@@ -176,6 +194,7 @@ def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
     tempdir & replace its contents with the local path to the string so that
     the HF version of Qwen-VL can resolve the path and load the image in its
     forward() call.
+
     Args:
         tmp_path: Tempdir for test under consideration.
         prompt: Prompt with image placeholders.
@@ -184,6 +203,7 @@ def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
     # Ensure that the number of placeholders matches the number of assets;
     # If this is not true, the test is probably written incorrectly.
     assert prompt.count("<img></img>") == len(assets)
+
     # Replace the placeholders with local paths to the exported assets
     for asset in assets:
         image_tmp_path = tmp_path / f"{asset.name}.jpg"
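# Editor's note (illustrative, not part of the patch): the loop above
# rewrites each empty <img></img> placeholder to wrap the exported file
# path, which is the form HF Qwen-VL resolves in forward(). A standalone
# sketch of one-at-a-time substitution (the path is made up; the real
# helper uses tmp_path and the asset name):
prompt = "Picture 1: <img></img>\nWhat is shown in the image?"
for path in ["/tmp/stop_sign.jpg"]:
    # str.replace with count=1 fills placeholders left to right.
    prompt = prompt.replace("<img></img>", f"<img>{path}</img>", 1)
assert prompt.count("<img>/tmp/stop_sign.jpg</img>") == 1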
@@ -195,6 +215,7 @@ def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
     )
     return prompt
 
+
 def run_test(
     hf_runner: Type[HfRunner],
     aphrodite_runner: Type[AphroditeRunner],
@@ -209,6 +230,7 @@ def run_test(
     distributed_executor_backend: Optional[str] = None,
 ):
     """Inference result should be the same between hf and aphrodite.
+
     All the image fixtures for the test are under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For aphrodite runner, we provide MultiModalDataDict objects
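# Editor's note (illustrative, not part of the patch): per the docstring
# above, the two runners take differently shaped image inputs. A schematic
# of the two conventions (placeholder image, hypothetical variable names):
from PIL import Image

image = Image.new("RGB", (448, 448))
hf_images = [image]                   # HF runner: plain PIL images
aphrodite_mm_data = {"image": image}  # aphrodite runner: MultiModalDataDict-style mapping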
@@ -286,14 +308,17 @@ def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                         num_logprobs: int) -> None:
     """Tests multimodal models with single image prompts."""
     images = [asset.pil_image for asset in image_assets]
+
     prompts = [
         get_prompt_with_path(tmp_path, prompt, [asset])
         for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
     ]
+
     inputs = [(
         [prompt for _ in size_factors],
         [rescale_image_size(image, factor) for factor in size_factors],
     ) for image, prompt in zip(images, prompts)]
+
     run_test(
         hf_runner,
         aphrodite_runner,
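# Editor's note (illustrative, not part of the patch): each prompt is
# repeated once per size factor and paired with a rescaled copy of its
# image, so one asset fans out into len(size_factors) inputs. A sketch
# with a stand-in for aphrodite.multimodal.utils.rescale_image_size:
from PIL import Image


def rescale_image_size(image: Image.Image, factor: float) -> Image.Image:
    # Stand-in resize helper; the real one lives in aphrodite.
    w, h = image.size
    return image.resize((int(w * factor), int(h * factor)))


size_factors = [0.25, 0.5, 1.0]
image = Image.new("RGB", (448, 448))
resized = [rescale_image_size(image, f) for f in size_factors]
assert [im.size for im in resized] == [(112, 112), (224, 224), (448, 448)]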
@@ -305,6 +330,8 @@ def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
         mm_limit=1,
         tensor_parallel_size=1,
     )
+
+
 @pytest.mark.parametrize("model", multimodal_models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -337,6 +364,7 @@ def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
     inputs = [([prompt for _ in size_factors],
                [[rescale_image_size(image, factor) for image in images]
                 for factor in size_factors])]
+
     run_test(
         hf_runner,
         aphrodite_runner,