import types
from typing import List, Optional, Tuple, Type

import pytest
import torch
from huggingface_hub import snapshot_download
from PIL.Image import Image
from transformers import AutoConfig

from aphrodite.common.utils import is_cpu
from aphrodite.modeling.models.internvl import (IMG_CONTEXT, IMG_END,
                                                IMG_START,
                                                image_to_pixel_values)
from aphrodite.multimodal.utils import rescale_image_size

from ..conftest import IMAGE_ASSETS, AphroditeRunner, HfRunner, _ImageAssets
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
    "cherry_blossom":
    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
})

# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
models = [
    snapshot_download("OpenGVLab/InternVL2-1B",
                      allow_patterns=DOWNLOAD_PATTERN),
    snapshot_download("OpenGVLab/InternVL2-2B",
                      allow_patterns=DOWNLOAD_PATTERN),
    # Broken due to outdated implementation of Phi-3
    # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
    # snapshot_download("OpenGVLab/InternVL2-4B"),
]


class InternVLProcessor:
    """A simple processor for InternVL2, which lacks an HF processor."""

    def __init__(self, hf_runner: HfRunner):
        self.num_image_token = hf_runner.model.num_image_token
        self.tokenizer = hf_runner.tokenizer
        self.dtype = hf_runner.model.dtype

        self.config = AutoConfig.from_pretrained(hf_runner.model_name)
        self.vision_config = self.config.vision_config
        self.use_thumbnail = self.config.use_thumbnail
        self.min_num = self.config.min_dynamic_patch
        self.max_num = self.config.max_dynamic_patch
        self.image_size = self.vision_config.image_size
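
    # Convert the PIL image into tiled pixel values and expand the "<image>"
    # placeholder in the prompt into the corresponding
    # IMG_START + IMG_CONTEXT * num_image_token * num_patches + IMG_END
    # token sequence expected by the HF model.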
    def __call__(self, text: str, images: Image, **kwargs):
        pixel_values = image_to_pixel_values(images, self.image_size,
                                             self.min_num, self.max_num,
                                             self.use_thumbnail).to(self.dtype)
        num_patches_list = [pixel_values.shape[0]]
        for num_patches in num_patches_list:
            context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
            image_tokens = IMG_START + context_tokens + IMG_END
            text = text.replace('<image>', image_tokens, 1)
        prompt = self.tokenizer(text, return_tensors="pt")
        prompt.update({"pixel_values": pixel_values})
        return prompt


# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
def generate(
    self,
    pixel_values: torch.FloatTensor,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.LongTensor] = None,
    **generate_kwargs,
) -> torch.LongTensor:
    """Generate method for InternVL2 model without fixed use_cache."""
    assert self.img_context_token_id is not None
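    # Splice the vision-tower patch embeddings into the language-model input
    # embeddings at the positions of the IMG_CONTEXT placeholder tokens.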
    vit_embeds = self.extract_feature(pixel_values)
    input_embeds = self.language_model.get_input_embeddings()(input_ids)
    B, N, C = input_embeds.shape
    input_embeds = input_embeds.reshape(B * N, C)

    input_ids = input_ids.reshape(B * N)
    selected = (input_ids == self.img_context_token_id)
    assert selected.sum() != 0
    input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

    input_embeds = input_embeds.reshape(B, N, C)

    outputs = self.language_model.generate(
        inputs_embeds=input_embeds,
        attention_mask=attention_mask,
        **generate_kwargs,
    )

    return outputs


def run_test(
    hf_runner: Type[HfRunner],
    aphrodite_runner: Type[AphroditeRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference results should match between HF and Aphrodite.

    All the image fixtures for the test are under tests/images.
    For the HF runner, we provide the PIL images as input.
    For the Aphrodite runner, we provide MultiModalDataDict objects
    and the corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by the Aphrodite contract.
    The text output is sanitized to be comparable with HF.
    """
    images = [asset.pil_image for asset in image_assets]
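
    # For each image asset, replicate its prompt once per size factor and
    # rescale the image by that factor.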
    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # NOTE: Mind the order: run Aphrodite first, then HF.
    # Aphrodite needs a fresh process in which CUDA has not been initialized;
    # if HF runs first, CUDA gets initialized in the parent process, which
    # breaks the multiprocessing backend with the fork start method (the
    # default).
    # max_model_len should be greater than image_feature_size.
    with aphrodite_runner(model,
                          max_model_len=4096,
                          dtype=dtype,
                          tensor_parallel_size=tensor_parallel_size,
                          distributed_executor_backend=distributed_executor_backend,
                          enforce_eager=True) as aphrodite_model:
        aphrodite_outputs_per_image = [
            aphrodite_model.generate_greedy_logprobs(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=images)
            for prompts, images in inputs_per_image
        ]
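
    # Patch the HF model in-place so HfRunner can drive it: register the
    # IMG_CONTEXT token id, attach the missing processor, forward
    # get_output_embeddings() to the inner language model, and bind a
    # generate() that does not hard-code use_cache.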
    with hf_runner(model, dtype=dtype) as hf_model:
        img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
            "<IMG_CONTEXT>")
        hf_model.model.img_context_token_id = img_context_token_id
        hf_model.processor = InternVLProcessor(hf_model)
        hf_model.model.get_output_embeddings = lambda: \
            hf_model.model.language_model.get_output_embeddings()
        hf_model.model.generate = types.MethodType(generate, hf_model.model)
        eos_token_id = hf_model.tokenizer.eos_token_id
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=hf_images,
                                                    eos_token_id=eos_token_id)
            for prompts, hf_images in inputs_per_image
        ]

    for hf_outputs, aphrodite_outputs in zip(hf_outputs_per_image,
                                             aphrodite_outputs_per_image):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=aphrodite_outputs,
            name_0="hf",
            name_1="aphrodite",
        )


def run_awq_test(
    aphrodite_runner: Type[AphroditeRunner],
    image_assets: _ImageAssets,
    models: Tuple[str, str],
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
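    """Outputs should match between the unquantized source model and its
    AWQ-quantized counterpart when compared via greedy logprobs."""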
    source_model, quant_model = models

    images = [asset.pil_image for asset in image_assets]

    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # NOTE: Aphrodite needs a fresh process in which CUDA has not been
    # initialized; initializing CUDA early breaks the multiprocessing backend
    # with the fork start method (the default).
    # max_model_len should be greater than image_feature_size.
    with aphrodite_runner(source_model,
                          max_model_len=4096,
                          dtype=dtype,
                          tensor_parallel_size=tensor_parallel_size,
                          distributed_executor_backend=distributed_executor_backend,
                          enforce_eager=True) as aphrodite_model:
        source_outputs_per_image = [
            aphrodite_model.generate_greedy_logprobs(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=images)
            for prompts, images in inputs_per_image
        ]

    with aphrodite_runner(quant_model,
                          quantization="awq",
                          max_model_len=4096,
                          dtype=dtype,
                          tensor_parallel_size=tensor_parallel_size,
                          distributed_executor_backend=distributed_executor_backend,
                          enforce_eager=True) as aphrodite_model:
        quant_outputs_per_image = [
            aphrodite_model.generate_greedy_logprobs(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=images)
            for prompts, images in inputs_per_image
        ]

    for source_outputs, quant_outputs in zip(source_outputs_per_image,
                                             quant_outputs_per_image):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
            outputs_0_lst=source_outputs,
            outputs_1_lst=quant_outputs,
            name_0="source",
            name_1="awq",
        )
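

# Run in fp16 by default; fall back to bfloat16 when running on CPU.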
target_dtype = "half"
if is_cpu():
    target_dtype = "bfloat16"


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_models(hf_runner, aphrodite_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    run_test(
        hf_runner,
        aphrodite_runner,
        image_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )


@pytest.mark.parametrize(
    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(aphrodite_runner, image_assets, models, size_factors,
                    dtype: str, max_tokens: int, num_logprobs: int) -> None:
    run_awq_test(
        aphrodite_runner,
        image_assets,
        models,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )