
vlm: add support for video modality + llava next video (#1014)

AlpinDale, 2 months ago
commit be59e30139

+ 1 - 0
Dockerfile

@@ -15,6 +15,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
     && apt-get install -y ccache software-properties-common \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \

+ 1 - 0
Dockerfile.cpu

@@ -4,6 +4,7 @@ FROM ubuntu:22.04 AS cpu-test-1
 
 RUN apt-get update -y \
     && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html

+ 3 - 1
Dockerfile.neuron

@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 
 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update \
+    && apt-get install python3 python3-pip -y \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app

+ 2 - 1
Dockerfile.openvino

@@ -4,7 +4,8 @@
 FROM ubuntu:22.04 AS dev
 
 RUN apt-get update -y && \
-    apt-get install -y python3-pip git
+    apt-get install -y python3-pip git && \
+    apt-get install -y ffmpeg libsm6 libxext6 libgl1 
 WORKDIR /workspace
 
 # copy requirements

+ 1 - 1
Dockerfile.ppc64le

@@ -2,7 +2,7 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
 
-RUN apt-get update  -y     && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba

+ 3 - 0
Dockerfile.tpu

@@ -5,6 +5,9 @@ FROM $BASE_IMAGE
 
 WORKDIR /workspace
 
+# Install some basic utilities
+RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
+
 # Install the TPU and Pallas dependencies.
 RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

+ 1 - 1
Dockerfile.xpu

@@ -9,7 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     chmod 644 /usr/share/keyrings/intel-graphics.gpg
 
 RUN apt-get update  -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
 
 COPY ./ /workspace/aphrodite-engine
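The ffmpeg, libsm6, libxext6 and libgl1 packages added across these images are the system libraries OpenCV needs at import and decode time. A minimal smoke test inside any of the built images, assuming opencv-python (added to requirements-common.txt below) is installed and /tmp/sample.mp4 is a hypothetical local clip:

import cv2  # typically fails with "libGL.so.1: cannot open shared object file" if libgl1 is missing

cap = cv2.VideoCapture("/tmp/sample.mp4")  # hypothetical test clip
assert cap.isOpened(), "video decoding backend is unavailable"
print(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), "frames")
cap.release()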
 

+ 84 - 0
aphrodite/assets/video.py

@@ -0,0 +1,84 @@
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import List, Literal
+
+import numpy as np
+import numpy.typing as npt
+from huggingface_hub import hf_hub_download
+from PIL import Image
+
+from aphrodite.multimodal.utils import (sample_frames_from_video,
+                                        try_import_video_packages)
+
+from .base import get_cache_dir
+
+
+@lru_cache
+def download_video_asset(filename: str) -> str:
+    """
+    Download a video from the huggingface repo
+    raushan-testing-hf/videos-test and return its local path.
+    """
+    video_directory = get_cache_dir() / "video-example-data"
+    video_directory.mkdir(parents=True, exist_ok=True)
+    video_path = video_directory / filename
+    video_path_str = str(video_path)
+    if not video_path.exists():
+        video_path_str = hf_hub_download(
+            repo_id="raushan-testing-hf/videos-test",
+            filename=filename,
+            repo_type="dataset",
+            cache_dir=video_directory,
+        )
+    return video_path_str
+
+
+def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
+    cv2 = try_import_video_packages()
+    cap = cv2.VideoCapture(path)
+    if not cap.isOpened():
+        raise ValueError(f"Could not open video file {path}")
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    frames = []
+    for i in range(total_frames):
+        ret, frame = cap.read()
+        if ret:
+            frames.append(frame)
+    cap.release()
+    frames = np.stack(frames)
+    frames = sample_frames_from_video(frames, num_frames)
+    if len(frames) < num_frames:
+        raise ValueError(
+            f"Could not read enough frames from video file {path}"
+            f" (expected {num_frames} frames, got {len(frames)})"
+        )
+    return frames
+
+
+def video_to_pil_images_list(
+    path: str, num_frames: int = -1
+) -> List[Image.Image]:
+    cv2 = try_import_video_packages()
+    frames = video_to_ndarrays(path, num_frames)
+    return [
+        Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        for frame in frames
+    ]
+
+
+@dataclass(frozen=True)
+class VideoAsset:
+    name: Literal["sample_demo_1.mp4"]
+    num_frames: int = -1
+
+    @property
+    def pil_images(self) -> List[Image.Image]:
+        video_path = download_video_asset(self.name)
+        ret = video_to_pil_images_list(video_path, self.num_frames)
+        return ret
+
+    @property
+    def np_ndarrays(self) -> List[npt.NDArray]:
+        video_path = download_video_asset(self.name)
+        ret = video_to_ndarrays(video_path, self.num_frames)
+        return ret
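A minimal usage sketch for the new asset helper, assuming opencv-python and huggingface_hub are installed; the frame count and printed size are illustrative only:

from aphrodite.assets.video import VideoAsset

asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
frames = asset.pil_images  # downloads the clip on first use, returns 16 PIL frames
print(len(frames), frames[0].size)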

+ 4 - 2
aphrodite/modeling/models/__init__.py

@@ -82,8 +82,10 @@ _MULTIMODAL_MODELS = {
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "LlavaForConditionalGeneration":
     ("llava", "LlavaForConditionalGeneration"),
-    "LlavaNextForConditionalGeneration":
-    ("llava_next", "LlavaNextForConditionalGeneration"),
+    "LlavaNextForConditionalGeneration": ("llava_next",
+                                          "LlavaNextForConditionalGeneration"),
+    "LlavaNextVideoForConditionalGeneration":
+    ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
     "PaliGemmaForConditionalGeneration": ("paligemma",
                                           "PaliGemmaForConditionalGeneration"),

+ 448 - 0
aphrodite/modeling/models/llava_next_video.py

@@ -0,0 +1,448 @@
+import itertools
+import math
+from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
+                    TypedDict, Union)
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import (CLIPVisionConfig, LlavaNextVideoConfig,
+                          SiglipVisionConfig)
+
+from aphrodite.attention import AttentionMetadata
+from aphrodite.common.config import CacheConfig, MultiModalConfig
+from aphrodite.common.sequence import IntermediateTensors
+from aphrodite.common.utils import is_list_of
+from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from aphrodite.modeling.layers.activation import get_act_fn
+from aphrodite.modeling.layers.sampler import SamplerOutput
+from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
+from aphrodite.modeling.models.clip import CLIPVisionModel
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.multimodal import MULTIMODAL_REGISTRY
+from aphrodite.multimodal.utils import (cached_get_tokenizer,
+                                        repeat_and_pad_placeholder_tokens)
+from aphrodite.quantization.base_config import QuantizationConfig
+
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
+from .interfaces import SupportsMultiModal
+from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
+                     dummy_seq_data_for_siglip)
+from .utils import (filter_weights, init_aphrodite_registered_model,
+                    merge_multimodal_embeddings)
+
+# For profile run
+_MAX_FRAMES_PER_VIDEO = 32
+_MAX_NUM_VIDEOS = 1
+
+
+class LlavaNextVideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """
+    Shape: `(batch_size, num_frames, num_channels, height, width)`
+    Note that `num_frames` may differ between videos, in which case
+    the data is passed as a list instead of a batched tensor.
+    Note that only one video input per batch is currently supported.
+    """
+
+
+def get_llava_next_video_frame_feature_size(
+    hf_config: LlavaNextVideoConfig
+) -> int:
+    # Support both CLIPVisionConfig and SiglipVisionConfig
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+    spatial_pool_stride = hf_config.spatial_pool_stride
+    return int((image_size / patch_size / spatial_pool_stride) ** 2)
+
+
+def _get_max_llm_tokens(ctx: InputContext) -> int:
+    """
+    Calculate the maximum number of tokens the language model can accept,
+    given its context length and any RoPE scaling factor.
+    """
+    hf_text_config = ctx.model_config.hf_text_config
+    model_config = ctx.model_config
+    max_tokens = model_config.max_model_len
+    rope_scaling = model_config.rope_scaling
+    if rope_scaling:
+        rope_scaling_factor = hf_text_config.rope_scaling["factor"]
+    else:
+        rope_scaling_factor = 1
+    max_tokens *= rope_scaling_factor
+    return max_tokens
+
+
+def get_max_llava_next_video_tokens(ctx: InputContext) -> int:
+    # Currently set to 32 frames
+    # TODO: max_tokens = _get_max_llm_tokens(ctx)
+    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
+    tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config)
+    return _MAX_FRAMES_PER_VIDEO * tokens_per_frame
+
+
+def dummy_data_for_llava_next_video(
+    ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]
+):
+    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
+    vision_config = hf_config.vision_config
+    # TODO: support multiple videos
+    num_videos = mm_counts["video"]
+    if num_videos != _MAX_NUM_VIDEOS:
+        raise NotImplementedError(
+            f"Only {_MAX_NUM_VIDEOS} videos are supported"
+        )
+    # TODO: support configuring the number of frames
+    frames_per_video = _MAX_FRAMES_PER_VIDEO
+    # num_images = num_videos * frames_per_video
+    # fill the sequence with as much video data as possible
+    tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config)
+    video_feature_size = frames_per_video * tokens_per_frame
+    if isinstance(vision_config, CLIPVisionConfig):
+        seq_data = dummy_seq_data_for_clip(
+            vision_config,
+            seq_len,
+            num_videos,
+            image_token_id=hf_config.video_token_index,
+            image_feature_size_override=video_feature_size,
+        )
+        pil_frame = dummy_image_for_clip(vision_config, num_images=1)
+        np_frame = np.array(pil_frame["image"])
+        mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
+        mm_data = {"video": mm_data_per_video}
+        return seq_data, mm_data
+    elif isinstance(vision_config, SiglipVisionConfig):
+        seq_data = dummy_seq_data_for_siglip(
+            vision_config,
+            seq_len,
+            num_videos,
+            image_token_id=hf_config.video_token_index,
+            image_feature_size_override=video_feature_size,
+        )
+        pil_frame = dummy_image_for_siglip(vision_config, num_images=1)
+        np_frame = np.array(pil_frame["image"])
+        mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0)
+        mm_data = {"video": mm_data_per_video}
+        return seq_data, mm_data
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+def input_processor_for_llava_next_video(
+    ctx: InputContext, llm_inputs: LLMInputs
+):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or "video" not in multi_modal_data:
+        return llm_inputs
+    video_data = multi_modal_data["video"]
+    model_config = ctx.model_config
+    hf_config = ctx.get_hf_config(LlavaNextVideoConfig)
+    vision_config = hf_config.vision_config
+    if isinstance(video_data, np.ndarray):
+        # Supports both CLIP and Siglip
+        num_frames = video_data.shape[0]
+        frame_feature_size = get_llava_next_video_frame_feature_size(hf_config)
+        video_feature_size = num_frames * frame_feature_size
+        tokenizer = cached_get_tokenizer(model_config.tokenizer)
+        new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+            tokenizer,
+            llm_inputs.get("prompt"),
+            llm_inputs["prompt_token_ids"],
+            placeholder_token_id=hf_config.video_token_index,
+            repeat_count=video_feature_size,
+        )
+        return LLMInputs(
+            prompt_token_ids=new_token_ids,
+            prompt=new_prompt,
+            multi_modal_data=multi_modal_data,
+        )
+    elif is_list_of(video_data, np.ndarray):
+        raise NotImplementedError("Processing multiple videos is not supported")
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+def _init_vision_tower(hf_config: LlavaNextVideoConfig):
+    vision_config = hf_config.vision_config
+    # Initialize the vision tower only up to the required feature layer
+    vision_feature_layer = hf_config.vision_feature_layer
+    if vision_feature_layer < 0:
+        num_hidden_layers = (
+            hf_config.vision_config.num_hidden_layers + vision_feature_layer + 1
+        )
+    else:
+        num_hidden_layers = vision_feature_layer + 1
+    if isinstance(vision_config, CLIPVisionConfig):
+        return CLIPVisionModel(
+            vision_config,
+            num_hidden_layers_override=num_hidden_layers,
+        )
+    elif isinstance(vision_config, SiglipVisionConfig):
+        return SiglipVisionModel(
+            vision_config,
+            num_hidden_layers_override=num_hidden_layers,
+        )
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+# adapted from transformers modeling_llava_next_video.py
+class LlavaNextVideoPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        mode = config.spatial_pool_mode
+        stride = config.spatial_pool_stride
+        image_size = config.vision_config.image_size
+        patch_size = config.vision_config.patch_size
+        self.image_size = image_size // patch_size**2
+        if mode == "average":
+            self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
+        elif mode == "max":
+            self.pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
+        else:
+            # TODO: Support Conv2d pooling layer, need to load weights
+            raise ValueError(
+                f"Unknown pooling mode: {mode}. Expected [`average`, `max`]"
+            )
+
+    def forward(self, image_features):
+        ori_width = int(
+            math.sqrt(
+                image_features.shape[1] * self.image_size // self.image_size
+            )
+        )
+        ori_height = int(ori_width * self.image_size // self.image_size)
+        batch_size, _, dim = image_features.shape
+        image_features_spatial = image_features.view(
+            batch_size, ori_height, ori_height, dim
+        ).permute(0, 3, 1, 2)
+        image_features_spatial = self.pool(image_features_spatial)
+        return image_features_spatial.flatten(2).transpose(1, 2).contiguous()
+
+
+class LlavaNextMultiModalProjector(nn.Module):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        projector_hidden_act: str,
+    ):
+        super().__init__()
+        self.linear_1 = nn.Linear(
+            vision_hidden_size, text_hidden_size, bias=True
+        )
+        self.act = get_act_fn(projector_hidden_act)
+        self.linear_2 = nn.Linear(text_hidden_size, text_hidden_size, bias=True)
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_input_mapper("video")
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+    "video", get_max_llava_next_video_tokens
+)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video)
+class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal):
+    def __init__(
+        self,
+        config: LlavaNextVideoConfig,
+        multimodal_config: MultiModalConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.multimodal_config = multimodal_config
+        # Initialize the vision tower only up to the required feature layer
+        self.vision_tower = _init_vision_tower(config)
+        self.multi_modal_projector = LlavaNextMultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            projector_hidden_act=config.projector_hidden_act,
+        )
+        self.language_model = init_aphrodite_registered_model(
+            config.text_config, cache_config, quant_config
+        )
+        self.vision_resampler = LlavaNextVideoPooler(config)
+
+    def _validate_video_pixel_values(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        h = w = self.config.vision_config.image_size
+        expected_dims = (3, h, w)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape[2:])
+            if actual_dims != expected_dims:
+                expected_expr = ("num_frames", *map(str, expected_dims))
+                raise ValueError(
+                    "The expected shape of pixel values in each video frame "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}."
+                )
+
+        for d in data:
+            _validate_shape(d)
+        return data
+
+    def _parse_and_validate_video_input(
+        self, **kwargs: object
+    ) -> Optional[LlavaNextVideoPixelInputs]:
+        """
+        A legal video input should have the following dimensions:
+        {
+            "pixel_values_videos" :
+                List[b, Tensor(nb_frames, nb_channels, height, width)]
+        }
+        """
+        pixel_values = kwargs.pop("pixel_values_videos", None)
+        if pixel_values is None:
+            return None
+        if not (
+            is_list_of(pixel_values, (torch.Tensor))  # different shape videos
+            or isinstance(pixel_values, torch.Tensor)
+        ):  # same shape videos
+            raise ValueError(
+                "Incorrect type of pixel values. "
+                f"Got type: {type(pixel_values)}"
+            )
+        return LlavaNextVideoPixelInputs(
+            type="pixel_values_videos",
+            data=pixel_values,
+        )
+
+    def _select_image_features(
+        self, image_features: torch.Tensor, *, strategy: str
+    ) -> torch.Tensor:
+        if strategy == "default":
+            return image_features[:, 1:]
+        elif strategy == "full":
+            return image_features
+        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+
+    def _video_pixels_to_features(
+        self,
+        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the vision tower
+        image_features = vision_tower(pixel_values)
+        image_features = self._select_image_features(
+            image_features,
+            strategy=self.config.vision_feature_select_strategy,
+        )
+        image_features = self.vision_resampler(image_features)
+        image_features = self.multi_modal_projector(image_features)
+        return image_features
+
+    def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
+        assert self.vision_tower is not None
+        video_pixels = inputs["data"]
+        if isinstance(video_pixels, torch.Tensor):
+            # TODO: support multiple videos per input
+            b, num_videos, num_frames, c, h, w = video_pixels.shape
+            assert num_videos == 1
+            stacked_pixels = video_pixels.view(
+                b * num_videos * num_frames, c, h, w
+            )
+            stacked_embeddings = self._video_pixels_to_features(
+                self.vision_tower, stacked_pixels
+            )
+            return stacked_embeddings.view(
+                b, num_frames, *stacked_embeddings.shape[1:]
+            )
+        elif is_list_of(video_pixels, torch.Tensor):
+            frames_per_videos = [v.shape[0] for v in video_pixels]
+            stacked_pixels = torch.cat(video_pixels, dim=0)
+            stacked_embeddings = self._video_pixels_to_features(
+                self.vision_tower, stacked_pixels
+            )
+            return torch.split(stacked_embeddings, frames_per_videos, dim=0)
+        else:
+            raise ValueError(
+                f"Unsupported type of video input {type(video_pixels)}"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ) -> SamplerOutput:
+        """Run forward pass for LlaVA-NeXT-Video.
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            pixel_values_videos: Pixels in each frames for each input videos.
+        """
+        video_input = self._parse_and_validate_video_input(**kwargs)
+        # merge video embeddings into input embeddings
+        if video_input is not None:
+            video_embeddings = self._process_video_pixels(video_input)
+            inputs_embeds = self.language_model.model.get_input_embeddings(
+                input_ids
+            )
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                video_embeddings,
+                self.config.video_token_index,
+            )
+            input_ids = None
+        else:
+            inputs_embeds = None
+        hidden_states = self.language_model.model(
+            input_ids,
+            positions,
+            kv_caches,
+            attn_metadata,
+            None,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(
+            hidden_states, sampling_metadata
+        )
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        return self.language_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # prepare weight iterators
+        vit_weights, mlp_weights, newline_weights, llm_weights = itertools.tee(
+            weights, 4
+        )
+        # load vision encoder
+        vit_weights = filter_weights(vit_weights, "vision_tower")
+        self.vision_tower.load_weights(vit_weights)
+        # load mlp projector
+        mlp_weights = filter_weights(mlp_weights, "multi_modal_projector")
+        mlp_params_dict = dict(self.multi_modal_projector.named_parameters())
+        for name, loaded_weight in mlp_weights:
+            param = mlp_params_dict[name]
+            weight_loader = getattr(
+                param, "weight_loader", default_weight_loader
+            )
+            weight_loader(param, loaded_weight)
+        # load llm backbone
+        llm_weights = filter_weights(llm_weights, "language_model")
+        self.language_model.load_weights(llm_weights)
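Putting the pieces together, a sketch of offline inference with the new video modality; the prompt template and checkpoint are taken from the test added below, while the exact constructor arguments are assumptions:

from aphrodite import LLM, SamplingParams
from aphrodite.assets.video import VideoAsset

llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
          max_model_len=4096, enforce_eager=True)
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays
prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
outputs = llm.generate(
    [{"prompt": prompt, "multi_modal_data": {"video": video}}],
    sampling_params=SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)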

+ 2 - 1
aphrodite/multimodal/registry.py

@@ -10,6 +10,7 @@ from .audio import AudioPlugin
 from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
                    MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
 from .image import ImagePlugin
+from .video import VideoPlugin
 
 
 class _MultiModalLimits(UserDict):
@@ -34,7 +35,7 @@ class MultiModalRegistry:
     The registry handles both external and internal data input.
     """
 
-    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin())
+    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
 
     def __init__(
             self,

+ 40 - 1
aphrodite/multimodal/utils.py

@@ -1,10 +1,11 @@
 import base64
 from functools import lru_cache
 from io import BytesIO
-from typing import List, Optional, Tuple, TypeVar, Union
+from typing import Any, List, Optional, Tuple, TypeVar, Union
 
 import librosa
 import numpy as np
+import numpy.typing as npt
 import soundfile
 from loguru import logger
 from PIL import Image
@@ -172,6 +173,44 @@ def rescale_image_size(image: Image.Image,
     return image
 
 
+def try_import_video_packages() -> Any:
+    try:
+        import cv2
+    except ImportError:
+        raise ImportError(
+            "Please install opencv-python for video support."
+            ) from None
+    return cv2
+
+
+def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
+    cv2 = try_import_video_packages()
+    num_frames, _, _, channels = frames.shape
+    new_height, new_width = size
+    resized_frames = np.empty((num_frames, new_height, new_width, channels),
+                              dtype=frames.dtype)
+    for i, frame in enumerate(frames):
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        resized_frames[i] = resized_frame
+    return resized_frames
+
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    _, height, width, _ = frames.shape
+    new_height = int(height * size_factor)
+    new_width = int(width * size_factor)
+    return resize_video(frames, (new_height, new_width))
+
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    total_frames = frames.shape[0]
+    if num_frames == -1:
+        return frames
+    else:
+        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+        sampled_frames = frames[frame_indices, ...]
+        return sampled_frames
+
+
 # Utilities for input processors
 _T = TypeVar("_T", str, int)
 

+ 68 - 0
aphrodite/multimodal/video.py

@@ -0,0 +1,68 @@
+from functools import lru_cache
+from typing import List, Union
+
+import numpy as np
+from loguru import logger
+
+from aphrodite.common.config import ModelConfig
+from aphrodite.common.utils import is_list_of
+from aphrodite.inputs.registry import InputContext
+from aphrodite.transformers_utils.image_processor import get_video_processor
+from aphrodite.transformers_utils.tokenizer import get_tokenizer
+
+from .base import MultiModalData, MultiModalInputs
+from .image import ImagePlugin
+
+cached_get_video_processor = lru_cache(get_video_processor)
+cached_get_tokenizer = lru_cache(get_tokenizer)
+VideoInput = Union[
+    "np.ndarray",  # single video input
+    List["np.ndarray"],
+    # TODO: support more types
+    # List[Image.Image], List[List[Image.Image]],
+    # "torch.Tensor",
+    # List["torch.Tensor"],
+    # List[List["np.ndarrray"]],
+    # List[List["torch.Tensor"]],
+]
+
+
+class VideoPlugin(ImagePlugin):
+    """Plugin for video data."""
+
+    def get_data_key(self) -> str:
+        return "video"
+
+    def _get_hf_video_processor(self, model_config: ModelConfig):
+        return cached_get_video_processor(
+            model_config.model, trust_remote_code=model_config.trust_remote_code
+        )
+
+    def _default_input_mapper(
+        self,
+        ctx: InputContext,
+        data: MultiModalData[object],
+    ) -> MultiModalInputs:
+        model_config = ctx.model_config
+        # single video input as np.ndarray
+        if isinstance(data, np.ndarray):
+            video_processor = self._get_hf_video_processor(model_config)
+            if video_processor is None:
+                raise RuntimeError(
+                    "No HuggingFace processor is available "
+                    "to process the image object"
+                )
+            try:
+                batch_data = video_processor(data, return_tensors="pt").data
+            except Exception:
+                logger.error("Failed to process image (%s)", data)
+                raise
+            return MultiModalInputs(batch_data)
+        elif is_list_of(data, np.ndarray):
+            raise NotImplementedError(
+                "Multi video for a prompt is not supported yet"
+            )
+        raise TypeError(f"Invalid video type: {type(data)}")
+
+    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
+        return 4096
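For reference, the default input mapper defers to the model's HuggingFace video processor; a sketch of what that produces for LLaVA-NeXT-Video (exact keys and shapes depend on the installed transformers version):

import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
dummy_clip = np.zeros((16, 360, 640, 3), dtype=np.uint8)
batch = processor.video_processor(dummy_clip, return_tensors="pt").data
print({k: tuple(v.shape) for k, v in batch.items()})
# expected to include "pixel_values_videos" with shape (1, 16, 3, 336, 336)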

+ 25 - 0
aphrodite/transformers_utils/image_processor.py

@@ -1,6 +1,31 @@
 from typing import cast
 
 
+def get_video_processor(
+    processor_name: str,
+    trust_remote_code: bool = False,
+):
+    """
+    Gets a video processor for the given model name via HuggingFace.
+    """
+    from transformers import AutoProcessor
+    try:
+        processor = AutoProcessor.from_pretrained(processor_name)
+        video_processor = processor.video_processor
+    except ValueError as e:
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the processor. If the processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    return video_processor
+
+
 def get_image_processor(
     processor_name: str,
     *args,

+ 1 - 0
requirements-common.txt

@@ -34,3 +34,4 @@ pandas
 msgspec
 python-multipart
 partial-json-parser
+opencv-python

+ 48 - 3
tests/conftest.py

@@ -22,6 +22,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
 
 from aphrodite import LLM, SamplingParams
 from aphrodite.assets.image import ImageAsset
+from aphrodite.assets.video import VideoAsset
 from aphrodite.common.config import TokenizerPoolConfig
 from aphrodite.common.outputs import RequestOutput
 from aphrodite.common.sequence import SampleLogprobs
@@ -43,6 +44,7 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
 PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
                          List[List[Tuple[np.ndarray, int]]]]
+PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
 
 
 def _read_prompts(filename: str) -> List[str]:
@@ -84,8 +86,28 @@ class _ImageAssets(_ImageAssetsBase):
         return [prompts["stop_sign"], prompts["cherry_blossom"]]
 
 
+class _VideoAssetPrompts(TypedDict):
+    sample_demo_1: str
+
+
+if sys.version_info < (3, 9):
+    # UserList cannot be subscripted
+    class _VideoAssetsBase(UserList):
+        pass
+else:
+    class _VideoAssetsBase(UserList[VideoAsset]):
+        pass
+
+
+class _VideoAssets(_VideoAssetsBase):
+    def __init__(self) -> None:
+        super().__init__([
+            VideoAsset("sample_demo_1.mp4"),
+        ])
+
+    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
+        return [prompts["sample_demo_1"]]
+
+
 IMAGE_ASSETS = _ImageAssets()
 """Singleton instance of :class:`_ImageAssets`."""
+VIDEO_ASSETS = _VideoAssets()
+"""Singleton instance of :class:`_VideoAssets`."""
 
 
 @pytest.fixture(autouse=True)
@@ -201,6 +223,11 @@ def image_assets() -> _ImageAssets:
     return IMAGE_ASSETS
 
 
+@pytest.fixture(scope="session")
+def video_assets() -> _VideoAssets:
+    return VIDEO_ASSETS
+
+
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
 
 
@@ -268,8 +295,8 @@ class HfRunner:
             )
         except Exception as exc:
             logger.warning(
-                "Unable to auto-load HuggingFace processor for model (%s). "
-                "Using tokenizer instead. Reason: %s", model_name, exc)
+                f"Unable to auto-load HuggingFace processor for model "
+                f"({model_name}). Using tokenizer instead. Reason: {exc}")
             self.processor = self.tokenizer
 
         self.postprocess_inputs = postprocess_inputs
@@ -278,6 +305,7 @@ class HfRunner:
         self,
         prompts: List[str],
         images: Optional[PromptImageInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images:
@@ -291,6 +319,8 @@ class HfRunner:
             }
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]
+            if videos is not None and videos[i] is not None:
+                processor_kwargs["videos"] = videos[i]
 
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)
@@ -351,6 +381,7 @@ class HfRunner:
         prompts: List[str],
         max_tokens: int,
         images: Optional[PromptImageInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
         **kwargs: Any,
     ) -> List[List[torch.Tensor]]:
         all_logprobs: List[List[torch.Tensor]] = []
@@ -361,6 +392,8 @@ class HfRunner:
             }
             if images is not None and images[i] is not None:
                 processor_kwargs["images"] = images[i]
+            if videos is not None and videos[i] is not None:
+                processor_kwargs["videos"] = videos[i]
 
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)
@@ -434,6 +467,7 @@ class HfRunner:
         num_logprobs: int,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        videos: Optional[List[np.ndarray]] = None,
         **kwargs: Any,
     ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
         all_logprobs: List[List[Dict[int, float]]] = []
@@ -453,6 +487,8 @@ class HfRunner:
                 processor_kwargs["audio"] = audio
                 processor_kwargs["sampling_rate"] = sr
 
+            if videos is not None:
+                processor_kwargs["videos"] = videos[i]
             inputs = self.processor(**processor_kwargs)
             inputs = self.postprocess_inputs(inputs)
 
@@ -633,12 +669,15 @@ class AphroditeRunner:
         sampling_params: SamplingParams,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        videos: Optional[PromptVideoInput] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None
 
         if images is not None:
             assert len(prompts) == len(images)
 
+        if videos is not None:
+            assert len(prompts) == len(videos)
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
@@ -648,6 +687,10 @@ class AphroditeRunner:
             for i, audio in enumerate(audios):
                 inputs[i]["multi_modal_data"] = {"audio": audio}
 
+        if videos is not None:
+            for i, video in enumerate(videos):
+                inputs[i]["multi_modal_data"] = {"video": video}
+        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
         return self._final_steps_generate_w_logprobs(req_outputs)
@@ -684,6 +727,7 @@ class AphroditeRunner:
         num_logprobs: int,
         images: Optional[PromptImageInput] = None,
         audios: Optional[PromptAudioInput] = None,
+        videos: Optional[PromptVideoInput] = None,
         stop_token_ids: Optional[List[int]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
@@ -693,7 +737,8 @@ class AphroditeRunner:
         outputs = self.generate_w_logprobs(prompts,
                                            greedy_logprobs_params,
                                            images=images,
-                                           audios=audios)
+                                           audios=audios,
+                                           videos=videos)
 
         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]

+ 256 - 0
tests/models/test_llava_next_video.py

@@ -0,0 +1,256 @@
+from typing import List, Optional, Tuple, Type, overload
+
+import pytest
+import transformers
+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
+
+from aphrodite.common.sequence import SampleLogprobs
+from aphrodite.multimodal.utils import (rescale_video_size, resize_video,
+                                        sample_frames_from_video)
+
+from ..conftest import VIDEO_ASSETS, AphroditeRunner, HfRunner, _VideoAssets
+from .utils import check_logprobs_close
+
+pytestmark = pytest.mark.vlm
+_PREFACE = (
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's "
+    "questions."
+)
+HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
+    {
+        "sample_demo_1": f"{_PREFACE}USER: <video>\nWhy is this video funny? "
+        "ASSISTANT:"
+    }
+)
+models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
+
+
+def aphrodite_to_hf_output(
+    aphrodite_output: Tuple[List[int],
+                            str, Optional[SampleLogprobs]], model: str
+):
+    """Sanitize aphrodite output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = aphrodite_output
+    config = AutoConfig.from_pretrained(model)
+    video_token_id = config.video_token_index
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    hf_output_ids = [
+        token_id
+        for idx, token_id in enumerate(output_ids)
+        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
+    ]
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    aphrodite_runner: Type[AphroditeRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    aphrodite_runner: Type[AphroditeRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    sizes: List[Tuple[int, int]],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    aphrodite_runner: Type[AphroditeRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    size_factors: Optional[List[float]] = None,
+    sizes: Optional[List[Tuple[int, int]]] = None,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    videos = [
+        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        for asset in video_assets
+    ]
+    for video in videos:
+        print(video.shape)
+    if size_factors is not None:
+        inputs_per_video = [
+            (
+                [prompt for _ in size_factors],
+                [rescale_video_size(video, factor) for factor in size_factors],
+            )
+            for video, prompt in zip(videos, HF_VIDEO_PROMPTS)
+        ]
+    elif sizes is not None:
+        inputs_per_video = [
+            (
+                [prompt for _ in sizes],
+                [resize_video(video, size) for size in sizes],
+            )
+            for video, prompt in zip(videos, HF_VIDEO_PROMPTS)
+        ]
+    else:
+        raise ValueError("You must provide either `size_factors` or `sizes`")
+    # max_model_len should be greater than the video feature size
+    with aphrodite_runner(
+        model,
+        dtype=dtype,
+        max_model_len=4096,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+    ) as aphrodite_model:
+        aphrodite_outputs_per_video = [
+            aphrodite_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, videos=videos
+            )
+            for prompts, videos in inputs_per_video
+        ]
+    with hf_runner(
+        model, dtype=dtype, auto_cls=AutoModelForVision2Seq
+    ) as hf_model:
+        hf_outputs_per_video = [
+            hf_model.generate_greedy_logprobs_limit(
+                prompts, max_tokens, num_logprobs=num_logprobs, videos=videos
+            )
+            for prompts, videos in inputs_per_video
+        ]
+    for hf_outputs, aphrodite_outputs in zip(
+        hf_outputs_per_video, aphrodite_outputs_per_video
+    ):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                aphrodite_to_hf_output(aphrodite_output, model)
+                for aphrodite_output in aphrodite_outputs
+            ],
+            name_0="hf",
+            name_1="aphrodite",
+        )
+
+
+@pytest.mark.skipif(
+    transformers.__version__ < "4.45",
+    reason="Waiting for next transformers release",
+)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No video
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_frames", [16])
+def test_models(
+    hf_runner,
+    aphrodite_runner,
+    video_assets,
+    model,
+    size_factors,
+    dtype,
+    max_tokens,
+    num_logprobs,
+    num_frames,
+) -> None:
+    """Inference result should be the same between hf and aphrodite.
+    All the video fixtures for this test are under tests/videos.
+    For the huggingface runner, we provide the np.ndarray as input.
+    For the aphrodite runner, we provide MultiModalDataDict objects
+    and the corresponding MultiModalConfig as input.
+    Note that the text input is also adjusted to abide by the aphrodite
+    contract. The text output is sanitized to be comparable with hf.
+    """
+    run_test(
+        hf_runner,
+        aphrodite_runner,
+        video_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        num_frames=num_frames,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.skipif(
+    transformers.__version__ < "4.45",
+    reason="Waiting for next transformers release",
+)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "sizes",
+    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_frames", [16])
+def test_models_fixed_sizes(
+    hf_runner,
+    aphrodite_runner,
+    video_assets,
+    model,
+    sizes,
+    dtype,
+    max_tokens,
+    num_logprobs,
+    num_frames,
+) -> None:
+    run_test(
+        hf_runner,
+        aphrodite_runner,
+        video_assets,
+        model,
+        sizes=sizes,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        num_frames=num_frames,
+        tensor_parallel_size=1,
+    )