
vlm: add support for Qwen2-VL model (#1015)

AlpinDale, 2 months ago
parent commit 411ac4f405

+ 7 - 4
aphrodite/assets/video.py

@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import List, Literal
+from typing import List, Optional
 
 import numpy as np
 import numpy.typing as npt
@@ -68,17 +68,20 @@ def video_to_pil_images_list(
 
 @dataclass(frozen=True)
 class VideoAsset:
-    name: Literal["sample_demo_1.mp4"]
+    name: str = "sample_demo_1.mp4"
     num_frames: int = -1
+    local_path: Optional[str] = None
 
     @property
     def pil_images(self) -> List[Image.Image]:
-        video_path = download_video_asset(self.name)
+        video_path = (self.local_path if self.local_path else
+                      download_video_asset(self.name))
         ret = video_to_pil_images_list(video_path, self.num_frames)
         return ret
 
     @property
     def np_ndarrays(self) -> List[npt.NDArray]:
-        video_path = download_video_asset(self.name)
+        video_path = (self.local_path if self.local_path else
+                      download_video_asset(self.name))
         ret = video_to_ndarrays(video_path, self.num_frames)
         return ret

+ 5 - 2
aphrodite/common/config.py

@@ -1911,8 +1911,11 @@ def _get_and_verify_max_len(
                     "Disabling sliding window is not supported for models "
                     "with rope_scaling. Please raise an issue so we can "
                     "investigate.")
-            assert "factor" in rope_scaling
-            scaling_factor = rope_scaling["factor"]
+            if rope_type == "mrope":
+                scaling_factor = 1
+            else:
+                assert "factor" in rope_scaling
+                scaling_factor = rope_scaling["factor"]
             if rope_type == "yarn":
                 derived_max_model_len = rope_scaling[
                     "original_max_position_embeddings"]

+ 11 - 0
aphrodite/common/sequence.py

@@ -155,6 +155,9 @@ class SequenceData(msgspec.Struct,
     # is called.
     _new_appended_tokens: List[int] = msgspec.field(default_factory=list)
 
+    # It is used to compute mrope_position_ids.
+    _mrope_position_delta: Optional[int] = None
+
     def __post_init__(self) -> None:
         assert self._prompt_token_ids.typecode == "l"
         assert self._output_token_ids.typecode == "l"
@@ -209,6 +212,14 @@ class SequenceData(msgspec.Struct,
         assert isinstance(self._output_token_ids, array)
         return self._output_token_ids
 
+    @property
+    def mrope_position_delta(self) -> Optional[int]:
+        return self._mrope_position_delta
+
+    @mrope_position_delta.setter
+    def mrope_position_delta(self, new_mrope_position_delta):
+        self._mrope_position_delta = new_mrope_position_delta
+
     def append_token_id(self, token_id: int, logprob: float) -> None:
         self._output_token_ids.append(token_id)
         self._new_appended_tokens.append(token_id)
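
As a minimal sketch (values are illustrative), the new property simply stores the per-sequence delta so decode steps can reconstruct mrope positions later:

from array import array
from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
                                       SequenceData)

seq_data = SequenceData(array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [1, 2, 3]))
assert seq_data.mrope_position_delta is None

# Typically set from MRotaryEmbedding.get_input_positions(...) during prefill.
seq_data.mrope_position_delta = -8
assert seq_data.mrope_position_delta == -8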

+ 7 - 1
aphrodite/endpoints/chat_utils.py

@@ -101,7 +101,7 @@ class ConversationMessage(TypedDict, total=False):
     """The tool calls generated by the model, such as function calls."""
 
 
-ModalityStr = Literal["image", "audio"]
+ModalityStr = Literal["image", "audio", "video"]
 _T = TypeVar("_T")
 
 
@@ -148,12 +148,18 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                                               hf_config.image_token_index)
             if model_type in ("chameleon", "internvl_chat"):
                 return "<image>"
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|image_pad|><|vision_end|>"
 
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|reserved_special_token_0|>"
             raise TypeError(f"Unknown model type: {model_type}")
+        elif modality == "video":
+            if model_type == "qwen2_vl":
+                return "<|vision_start|><|video_pad|><|vision_end|>"
+            raise TypeError(f"Unknown model type: {model_type}")
         else:
             raise TypeError(f"Unknown modality: {modality}")
 

+ 210 - 84
aphrodite/modeling/layers/rotary_embedding.py

@@ -29,7 +29,6 @@ import torch
 import torch.nn as nn
 
 from aphrodite.modeling._custom_op import CustomOp
-from aphrodite.platforms import current_platform
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -45,26 +44,33 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
     return x.flatten(-2)
 
 
-# for TPUs
 def _apply_rotary_emb(
     x: torch.Tensor,
     cos: torch.Tensor,
     sin: torch.Tensor,
+    is_neox_style: bool,
 ) -> torch.Tensor:
     """
     Args:
         x: [num_tokens, num_heads, head_size]
         cos: [num_tokens, head_size // 2]
         sin: [num_tokens, head_size // 2]
+        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
+            positional embeddings.
     """
-    orig_dtype = x.dtype
-    x = x.float()
-    x1, x2 = torch.chunk(x, 2, dim=-1)
-    cos = cos.unsqueeze(-2)
-    sin = sin.unsqueeze(-2)
+    cos = cos.unsqueeze(-2).to(x.dtype)
+    sin = sin.unsqueeze(-2).to(x.dtype)
+    if is_neox_style:
+        x1, x2 = torch.chunk(x, 2, dim=-1)
+    else:
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
     o1 = x1 * cos - x2 * sin
     o2 = x2 * cos + x1 * sin
-    return torch.cat((o1, o2), dim=-1).to(orig_dtype)
+    if is_neox_style:
+        return torch.cat((o1, o2), dim=-1)
+    else:
+        return torch.stack((o1, o2), dim=-1).flatten(-2)
 
 
 class RotaryEmbedding(CustomOp):
@@ -89,16 +95,11 @@ class RotaryEmbedding(CustomOp):
 
         cache = self._compute_cos_sin_cache()
         cache = cache.to(dtype)
+        self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
-        self.use_native2 = current_platform.is_tpu() and is_neox_style
 
     def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
         """Compute the inverse frequency."""
-        # NOTE: The HF implementation uses `torch.arange(...).float()`.
-        # However, we use `torch.arange(..., dtype=torch.float)` instead to
-        # avoid numerical issues with large base values (e.g., 10000000).
-        # This may cause a slight numerical difference between the HF
-        # implementation and ours.
         # NOTE: To exactly match the HF implementation, we need to
         # use CPU to compute the cache and then move it to GPU. However, we
         # create the cache on GPU for faster initialization. This may cause
@@ -125,58 +126,7 @@ class RotaryEmbedding(CustomOp):
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """A PyTorch-native implementation equivalent to forward().
-
-        This method mimics the implementation of the custom CUDA kernel
-        used in `forward_cuda()`.
-        """
-        query = query.view(*query.shape[:-1], -1, self.head_size)
-        key = key.view(*key.shape[:-1], -1, self.head_size)
-
-        query_rot = query[..., :self.rotary_dim]
-        key_rot = key[..., :self.rotary_dim]
-        if self.rotary_dim < self.head_size:
-            query_pass = query[..., self.rotary_dim:]
-            key_pass = key[..., self.rotary_dim:]
-
-        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
-            positions.device, dtype=query.dtype)
-        cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
-                                     if offsets is not None else positions]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-        if self.is_neox_style:
-            # NOTE: Here we assume that the positions tensor has the
-            # shape [batch_size, seq_len].
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-
-        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
-        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
-        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
-
-        if self.rotary_dim < self.head_size:
-            query = torch.cat((query_rot, query_pass), dim=-1)
-            key = torch.cat((key_rot, key_pass), dim=-1)
-        else:
-            query = query_rot
-            key = key_rot
-        query = query.flatten(-2)
-        key = key.flatten(-2)
-        return query, key
-
-    def forward_native2(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Another PyTorch-native implementation of forward().
-        This method might perform better than `forward_native()` when compiled.
-        """
+        """A PyTorch-native implementation of forward()."""
         if offsets is not None:
             positions = positions + offsets
         positions = positions.flatten()
@@ -188,14 +138,14 @@ class RotaryEmbedding(CustomOp):
         query = query.view(num_tokens, -1, self.head_size)
         query_rot = query[..., :self.rotary_dim]
         query_pass = query[..., self.rotary_dim:]
-        query_rot = _apply_rotary_emb(query_rot, cos, sin)
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
 
         key_shape = key.shape
         key = key.view(num_tokens, -1, self.head_size)
         key_rot = key[..., :self.rotary_dim]
         key_pass = key[..., self.rotary_dim:]
-        key_rot = _apply_rotary_emb(key_rot, cos, sin)
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
@@ -208,7 +158,7 @@ class RotaryEmbedding(CustomOp):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         from aphrodite import _custom_ops as ops
 
-        self.cos_sin_cache = self.cos_sin_cache.to(positions.device,
+        self.cos_sin_cache = self.cos_sin_cache.to(query.device,
                                                    dtype=query.dtype)
         # ops.rotary_embedding()/batched_rotary_embedding()
         # are in-place operations that update the query and key tensors.
@@ -245,17 +195,6 @@ class RotaryEmbedding(CustomOp):
                                  self.cos_sin_cache, self.is_neox_style)
         return query, key
 
-    def forward_tpu(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        forward_fn = (self.forward_native2
-                      if self.use_native2 else self.forward_native)
-        return forward_fn(positions, query, key, offsets)
-
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"
@@ -541,6 +480,7 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
             short_mscale = scaling_factor
         if long_mscale is None:
             long_mscale = scaling_factor
+
         self.short_mscale = short_mscale
         self.long_mscale = long_mscale
 
@@ -738,6 +678,7 @@ class GemmaRotaryEmbedding(RotaryEmbedding):
 
 
 class Llama3RotaryEmbedding(RotaryEmbedding):
+
     def __init__(
         self,
         head_size: int,
@@ -762,6 +703,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         inv_freqs = super()._compute_inv_freq(base)
         low_freq_wavelen = self.orig_max_position / self.low_freq_factor
         high_freq_wavelen = self.orig_max_position / self.high_freq_factor
+
         wave_len = 2 * math.pi / inv_freqs
         if self.low_freq_factor != self.high_freq_factor:
             smooth = (self.orig_max_position / wave_len - self.low_freq_factor
@@ -781,6 +723,179 @@ class Llama3RotaryEmbedding(RotaryEmbedding):
         return new_freqs
 
 
+class MRotaryEmbedding(RotaryEmbedding):
+    """Rotary Embedding with Multimodal Sections."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        mrope_section: Optional[List[int]] = None,
+    ) -> None:
+        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
+                         is_neox_style, dtype)
+
+        self.mrope_section = mrope_section
+        if self.mrope_section:
+            assert sum(self.mrope_section) == rotary_dim // 2
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward().
+
+        Args:
+            positions:
+                [num_tokens,] (text only) or
+                [3, num_tokens] (T/H/W positions with multimodal inputs)
+            query: [num_tokens, num_heads * head_size]
+            key: [num_tokens, num_kv_heads * head_size]
+        """
+        assert positions.ndim == 1 or positions.ndim == 2
+
+        num_tokens = positions.shape[-1]
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if positions.ndim == 2:
+            assert self.mrope_section
+
+            cos = torch.cat([
+                m[i]
+                for i, m in enumerate(cos.split(self.mrope_section, dim=-1))
+            ],
+                            dim=-1)
+            sin = torch.cat([
+                m[i]
+                for i, m in enumerate(sin.split(self.mrope_section, dim=-1))
+            ],
+                            dim=-1)
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., :self.rotary_dim]
+        query_pass = query[..., self.rotary_dim:]
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., :self.rotary_dim]
+        key_pass = key[..., self.rotary_dim:]
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    @staticmethod
+    def get_input_positions(
+        input_tokens: List[int],
+        image_grid_thw: Union[List[List[int]], torch.Tensor],
+        video_grid_thw: Union[List[List[int]], torch.Tensor],
+        image_token_id: int,
+        video_token_id: int,
+        vision_start_token_id: int,
+        vision_end_token_id: int,
+        spatial_merge_size: int,
+        context_len: int = 0,
+    ) -> Tuple[List[List[int]], int]:
+        """Get mrope input positions and delta value."""
+
+        if isinstance(image_grid_thw, torch.Tensor):
+            image_grid_thw = image_grid_thw.tolist()
+        if isinstance(video_grid_thw, torch.Tensor):
+            video_grid_thw = video_grid_thw.tolist()
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            if image_token_id in input_tokens and remain_images > 0:
+                ed_image = input_tokens.index(image_token_id, st)
+            else:
+                ed_image = len(input_tokens) + 1
+            if video_token_id in input_tokens and remain_videos > 0:
+                ed_video = input_tokens.index(video_token_id, st)
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+            llm_grid_t, llm_grid_h, llm_grid_w = \
+                t, h // spatial_merge_size, w // spatial_merge_size
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w).flatten()
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
+                llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
+                llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        llm_positions = llm_positions[:, context_len:]
+        mrope_position_delta = (llm_positions.max() + 1 -
+                                len(input_tokens)).item()
+
+        return llm_positions.tolist(), mrope_position_delta
+
+    @staticmethod
+    def get_next_input_positions(
+        mrope_position_delta: int,
+        context_len: int,
+        seq_len: int,
+    ) -> List[List[int]]:
+        return [
+            list(
+                range(context_len + mrope_position_delta,
+                      seq_len + mrope_position_delta)) for _ in range(3)
+        ]
+
+
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
 
@@ -792,7 +907,7 @@ def get_rope(
     is_neox_style: bool = True,
     rope_scaling: Optional[Dict[str, Any]] = None,
     dtype: Optional[torch.dtype] = None,
-    rotary_percent: float = 1.0,
+    partial_rotary_factor: float = 1.0,
 ) -> RotaryEmbedding:
     if dtype is None:
         dtype = torch.get_default_dtype()
@@ -805,12 +920,13 @@ def get_rope(
         rope_scaling_args = tuple(rope_scaling_tuple.items())
     else:
         rope_scaling_args = None
-    if rotary_percent < 1.0:
-        rotary_dim = int(rotary_dim * rotary_percent)
+    if partial_rotary_factor < 1.0:
+        rotary_dim = int(rotary_dim * partial_rotary_factor)
     key = (head_size, rotary_dim, max_position, base, is_neox_style,
            rope_scaling_args, dtype)
     if key in _ROPE_DICT:
         return _ROPE_DICT[key]
+
     if rope_scaling is None:
         rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                                      is_neox_style, dtype)
@@ -820,7 +936,7 @@ def get_rope(
         # The correct one should be "longrope" but keep "su" here
         # for backward compatible
         if scaling_type not in {"su", "longrope"}:
-            scaling_factor = rope_scaling["factor"]
+            scaling_factor = rope_scaling.get("factor", 1.0)
         if scaling_type == "llama3":
             low_freq_factor = rope_scaling["low_freq_factor"]
             high_freq_factor = rope_scaling["high_freq_factor"]
@@ -884,6 +1000,16 @@ def get_rope(
                 head_size, rotary_dim, max_position, original_max_position,
                 base, is_neox_style, dtype, short_factor, long_factor,
                 **extra_kwargs)
+        elif scaling_type == "mrope":
+            return MRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                dtype,
+                mrope_section=rope_scaling["mrope_section"],
+            )
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
     _ROPE_DICT[key] = rotary_emb
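
A minimal usage sketch, assuming a Qwen2-VL-style `rope_scaling` dict (the exact key holding the scaling type may differ between config versions, and the numbers are illustrative):

import torch
from aphrodite.modeling.layers.rotary_embedding import (MRotaryEmbedding,
                                                        get_rope)

rope = get_rope(
    head_size=128,
    rotary_dim=128,
    max_position=32768,
    base=1000000,
    is_neox_style=True,
    rope_scaling={"type": "mrope", "mrope_section": [16, 24, 24]},
    dtype=torch.bfloat16,
)
assert isinstance(rope, MRotaryEmbedding)

# Decode positions are plain ranges shifted by the per-sequence delta:
next_positions = MRotaryEmbedding.get_next_input_positions(
    mrope_position_delta=-8, context_len=100, seq_len=101)
# -> [[92], [92], [92]]  (one row each for the T/H/W axes)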

+ 5 - 1
aphrodite/modeling/models/__init__.py

@@ -48,9 +48,10 @@ _GENERATION_MODELS = {
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
-    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
+    "Qwen2VLForConditionalGeneration":
+    ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
     "RWForCausalLM": ("falcon", "FalconForCausalLM"),
     "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
     "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
@@ -91,6 +92,9 @@ _MULTIMODAL_MODELS = {
                                           "PaliGemmaForConditionalGeneration"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "UltravoxModel": ("ultravox", "UltravoxModel"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
+    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
+                                        "Qwen2VLForConditionalGeneration"),
 }
 
 _CONDITIONAL_GENERATION_MODELS = {
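
A rough sketch of how such `(module, class)` registry entries are resolved; the helper function here is hypothetical, only the registered tuple comes from the diff:

from importlib import import_module

def _resolve_model_cls(module_suffix: str, class_name: str):
    # Lazily import the model module and fetch the architecture class.
    module = import_module(f"aphrodite.modeling.models.{module_suffix}")
    return getattr(module, class_name)

qwen2_vl_cls = _resolve_model_cls("qwen2_vl", "Qwen2VLForConditionalGeneration")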

+ 1 - 1
aphrodite/modeling/models/granite.py

@@ -26,6 +26,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
+from transformers import GraniteConfig
 
 from aphrodite.attention import Attention, AttentionMetadata
 from aphrodite.common.config import CacheConfig, LoRAConfig
@@ -50,7 +51,6 @@ from aphrodite.modeling.sampling_metadata import SamplingMetadata
 from aphrodite.quantization.base_config import QuantizationConfig
 from aphrodite.quantization.compressed_tensors.utils import (
     get_compressed_tensors_cache_scale)
-from aphrodite.transformers_utils.configs.granite import GraniteConfig
 
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers

+ 1129 - 0
aphrodite/modeling/models/qwen2_vl.py

@@ -0,0 +1,1129 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The PygmalionAI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+from array import array
+from functools import lru_cache, partial
+from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
+                    Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from loguru import logger
+from PIL import Image
+from transformers.image_utils import (get_image_size,
+                                      infer_channel_dimension_format,
+                                      to_numpy_array)
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
+    make_batched_images, make_batched_videos, smart_resize)
+
+import aphrodite.common.envs as envs
+from aphrodite.attention import AttentionMetadata
+from aphrodite.attention.selector import (_Backend, backend_name_to_enum,
+                                          get_global_forced_attn_backend)
+from aphrodite.common.config import CacheConfig, MultiModalConfig
+from aphrodite.common.logger import log_once
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       IntermediateTensors, SequenceData)
+from aphrodite.distributed import parallel_state
+from aphrodite.distributed import utils as dist_utils
+from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from aphrodite.modeling.layers.activation import QuickGELU
+from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
+                                              RowParallelLinear)
+from aphrodite.modeling.layers.logits_processor import LogitsProcessor
+from aphrodite.modeling.layers.sampler import Sampler, SamplerOutput
+from aphrodite.modeling.layers.vocab_parallel_embedding import ParallelLMHead
+from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
+from aphrodite.modeling.models.interfaces import SupportsMultiModal
+from aphrodite.modeling.models.qwen2 import Qwen2Model
+from aphrodite.modeling.sampling_metadata import SamplingMetadata
+from aphrodite.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
+                                  MultiModalInputs)
+from aphrodite.multimodal.base import MultiModalData
+from aphrodite.multimodal.image import cached_get_image_processor
+from aphrodite.platforms import current_platform
+from aphrodite.quantization import QuantizationConfig
+from aphrodite.transformers_utils.configs import (Qwen2VLConfig,
+                                                  Qwen2VLVisionConfig)
+from aphrodite.transformers_utils.processor import get_processor
+
+
+# === Vision Inputs === #
+class Qwen2VLImageInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: 
+    `(num_patches, num_channels * patch_size * patch_size)`
+    """
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+    
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+class Qwen2VLVideoInputs(TypedDict):
+    pixel_values_videos: torch.Tensor
+    """Shape: 
+    `(num_patches, 
+      num_channels * temporal_patch_size * patch_size * patch_size)`
+    """
+    video_grid_thw: torch.Tensor
+    """Shape: `(num_videos, 3)`
+    
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+# === Vision Encoder === #
+class Qwen2VisionMLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            in_features, hidden_features, quant_config=quant_config
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            hidden_features, in_features, quant_config=quant_config
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+    if not interleaved:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+    else:
+        x1, x2 = x[..., ::2], x[..., 1::2]
+        return rearrange(
+            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+        )
+
+
+def apply_rotary_emb_torch(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    interleaved: bool = False,
+) -> torch.Tensor:
+    """
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1]
+    cos = repeat(
+        cos,
+        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)",
+    )
+    sin = repeat(
+        sin,
+        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)",
+    )
+    return torch.cat(
+        [
+            x[..., :ro_dim] * cos
+            + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
+        ],
+        dim=-1,
+    )
+
+
+def apply_rotary_pos_emb_vision(
+    t: torch.Tensor, freqs: torch.Tensor
+) -> torch.Tensor:
+    t_ = t.float()
+    cos = freqs.cos()
+    sin = freqs.sin()
+    output = apply_rotary_emb_torch(t_, cos, sin).type_as(t)
+    return output
+
+
+class Qwen2VisionAttention(nn.Module):
+    def __init__(
+        self,
+        embed_dim: Optional[int] = None,
+        num_heads: Optional[int] = None,
+        projection_size: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        # Per attention head and per partition values.
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, world_size
+        )
+        self.qkv = ColumnParallelLinear(
+            input_size=embed_dim,
+            output_size=3 * projection_size,
+            quant_config=quant_config,
+        )
+        self.proj = RowParallelLinear(
+            input_size=projection_size,
+            output_size=embed_dim,
+            quant_config=quant_config,
+        )
+        # Detect attention implementation.
+        selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
+        if selected_backend is None:
+            backend_by_env_var: Optional[str] = envs.APHRODITE_ATTENTION_BACKEND
+            if backend_by_env_var is not None:
+                selected_backend = backend_name_to_enum(backend_by_env_var)
+        if selected_backend is None:
+            # For Volta and Turing GPUs, use xformers instead.
+            device_available = current_platform.get_device_capability()[0] >= 8
+            if device_available:
+                from transformers.utils import is_flash_attn_2_available
+
+                if is_flash_attn_2_available():
+                    self._use_flash_attn = True
+                else:
+                    log_once(
+                        level="WARNING",
+                        message=
+                        "The current Qwen2-VL implementation has a bug with "
+                        "`aphrodite-flash-attn` inside the vision module, so "
+                        "the xformers backend is used instead. You can run "
+                        "`pip install flash-attn` to use the flash-attention "
+                        "backend.")
+                    self._use_flash_attn = False
+            else:
+                self._use_flash_attn = False
+        else:
+            if selected_backend == _Backend.FLASH_ATTN:
+                self._use_flash_attn = True
+            elif selected_backend == _Backend.XFORMERS:
+                self._use_flash_attn = False
+            else:
+                raise RuntimeError(
+                    f"Qwen2-VL does not support {selected_backend} backend now."
+                )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        # [s, b, c] --> [s, b, head * 3 * head_dim]
+        x, _ = self.qkv(x)
+        # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
+        new_x_shape = x.size()[:-1] + (
+            self.num_attention_heads_per_partition,
+            3 * self.hidden_size_per_attention_head,
+        )
+        x = x.view(*new_x_shape)
+        # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
+        q, k, v = dist_utils.split_tensor_along_last_dim(x, 3)
+        batch_size = q.shape[1]
+        q, k, v = [
+            rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+        ]
+        if rotary_pos_emb is not None:
+            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
+            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+        if self._use_flash_attn:
+            # from aphrodite_flash_attn.flash_attn_interface import (
+            #   flash_attn_varlen_func)
+            from flash_attn import flash_attn_varlen_func
+
+            q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            output = flash_attn_varlen_func(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+                dropout_p=0,
+                causal=False,
+            )
+            context_layer = rearrange(
+                output, "(b s) ... -> b s ...", b=batch_size
+            )
+        else:
+            from xformers import ops as xops
+            from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            attn_bias = BlockDiagonalMask.from_seqlens(
+                q_seqlen=seqlens, kv_seqlen=None
+            )
+            context_layer = xops.memory_efficient_attention_forward(
+                q, k, v, attn_bias=attn_bias, p=0, scale=None
+            )
+        context_layer = rearrange(
+            context_layer, "b s h d -> s b (h d)"
+        ).contiguous()
+        output, _ = self.proj(context_layer)
+        return output
+
+
+class Qwen2VisionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.attn = Qwen2VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            quant_config=quant_config,
+        )
+        self.mlp = Qwen2VisionMLP(
+            dim, mlp_hidden_dim, act_layer=act_layer, quant_config=quant_config
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor,
+    ) -> torch.Tensor:
+        x = x + self.attn(
+            self.norm1(x), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
+        )
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Qwen2VisionPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_chans: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.embed_dim = embed_dim
+        kernel_size = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(
+            in_chans,
+            embed_dim,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=False,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        L, C = x.shape
+        x = x.view(
+            L, -1, self.temporal_patch_size, self.patch_size, self.patch_size
+        )
+        x = self.proj(x).view(L, self.embed_dim)
+        return x
+
+
+class Qwen2VisionPatchMerger(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Type[nn.Module] = None,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.ln_q = norm_layer(context_dim)
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    d_model,
+                    bias=True,
+                    quant_config=quant_config,
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ln_q(x)
+        x = x.view(-1, self.hidden_size)
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2VisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0,
+                        self.dim,
+                        2,
+                        dtype=torch.float,
+                        device=self.inv_freq.device,
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Qwen2VisionTransformer(nn.Module):
+    def __init__(
+        self,
+        vision_config: Qwen2VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        patch_size: int = vision_config.patch_size
+        temporal_patch_size: int = vision_config.temporal_patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        in_chans: int = vision_config.in_chans
+        hidden_size: int = vision_config.hidden_size
+        embed_dim: int = vision_config.embed_dim
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        mlp_ratio: float = vision_config.mlp_ratio
+        self.spatial_merge_size = spatial_merge_size
+        self.patch_embed = Qwen2VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = embed_dim // num_heads
+        self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
+        self.blocks = nn.ModuleList(
+            [
+                Qwen2VisionBlock(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.merger = Qwen2VisionPatchMerger(
+            d_model=hidden_size,
+            context_dim=embed_dim,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.blocks[0].mlp.fc2.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(
+                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
+            )
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(x, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
+        # adapter
+        x = self.merger(x)
+        return x
+
+
+# === Vision input helpers === #
+cached_get_processor = lru_cache(get_processor)
+
+
+def mm_input_mapper_for_qwen2_vl(
+    ctx: InputContext,
+    data: MultiModalData[object],
+    data_type_key: str,
+) -> MultiModalInputs:
+    """Input mapper for Qwen2-VL."""
+    model_config = ctx.model_config
+    image_processor = cached_get_image_processor(
+        model_config.model, trust_remote_code=model_config.trust_remote_code
+    )
+    if image_processor is None:
+        raise RuntimeError(
+            "No HuggingFace processor is available "
+            "to process the image object"
+        )
+    images = None
+    videos = None
+    if data_type_key == "image":
+        images = data
+    else:
+        assert data_type_key == "video"
+        videos = data
+    try:
+        batch_data = image_processor.preprocess(
+            images=images, videos=videos, return_tensors="pt"
+        ).data
+    except Exception:
+        logger.error("Failed to process image ({})", data)
+        raise
+    return MultiModalInputs(batch_data)
+
+
+image_input_mapper_for_qwen2_vl = partial(
+    mm_input_mapper_for_qwen2_vl, data_type_key="image"
+)
+video_input_mapper_for_qwen2_vl = partial(
+    mm_input_mapper_for_qwen2_vl, data_type_key="video"
+)
+
+
+def _get_vision_info(
+    image_processor,
+    height: int,
+    width: int,
+    min_pixels: int,
+    max_pixels: int,
+    do_resize: bool = True,
+    data_type_key: str = "image",
+    mm_count: int = 1,
+):
+    """Get information (resized height / width and number of vision tokens)
+    of input image / video frame."""
+    if do_resize:
+        resized_height, resized_width = smart_resize(
+            height=height,
+            width=width,
+            factor=image_processor.patch_size * image_processor.merge_size,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    else:
+        resized_height, resized_width = height, width
+    if data_type_key == "image":
+        grid_t = mm_count
+    else:
+        assert data_type_key == "video"
+        grid_t = max(mm_count // image_processor.temporal_patch_size, 1)
+    grid_h = resized_height // image_processor.patch_size
+    grid_w = resized_width // image_processor.patch_size
+    vision_tokens = grid_t * grid_h * grid_w
+    llm_num_vision_tokens = (
+        vision_tokens
+        // image_processor.merge_size
+        // image_processor.merge_size
+    )
+    return resized_height, resized_width, llm_num_vision_tokens
+
+
+def _get_max_image_info(
+    image_processor,
+    data_type_key: str = "image",
+    mm_count: int = 1,
+):
+    return _get_vision_info(
+        image_processor,
+        height=9999999,
+        width=9999999,
+        # Limit min / max pixels.
+        min_pixels=max(image_processor.min_pixels, 28 * 28),
+        max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28),
+        data_type_key=data_type_key,
+        mm_count=mm_count,
+    )
+
+
+def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int:
+    image_processor = cached_get_image_processor(ctx.model_config.model)
+    (
+        max_resized_height,
+        max_resized_width,
+        max_llm_image_tokens,
+    ) = _get_max_image_info(
+        image_processor, data_type_key=data_type_key, mm_count=1
+    )
+    return max_llm_image_tokens
+
+
+get_max_qwen2_vl_image_tokens = partial(
+    get_max_qwen2_vl_mm_tokens, data_type_key="image"
+)
+get_max_qwen2_vl_video_tokens = partial(
+    get_max_qwen2_vl_mm_tokens, data_type_key="video"
+)
+
+
+def dummy_data_for_qwen2_vl(
+    ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]
+) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
+    image_processor = cached_get_image_processor(ctx.model_config.model)
+    num_images = mm_counts["image"]
+    (
+        max_resized_height,
+        max_resized_width,
+        max_llm_image_tokens,
+    ) = _get_max_image_info(
+        image_processor, data_type_key="image", mm_count=num_images
+    )
+    if seq_len - max_llm_image_tokens - 2 < 0:
+        raise RuntimeError(
+            f"Qwen2-VL cannot process {num_images} images in a prompt, "
+            "please increase max_model_len or reduce image limit by "
+            "--limit-mm-per-prompt."
+        )
+    # Check video counts.
+    num_videos = mm_counts["video"]
+    (
+        max_resized_height,
+        max_resized_width,
+        max_llm_video_tokens,
+    ) = _get_max_image_info(
+        image_processor, data_type_key="video", mm_count=num_videos
+    )
+    if seq_len - max_llm_video_tokens - 2 < 0:
+        raise RuntimeError(
+            f"Qwen2-VL cannot process {num_videos} videos in a prompt, "
+            "please increase max_model_len or reduce video limit by "
+            "--limit-mm-per-prompt."
+        )
+    hf_config = ctx.get_hf_config(Qwen2VLConfig)
+    token_ids = array(
+        APHRODITE_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_start_token_id]
+    )
+    token_ids += (
+        array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [hf_config.image_token_id])
+        * max_llm_image_tokens
+    )
+    token_ids += array(
+        APHRODITE_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_end_token_id]
+    )
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [0]) * (
+        seq_len - max_llm_image_tokens - 2
+    )
+    dummy_seqdata = SequenceData(token_ids)
+    dummy_image = Image.new(
+        "RGB", (max_resized_width, max_resized_height), color=0
+    )
+    return dummy_seqdata, {
+        "image": dummy_image if num_images == 1 else [dummy_image] * num_images
+    }
+
+
+def _get_llm_num_vision_tokens(
+    mm_inputs: list,
+    data_type_key: str,
+    image_processor,
+):
+    """Get number of vision tokens of multimodal inputs.
+    This method is derived from `transformers.models.qwen2_vl.
+    image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
+    """
+    image = to_numpy_array(mm_inputs[0])
+    input_data_format = infer_channel_dimension_format(image)
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    _, _, llm_num_vision_tokens = _get_vision_info(
+        image_processor,
+        height=height,
+        width=width,
+        min_pixels=image_processor.min_pixels,
+        max_pixels=image_processor.max_pixels,
+        do_resize=image_processor.do_resize,
+        data_type_key=data_type_key,
+        mm_count=len(mm_inputs),
+    )
+    return llm_num_vision_tokens
+
+
+def input_processor_for_qwen2_vl(
+    ctx: InputContext, llm_inputs: LLMInputs
+) -> LLMInputs:
+    multi_modal_data = llm_inputs.get("multi_modal_data", None)
+    if multi_modal_data is None:
+        return llm_inputs
+    image_inputs = multi_modal_data.get("image", None)
+    video_inputs = multi_modal_data.get("video", None)
+    processor = cached_get_processor(ctx.model_config.model)
+    image_processor = processor.image_processor
+    hf_config = ctx.get_hf_config(Qwen2VLConfig)
+    # To avoid redundant processing of vision objects (resize, rescale, etc.),
+    # we extract code of calculating number of vision tokens from
+    # `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`.
+    #
+    # The following code is equivalent to:
+    #    prompt = llm_inputs["prompt"]
+    #    inputs = processor(text=[prompt],
+    #                       images=image_inputs,
+    #                       videos=video_inputs,
+    #                       padding=True,
+    #                       return_tensors="pt")
+    #    prompt_token_ids = inputs["input_ids"][0].tolist()
+    prompt_token_ids = llm_inputs.get("prompt_token_ids", None)
+    if prompt_token_ids is None:
+        prompt = llm_inputs["prompt"]
+        prompt_token_ids = processor.tokenizer(
+            prompt,
+            padding=True,
+            return_tensors=None,
+        )["input_ids"]
+    # Expand image pad tokens.
+    if image_inputs is not None:
+        image_indices = [
+            idx
+            for idx, token in enumerate(prompt_token_ids)
+            if token == hf_config.image_token_id
+        ]
+        image_inputs = make_batched_images(image_inputs)
+        assert len(image_indices) == len(image_inputs)
+        prompt_token_ids_with_image = []
+        for image_cnt, image in enumerate(image_inputs):
+            num_image_tokens = _get_llm_num_vision_tokens(
+                [image],
+                data_type_key="image",
+                image_processor=image_processor,
+            )
+            if image_cnt == 0:
+                non_image_tokens = prompt_token_ids[: image_indices[image_cnt]]
+            else:
+                non_image_tokens = prompt_token_ids[
+                    image_indices[image_cnt - 1] + 1 : image_indices[image_cnt]
+                ]
+            prompt_token_ids_with_image.extend(non_image_tokens)
+            prompt_token_ids_with_image.extend(
+                hf_config.image_token_id for _ in range(num_image_tokens)
+            )
+        prompt_token_ids_with_image.extend(
+            prompt_token_ids[image_indices[-1] + 1 :]
+        )
+        prompt_token_ids = prompt_token_ids_with_image
+    # Expand video pad tokens.
+    if video_inputs is not None:
+        video_indices = [
+            idx
+            for idx, token in enumerate(prompt_token_ids)
+            if token == hf_config.video_token_id
+        ]
+        video_inputs = make_batched_videos(video_inputs)
+        assert len(video_indices) == len(video_inputs)
+        prompt_token_ids_with_video = []
+        for video_cnt, video in enumerate(video_inputs):
+            num_video_tokens = _get_llm_num_vision_tokens(
+                video,
+                data_type_key="video",
+                image_processor=image_processor,
+            )
+            if video_cnt == 0:
+                non_video_tokens = prompt_token_ids[: video_indices[video_cnt]]
+            else:
+                non_video_tokens = prompt_token_ids[
+                    video_indices[video_cnt - 1] + 1 : video_indices[video_cnt]
+                ]
+            prompt_token_ids_with_video.extend(non_video_tokens)
+            prompt_token_ids_with_video.extend(
+                hf_config.video_token_id for _ in range(num_video_tokens)
+            )
+        prompt_token_ids_with_video.extend(
+            prompt_token_ids[video_indices[-1] + 1 :]
+        )
+        prompt_token_ids = prompt_token_ids_with_video
+    return LLMInputs(
+        prompt_token_ids=prompt_token_ids,
+        prompt=llm_inputs["prompt"],
+        multi_modal_data=multi_modal_data,
+    )
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(
+    image_input_mapper_for_qwen2_vl
+)
+@MULTIMODAL_REGISTRY.register_input_mapper(
+    "video", video_input_mapper_for_qwen2_vl
+)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens)
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+    "video", get_max_qwen2_vl_video_tokens
+)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl)
+class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
+    def __init__(
+        self,
+        config: Qwen2VLConfig,
+        multimodal_config: MultiModalConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        assert (
+            not cache_config.enable_prefix_caching
+        ), "Qwen2-VL currently does not support prefix caching"
+        self.config = config
+        self.multimodal_config = multimodal_config
+        self.visual = Qwen2VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            # NOTE: Qwen2-VL vision encoder does not support any
+            # quantization method now.
+            quant_config=None,
+        )
+        self.model = Qwen2Model(config, cache_config, quant_config)
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=quant_config
+            )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+
+    def _validate_and_reshape_mm_tensor(
+        self, mm_input: Union[torch.Tensor, List[torch.Tensor]], name: str
+    ) -> torch.Tensor:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(
+                f"Incorrect type of {name}. " f"Got type: {type(mm_input)}"
+            )
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == 2:
+                return mm_input
+            if mm_input.ndim != 3:
+                raise ValueError(
+                    f"{name} should be 2D or batched 3D tensor. "
+                    f"Got ndim: {mm_input.ndim}"
+                )
+            return torch.concat(list(mm_input))
+        else:
+            return torch.concat(mm_input)
+
+    def _parse_and_validate_image_input(
+        self, **kwargs: object
+    ) -> Optional[Qwen2VLImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+        if pixel_values is None:
+            return None
+        pixel_values = self._validate_and_reshape_mm_tensor(
+            pixel_values, "image pixel values"
+        )
+        image_grid_thw = self._validate_and_reshape_mm_tensor(
+            image_grid_thw, "image grid_thw"
+        )
+        if not isinstance(pixel_values, (torch.Tensor, list)):
+            raise ValueError(
+                "Incorrect type of image pixel values. "
+                f"Got type: {type(pixel_values)}"
+            )
+        return Qwen2VLImageInputs(
+            pixel_values=pixel_values, image_grid_thw=image_grid_thw
+        )
+
+    def _parse_and_validate_video_input(
+        self, **kwargs: object
+    ) -> Optional[Qwen2VLVideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+        if pixel_values_videos is None:
+            return None
+        pixel_values_videos = self._validate_and_reshape_mm_tensor(
+            pixel_values_videos, "video pixel values"
+        )
+        video_grid_thw = self._validate_and_reshape_mm_tensor(
+            video_grid_thw, "video grid_thw"
+        )
+        return Qwen2VLVideoInputs(
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
+        )
+
+    def _process_image_input(
+        self, image_input: Qwen2VLImageInputs
+    ) -> torch.Tensor:
+        pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+        image_embeds = self.visual(
+            pixel_values, grid_thw=image_input["image_grid_thw"]
+        )
+        return image_embeds
+
+    def _process_video_input(
+        self, video_input: Qwen2VLVideoInputs
+    ) -> torch.Tensor:
+        pixel_values_videos = video_input["pixel_values_videos"].type(
+            self.visual.dtype
+        )
+        video_embeds = self.visual(
+            pixel_values_videos, grid_thw=video_input["video_grid_thw"]
+        )
+        return video_embeds
+
+    def _merge_multimodal_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        multimodal_embeddings: torch.Tensor,
+        placeholder_token_id: int,
+    ) -> torch.Tensor:
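+        # Overwrite the embedding rows at placeholder positions with the
+        # vision encoder outputs; the number of placeholder tokens in
+        # input_ids must equal multimodal_embeddings.shape[0].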
+        mask = input_ids == placeholder_token_id
+        inputs_embeds[mask, :] = multimodal_embeddings
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ) -> SamplerOutput:
+        """Run forward pass for Qwen2-VL.
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                open-source models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,)`.
+            pixel_values: Pixel values to be fed to a model.
+                `None` if no images are passed.
+            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
+                `None` if no images are passed.
+            pixel_values_videos: Pixel values of videos to be fed to a model.
+                `None` if no videos are passed.
+            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
+                `None` if no videos are passed.
+        """
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        video_input = self._parse_and_validate_video_input(**kwargs)
+        if image_input is None and video_input is None:
+            inputs_embeds = None
+        else:
+            if (
+                getattr(self.config, "rope_scaling", {}).get("type", None)
+                == "mrope"
+            ):
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+            inputs_embeds = self.model.embed_tokens(input_ids)
+            if image_input is not None:
+                image_embeds = self._process_image_input(image_input)
+                inputs_embeds = self._merge_multimodal_embeddings(
+                    input_ids,
+                    inputs_embeds,
+                    image_embeds,
+                    placeholder_token_id=self.config.image_token_id,
+                )
+            if video_input is not None:
+                video_embeds = self._process_video_input(video_input)
+                inputs_embeds = self._merge_multimodal_embeddings(
+                    input_ids,
+                    inputs_embeds,
+                    video_embeds,
+                    placeholder_token_id=self.config.video_token_id,
+                )
+            input_ids = None
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata
+    ) -> torch.Tensor:
+        logits = self.logits_processor(
+            self.lm_head, hidden_states, sampling_metadata
+        )
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name and "qkv.weight" in name:
+                    visual_num_heads = self.config.vision_config.num_heads
+                    visual_embed_dim = self.config.vision_config.embed_dim
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(
+                        3, visual_num_heads, head_size, visual_embed_dim
+                    )
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
+                elif "visual" in name and "qkv.bias" in name:
+                    visual_num_heads = self.config.vision_config.num_heads
+                    visual_embed_dim = self.config.vision_config.embed_dim
+                    head_size = visual_embed_dim // visual_num_heads
+                    loaded_weight = loaded_weight.view(
+                        3, visual_num_heads, head_size
+                    )
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1)
+                try:
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+                weight_loader = getattr(
+                    param, "weight_loader", default_weight_loader
+                )
+                weight_loader(param, loaded_weight)

+ 3 - 5
aphrodite/multimodal/base.py

@@ -76,14 +76,12 @@ class MultiModalInputs(_MultiModalInputsBase):
         if len(inputs_list) == 0:
             return {}
 
-        keys = inputs_list[0].keys()
-
         item_lists: Dict[str, List[NestedTensors]] = defaultdict(list)
 
         for inputs in inputs_list:
-            if inputs.keys() != keys:
-                msg = f"Inputs do not share the same keys ({keys})"
-                raise ValueError(msg)
+            # For models that support multiple modalities (e.g. Qwen2-VL),
+            # different modalities can return different data keys,
+            # so batch() should skip the same-key check.
 
             for k, v in inputs.items():
                 item_lists[k].append(v)

+ 4 - 4
aphrodite/transformers_utils/config.py

@@ -17,12 +17,12 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 import aphrodite.common.envs as envs
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                                  EAGLEConfig, GraniteConfig,
+                                                  EAGLEConfig,
                                                   InternVLChatConfig,
                                                   JAISConfig, MedusaConfig,
                                                   MLPSpeculatorConfig,
-                                                  MPTConfig, RWConfig,
-                                                  UltravoxConfig)
+                                                  MPTConfig, Qwen2VLConfig,
+                                                  RWConfig, UltravoxConfig)
 from aphrodite.transformers_utils.utils import check_gguf_file
 
 APHRODITE_USE_MODELSCOPE = envs.APHRODITE_USE_MODELSCOPE
@@ -46,7 +46,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     "internvl_chat": InternVLChatConfig,
     "ultravox": UltravoxConfig,
     "eagle": EAGLEConfig,
-    "granite": GraniteConfig,
+    "qwen2_vl": Qwen2VLConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():

+ 4 - 2
aphrodite/transformers_utils/configs/__init__.py

@@ -5,13 +5,14 @@ from aphrodite.transformers_utils.configs.eagle import EAGLEConfig
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from aphrodite.transformers_utils.configs.falcon import RWConfig
-from aphrodite.transformers_utils.configs.granite import GraniteConfig
 from aphrodite.transformers_utils.configs.internvl import InternVLChatConfig
 from aphrodite.transformers_utils.configs.jais import JAISConfig
 from aphrodite.transformers_utils.configs.medusa import MedusaConfig
 from aphrodite.transformers_utils.configs.mlp_speculator import (
     MLPSpeculatorConfig)
 from aphrodite.transformers_utils.configs.mpt import MPTConfig
+from aphrodite.transformers_utils.configs.qwen2vl import (Qwen2VLConfig,
+                                                          Qwen2VLVisionConfig)
 from aphrodite.transformers_utils.configs.ultravox import UltravoxConfig
 
 __all__ = [
@@ -25,5 +26,6 @@ __all__ = [
     "MedusaConfig",
     "UltravoxConfig",
     "EAGLEConfig",
-    "GraniteConfig",
+    "Qwen2VLConfig",
+    "Qwen2VLVisionConfig",
 ]

+ 0 - 186
aphrodite/transformers_utils/configs/granite.py

@@ -1,186 +0,0 @@
-# coding=utf-8
-# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Granite model configuration"""
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_rope_utils import rope_config_validation
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-
-class GraniteConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of
-    a [`GraniteModel`]. It is used to instantiate an Granite
-    model according to the specified arguments, defining the model architecture.
-    Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the Granite-3B.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to
-    control the model outputs. Read the documentation from [`PretrainedConfig`]
-    for more information.
-    Args:
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the Granite model. Defines the number of
-            different tokens that can be represented by the `inputs_ids`
-            passed when calling [`GraniteModel`]
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the
-            Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to
-            implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi
-            Head Attention (MHA), if `num_key_value_heads=1` the model will use
-            Multi Query Attention (MQA) otherwise GQA is used. When converting
-            a multi-head checkpoint to a GQA checkpoint, each group key and
-            value head should be constructed by meanpooling all the original
-            heads within that group. For more details checkout
-            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
-            specified, will default to `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the
-            decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for
-            initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values
-            attentions (not used by all models). Only relevant if
-            `config.is_decoder=True`.
-        pad_token_id (`int`, *optional*):
-            Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 1):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
-            End of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE
-            embeddings. Currently supports two scaling strategies: linear and
-            dynamic. Their scaling factor must be a float greater than 1. The
-            expected format is
-            `{"type": strategy name, "factor": scaling factor}`.
-            When using this flag, don't update `max_position_embeddings` to
-            the expected new maximum. See the following thread for more
-            information on how these scaling strategies behave:
-            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
-            This is an experimental feature, subject to breaking API changes
-            in future versions.
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output
-            projection layers during self-attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        mlp_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in up_proj, down_proj and gate_proj layers
-            in the MLP layers.
-        embedding_multiplier (`float`, *optional*, defaults to 1.0):
-            embedding multiplier
-        logits_scaling (`float`, *optional*, defaults to 1.0):
-            divisor for output logits
-        residual_multiplier (`float`, *optional*, defaults to 1.0):
-            residual multiplier
-        attention_multiplier (`float`, *optional*, defaults to 1.0):
-            attention multiplier
-    ```python
-    >>> from transformers import GraniteModel, GraniteConfig
-    >>> # Initializing a Granite granite-3b style configuration
-    >>> configuration = GraniteConfig()
-    >>> # Initializing a model from the granite-7b style configuration
-    >>> model = GraniteModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "granite"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=32000,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=1,
-        eos_token_id=2,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        mlp_bias=False,
-        embedding_multiplier=1.0,
-        logits_scaling=1.0,
-        residual_multiplier=1.0,
-        attention_multiplier=1.0,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.mlp_bias = mlp_bias
-        self.embedding_multiplier = embedding_multiplier
-        self.logits_scaling = logits_scaling
-        self.residual_multiplier = residual_multiplier
-        self.attention_multiplier = attention_multiplier
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-        rope_config_validation(self)

+ 131 - 0
aphrodite/transformers_utils/configs/qwen2vl.py

@@ -0,0 +1,131 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen2VL model configuration"""
+
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+
+
+class Qwen2VLVisionConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        depth=32,
+        embed_dim=1280,
+        hidden_size=3584,
+        hidden_act="quick_gelu",
+        mlp_ratio=4,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "qwen2_vl":
+            config_dict = config_dict["vision_config"]
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class Qwen2VLConfig(PretrainedConfig):
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        attention_dropout=0.0,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = Qwen2VLVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = Qwen2VLVisionConfig()
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # NOTE: the following section from original transformers config
+        # for Qwen2-VL is commented out to address rope config loading issue
+        #
+        # if self.rope_scaling is not None and "type" in self.rope_scaling:
+        #     if self.rope_scaling["type"] == "mrope":
+        #         self.rope_scaling["type"] = "default"
+        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # rope_config_validation(self)
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

+ 35 - 0
aphrodite/transformers_utils/processor.py

@@ -0,0 +1,35 @@
+from typing import cast
+
+
+def get_processor(
+    processor_name: str,
+    *args,
+    trust_remote_code: bool = False,
+    **kwargs,
+):
+    """Gets a processor for the given model name via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoProcessor
+    from transformers.processing_utils import ProcessorMixin
+
+    try:
+        processor = AutoProcessor.from_pretrained(
+            processor_name, *args, trust_remote_code=trust_remote_code, **kwargs
+        )
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the processor. If the processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI."
+            )
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    return cast(ProcessorMixin, processor)

+ 94 - 9
aphrodite/worker/model_runner.py

@@ -36,6 +36,7 @@ from aphrodite.inputs import INPUT_REGISTRY, InputRegistry
 from aphrodite.lora.layers import LoRAMapping
 from aphrodite.lora.request import LoRARequest
 from aphrodite.lora.worker_manager import LRUCacheWorkerLoRAManager
+from aphrodite.modeling.layers.rotary_embedding import MRotaryEmbedding
 from aphrodite.modeling.layers.sampler import SamplerOutput
 from aphrodite.modeling.model_loader import get_model
 from aphrodite.modeling.model_loader.tensorizer import TensorizerConfig
@@ -183,6 +184,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         def simple_reinit(self):
             self.input_tokens[0].clear()  # type: ignore
             self.input_positions[0].clear()  # type: ignore
+            self.mrope_input_positions = None  # type: ignore
             self.seq_lens[0] = 0  # type: ignore
             self.orig_seq_lens[0] = 0  # type: ignore
             self.query_lens[0] = 0  # type: ignore
@@ -208,6 +210,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
             # Input tokens and positions.
             input_tokens: Optional[List[List[int]]] = None,
             input_positions: Optional[List[List[int]]] = None,
+            mrope_input_positions: Optional[List[List[List[int]]]] = None,
 
             # The sequence length (may be capped to the sliding window).
             seq_lens: Optional[List[int]] = None,
@@ -267,6 +270,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
                         for seq_id in range(len(self.seq_ids)):
                             self.input_positions[seq_id].clear()
 
+                    self.mrope_input_positions = None
                     if seq_lens:
                         self.seq_lens = seq_lens
                     else:
@@ -328,6 +332,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
             else:
                 self.input_tokens = input_tokens or []
                 self.input_positions = input_positions or []
+                self.mrope_input_positions = mrope_input_positions or None
                 self.seq_lens = seq_lens or []
                 self.orig_seq_lens = orig_seq_lens or []
                 self.query_lens = query_lens or []
@@ -358,6 +363,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
 
             self.input_tokens = [[] for _ in range(self.n_seqs)]
             self.input_positions = [[] for _ in range(self.n_seqs)]
+            self.mrope_input_positions = None
             self.seq_lens = [0] * self.n_seqs
             self.orig_seq_lens = [0] * self.n_seqs
             self.query_lens = [0] * self.n_seqs
@@ -493,6 +499,16 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         inter_data.query_lens[
             seq_idx] = seq_len - context_len if inter_data.is_prompt else 1
 
+        if seq_data.mrope_position_delta is not None:
+            if inter_data.mrope_input_positions is None:
+                inter_data.mrope_input_positions = [None] * inter_data.n_seqs
+            inter_data.mrope_input_positions[
+                seq_idx] = MRotaryEmbedding.get_next_input_positions(
+                    seq_data.mrope_position_delta,
+                    context_len,
+                    seq_len,
+                )
+
     def _compute_for_prefix_cache_hit(
             self, inter_data: InterDataForSeqGroup, seq_idx: int,
             seq_group_metadata: SequenceGroupMetadata):
@@ -638,6 +654,36 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         mm_kwargs = self.multi_modal_input_mapper(mm_data)
         inter_data.multi_modal_inputs = mm_kwargs
 
+        # special processing for mrope position deltas.
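+        # Qwen2-VL's mrope uses (3, seq_len) position ids (temporal, height,
+        # width); the per-sequence position delta is cached on the sequence
+        # data so that decode steps can continue the positions without
+        # re-running the vision side.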
+        if self.runner.model_is_mrope:
+            image_grid_thw = mm_kwargs.get("image_grid_thw", None)
+            video_grid_thw = mm_kwargs.get("video_grid_thw", None)
+            assert image_grid_thw is not None or video_grid_thw is not None, (
+                "mrope embedding type requires multi-modal input mapper "
+                "returns 'image_grid_thw' or 'video_grid_thw'.")
+            hf_config = self.runner.model_config.hf_config
+            inter_data.mrope_input_positions = [None] * inter_data.n_seqs
+            for seq_idx in range(inter_data.n_seqs):
+                seq_data = seq_group_metadata.seq_data[
+                    inter_data.seq_ids[seq_idx]]
+                token_ids = seq_data.get_token_ids()
+                mrope_input_positions, mrope_position_delta = \
+                    MRotaryEmbedding.get_input_positions(
+                        token_ids,
+                        image_grid_thw=image_grid_thw,
+                        video_grid_thw=video_grid_thw,
+                        image_token_id=hf_config.image_token_id,
+                        video_token_id=hf_config.video_token_id,
+                        vision_start_token_id=hf_config.vision_start_token_id,
+                        vision_end_token_id=hf_config.vision_end_token_id,
+                        spatial_merge_size=hf_config.vision_config.
+                        spatial_merge_size,
+                        context_len=inter_data.context_lens[seq_idx],
+                    )
+                seq_data.mrope_position_delta = mrope_position_delta
+                inter_data.mrope_input_positions[
+                    seq_idx] = mrope_input_positions
+
     def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
         """Add a sequence group to the builder."""
         seq_ids = seq_group_metadata.seq_data.keys()
@@ -684,10 +730,28 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
             # prefix caching and there is no decode request.
             return self.model_input_cls()
 
-        input_positions = []
-        for inter_data in self.inter_data_list:
-            for cur_input_positions in inter_data.input_positions:
-                input_positions.extend(cur_input_positions)
+        mrope_input_positions: Optional[List[List[int]]] = None
+        if any(inter_data.mrope_input_positions is not None
+               for inter_data in self.inter_data_list):
+            mrope_input_positions = [[] for _ in range(3)]
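+            # Emit all positions in the (3, total_tokens) mrope layout;
+            # sequences without mrope data repeat their 1-D positions
+            # across the three rows.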
+            for idx in range(3):
+                for inter_data in self.inter_data_list:
+                    msections = inter_data.mrope_input_positions
+                    if msections is None:
+                        for _seq_input_positions in inter_data.input_positions:
+                            mrope_input_positions[idx].extend(
+                                _seq_input_positions)
+                    else:
+                        for _seq_mrope_input_positions in msections:
+                            mrope_input_positions[idx].extend(
+                                _seq_mrope_input_positions[idx])
+            input_positions = None
+        else:
+            input_positions = []
+            for inter_data in self.inter_data_list:
+                for cur_input_positions in inter_data.input_positions:
+                    input_positions.extend(cur_input_positions)
+
         seq_lens = []
         max_decode_seq_len = 0
         for inter_data in self.inter_data_list:
@@ -722,14 +786,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         # Tokens and positions.
         if cuda_graph_pad_size:
             input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size))
-            input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
         assert self.runner.device is not None
         input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long,
                                                self.runner.device,
                                                self.runner.pin_memory)
-        input_positions_tensor = async_tensor_h2d(input_positions, torch.long,
-                                                  self.runner.device,
-                                                  self.runner.pin_memory)
+        if mrope_input_positions is not None:
+            for idx in range(3):
+                mrope_input_positions[idx].extend(
+                    itertools.repeat(0, cuda_graph_pad_size))
+            input_positions_tensor = async_tensor_h2d(mrope_input_positions,
+                                                      torch.long,
+                                                      self.runner.device,
+                                                      self.runner.pin_memory)
+        else:
+            input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
+            input_positions_tensor = async_tensor_h2d(input_positions,
+                                                      torch.long,
+                                                      self.runner.device,
+                                                      self.runner.pin_memory)
 
         # Sequence and query lengths.
         if cuda_graph_pad_size:
@@ -1249,6 +1323,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             raise RuntimeError("PromptAdapter is not enabled.")
         return self.prompt_adapter_manager.list_adapters()
 
+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keep "rope_deltas" between prompt and decoding phases."""
+        rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        return rope_scaling.get("type", None) == "mrope"
+
     @torch.inference_mode()
     def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
         """Cuda graph capture a model.
@@ -1283,6 +1366,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         max_batch_size = self.max_batchsize_to_capture
         input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
         input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
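+        # mrope models consume (3, seq_len) positions, so capture the CUDA
+        # graphs with dummy positions in the same layout.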
+        if self.model_is_mrope:
+            input_positions = torch.tile(input_positions, (3, 1))
         # Prepare dummy previous_hidden_states only if needed by the model.
         # This is used by draft models such as EAGLE.
         previous_hidden_states = None
@@ -1348,7 +1433,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                         "input_ids":
                         input_tokens[:batch_size],
                         "positions":
-                        input_positions[:batch_size],
+                        input_positions[..., :batch_size],
                         "hidden_or_intermediate_states":
                         hidden_or_intermediate_states[
                             virtual_engine]  # type: ignore

BIN
examples/vision/nadeko.mp4


+ 174 - 39
examples/vision/vision_example.py

@@ -6,17 +6,54 @@ on HuggingFace model repository.
 """
 import os
 
+import cv2
+import numpy as np
 from PIL import Image
 from transformers import AutoTokenizer
 
 from aphrodite import LLM, SamplingParams
+from aphrodite.assets.video import VideoAsset
 from aphrodite.common.utils import FlexibleArgumentParser
+from aphrodite.multimodal.utils import sample_frames_from_video
 
 # Input image and question
 image_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "burg.jpg")
 image = Image.open(image_path).convert("RGB")
-question = "What is the content of this image?"
+img_question = "What is the content of this image?"
+
+# Input video and question
+video_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                          "nadeko.mp4")
+vid_question = "What's in this video?"
+
+
+def load_video_frames(video_path: str, num_frames: int) -> np.ndarray:
+    """
+    Load video frames from a local file path.
+
+    Args:
+        video_path: Path to the video file
+        num_frames: Number of frames to sample from the video
+
+    Returns:
+        np.ndarray: Array of sampled video frames
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise ValueError(f"Could not open video file {video_path}")
+
+    frames = []
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frames.append(frame)
+    cap.release()
+
+    frames = np.stack(frames)
+    return sample_frames_from_video(frames, num_frames)
+
 
 
 # LLaVA-1.5
@@ -25,17 +62,26 @@ def run_llava(question):
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # LLaVA-1.6/LLaVA-NeXT
 def run_llava_next(question):
 
     prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
 
-    return llm, prompt
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question):
+    prompt = f"USER: <video>\n{question} ASSISTANT:"
+    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Fuyu
@@ -43,8 +89,8 @@ def run_fuyu(question):
 
     prompt = f"{question}\n"
     llm = LLM(model="adept/fuyu-8b")
-
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Phi-3-Vision
@@ -58,20 +104,22 @@ def run_phi3v(question):
     # In this example, we override max_num_seqs to 5 while
     # keeping the original context length of 128k.
     llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
+        model="microsoft/Phi-3-vision-128k-instruct",
         trust_remote_code=True,
         max_num_seqs=5,
     )
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # PaliGemma
 def run_paligemma(question):
 
+    # PaliGemma has a special prompt format for VQA
     prompt = "caption en"
-    llm = LLM(model="google/paligemma2-3b-ft-docci-448")
-
-    return llm, prompt
+    llm = LLM(model="google/paligemma-3b-mix-224")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Chameleon
@@ -79,7 +127,8 @@ def run_chameleon(question):
 
     prompt = f"{question}<image>"
     llm = LLM(model="facebook/chameleon-7b")
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # MiniCPM-V
@@ -90,13 +139,26 @@ def run_minicpmv(question):
     # model_name = "HwwwH/MiniCPM-V-2"
 
     # 2.5
-    model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+
+    # 2.6
+    model_name = "openbmb/MiniCPM-V-2_6"
     tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
     )
+    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
+    # 2.0
+    # stop_token_ids = [tokenizer.eos_id]
+
+    # 2.5
+    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+    # 2.6
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
     messages = [{
         'role': 'user',
@@ -105,7 +167,33 @@ def run_minicpmv(question):
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)
-    return llm, prompt
+    return llm, prompt, stop_token_ids
+
+
+# InternVL
+def run_internvl(question):
+    model_name = "OpenGVLab/InternVL2-2B"
+
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_num_seqs=5,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for InternVL
+    # model variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return llm, prompt, stop_token_ids
 
 
 # BLIP-2
@@ -115,39 +203,45 @@ def run_blip2(question):
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
     prompt = f"Question: {question} Answer:"
     llm = LLM(model="Salesforce/blip2-opt-2.7b")
-    return llm, prompt
-
-
-# InternVL
-def run_internvl(question):
-    # Generally, InternVL can use chatml template for conversation
-    TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n"
-    prompt = f"<image>\n{question}\n"
-    prompt = TEMPLATE.format(prompt=prompt)
-    llm = LLM(
-        model="OpenGVLab/InternVL2-4B",
-        trust_remote_code=True,
-        max_num_seqs=28,
-        tensor_parallel_size=2,
-        max_model_len=8192,
-    )
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 # Qwen
 def run_qwen_vl(question):
+
     llm = LLM(
         model="Qwen/Qwen-VL",
         trust_remote_code=True,
         max_num_seqs=5,
     )
+
     prompt = f"{question}Picture 1: <img></img>\n"
-    return llm, prompt
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# Qwen2-VL
+def run_qwen2_vl(question):
+    model_name = "Qwen/Qwen2-VL-7B-Instruct"
+
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=5,
+    )
+
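+    # The <|vision_start|><|image_pad|><|vision_end|> span is a placeholder;
+    # the Qwen2-VL input processor expands <|image_pad|> into roughly one
+    # token per group of merged vision patches during preprocessing.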
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
 
 model_example_map = {
     "llava": run_llava,
     "llava-next": run_llava_next,
+    "llava-next-video": run_llava_next_video,
     "fuyu": run_fuyu,
     "phi3_v": run_phi3v,
     "paligemma": run_paligemma,
@@ -156,19 +250,53 @@ model_example_map = {
     "blip-2": run_blip2,
     "internvl_chat": run_internvl,
     "qwen_vl": run_qwen_vl,
+    "qwen2_vl": run_qwen2_vl,
 }
 
 
+def get_multi_modal_input(args):
+    """
+    return {
+        "data": image or video,
+        "question": question,
+    }
+    """
+    if args.modality == "image":
+        return {
+            "data": image,
+            "question": img_question,
+        }
+
+    if args.modality == "video":
+        video = VideoAsset(name="nadeko.mp4",
+                           num_frames=args.num_frames,
+                           local_path=video_path).np_ndarrays
+        return {
+            "data": video,
+            "question": vid_question,
+        }
+
+    msg = f"Modality {args.modality} is not supported."
+    raise ValueError(msg)
+
+
 def main(args):
     model = args.model_type
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
 
-    llm, prompt = model_example_map[model](question)
+    modality = args.modality
+    mm_input = get_multi_modal_input(args)
+    data = mm_input["data"]
+    question = mm_input["question"]
+
+    llm, prompt, stop_token_ids = model_example_map[model](question)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2, max_tokens=128)
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=512,
+                                     stop_token_ids=stop_token_ids)
 
     assert args.num_prompts > 0
     if args.num_prompts == 1:
@@ -176,7 +304,7 @@ def main(args):
         inputs = {
             "prompt": prompt,
             "multi_modal_data": {
-                "image": image
+                modality: data
             },
         }
 
@@ -185,7 +313,7 @@ def main(args):
         inputs = [{
             "prompt": prompt,
             "multi_modal_data": {
-                "image": image
+                modality: data
             },
         } for _ in range(args.num_prompts)]
 
@@ -198,7 +326,7 @@ def main(args):
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description='Demo on using Aphrodite for offline inference with '
         'vision language models')
     parser.add_argument('--model-type',
                         '-m',
@@ -210,6 +338,13 @@ if __name__ == "__main__":
                         type=int,
                         default=1,
                         help='Number of prompts to run.')
-
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
     args = parser.parse_args()
     main(args)

+ 1 - 0
requirements-common.txt

@@ -35,3 +35,4 @@ msgspec
 python-multipart
 partial-json-parser
 opencv-python
+einops