12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029 |
- # coding=utf-8
- # Adapted from
- # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
- # Copyright 2023 The vLLM team.
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
- #
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
- # and OPT implementations in this library. It has been modified from its
- # original forms to accommodate minor architectural differences compared
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
- import math
- import re
- from functools import partial
- from typing import (Any, Callable, Iterable, List, Optional, Tuple, TypedDict,
- Union)
- import numpy as np
- import torch
- import torch.nn.functional as F
- import torch.types
- from PIL import Image
- from torch import nn
- from torch.nn.init import trunc_normal_
- from transformers.configuration_utils import PretrainedConfig
- from aphrodite.attention import AttentionMetadata
- from aphrodite.common.config import CacheConfig, MultiModalConfig
- from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
- SequenceData)
- from aphrodite.common.utils import progress_bar
- from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
- from aphrodite.modeling.layers.linear import ReplicatedLinear
- from aphrodite.modeling.layers.logits_processor import LogitsProcessor
- from aphrodite.modeling.layers.sampler import Sampler
- from aphrodite.modeling.layers.vocab_parallel_embedding import ParallelLMHead
- from aphrodite.modeling.model_loader.utils import set_default_torch_dtype
- from aphrodite.modeling.model_loader.weight_utils import default_weight_loader
- from aphrodite.modeling.models.interfaces import SupportsMultiModal
- from aphrodite.modeling.models.llama import LlamaModel
- from aphrodite.modeling.models.minicpm import MiniCPMModel
- from aphrodite.modeling.models.qwen2 import Qwen2Model
- from aphrodite.modeling.sampling_metadata import SamplingMetadata
- from aphrodite.multimodal import MULTIMODAL_REGISTRY
- from aphrodite.multimodal.image import (cached_get_image_processor,
- cached_get_tokenizer)
- from aphrodite.quantization.base_config import QuantizationConfig
- from .idefics2_vision_model import Idefics2VisionTransformer
# Substring rewrites applied to checkpoint tensor names while loading
# weights: every key found in a name is replaced by its value (see the
# renaming loop in `MiniCPMVBaseModel.load_weights`).
_KEYS_TO_MODIFY_MAPPING = {
    "llm.lm_head": "lm_head",
    "llm.model": "llm",
}
class MiniCPMVImagePixelInputs(TypedDict):
    """Image inputs handed from input parsing to the vision branch.

    Instances are built in `MiniCPMVBaseModel._parse_and_validate_inputs`.
    """
    pixel_values: List[torch.Tensor]
    """
    Shape: `(batch_size * num_images, num_channels, height, width)`
    Note that the image size may vary, so we pass it as a list
    instead of a batched tensor.
    """
    image_bounds: torch.Tensor
    """
    Shape: `(batch_size * num_images, 2)`
    This should be in `(start, stop)` format.
    """
    tgt_sizes: torch.Tensor
    """
    Shape: `(batch_size * num_images, 2)`
    This should be in `(height, width)` format.
    """
# Pixel inputs are the only image input kind defined in this file, so the
# general alias points straight at that type.
MiniCPMVImageInputs = MiniCPMVImagePixelInputs

# LayerNorm factory with eps=1e-6; the default `norm_layer` of the resamplers.
DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
def get_abs_pos(abs_pos: torch.Tensor, tgt_size: torch.Tensor):
    """Resize a square grid of absolute position embeddings.

    Args:
        abs_pos: Embeddings of shape (L, C). L is treated as a square grid
            whose side is int(sqrt(L)).
        tgt_size: Target grid size in (H, W) order.

    Returns:
        Tensor of shape (H * W, C) with the same dtype as `abs_pos`.
    """
    orig_dtype = abs_pos.dtype
    side = int(math.sqrt(abs_pos.size(0)))

    # (L, C) -> (1, C, side, side); the interpolation itself runs in float32.
    grid = abs_pos.float().reshape(1, side, side, -1).permute(0, 3, 1, 2)
    resized = F.interpolate(
        grid,
        size=(tgt_size[0], tgt_size[1]),
        mode="bicubic",
        align_corners=False,
    )

    # (1, C, H, W) -> (H * W, C), cast back to the caller's dtype.
    return resized.permute(0, 2, 3, 1).flatten(0, 2).to(dtype=orig_dtype)
- # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(
    embed_dim: int,
    grid_size: Union[int, Tuple[int, int]],
    cls_token: bool = False,
    version: Tuple[int, int] = (2, 0),
):
    """
    Build a 2D sin-cos position embedding for a grid.

    grid_size: a single int for a square grid, or (height, width)
    cls_token: only honoured for version (2, 0); prepends one all-zero row
    return:
    pos_embed: for version (2, 0), [H*W, embed_dim] or
    [1+H*W, embed_dim] (w/o or w/ cls_token); for any other version,
    [H, W, embed_dim]
    """
    if isinstance(grid_size, int):
        height = width = grid_size
    else:
        height, width = grid_size[0], grid_size[1]

    rows = np.arange(height, dtype=np.float32)
    cols = np.arange(width, dtype=np.float32)
    # meshgrid is given the width axis first, as in the original MAE code.
    grid = np.stack(np.meshgrid(cols, rows), axis=0)

    if version != (2, 0):
        # Newer versions keep the (H, W) layout and never add a cls row.
        return get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)

    grid = grid.reshape([2, 1, height, width])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
    if cls_token:
        cls_row = np.zeros([1, embed_dim])
        pos_embed = np.concatenate([cls_row, pos_embed], axis=0)
    return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim: int,
                                      grid: np.ndarray,
                                      version: Tuple[int, int] = (2, 0)):
    """
    Encode the two coordinate planes of `grid` and join them channel-wise.

    Each of grid[0] and grid[1] receives half of `embed_dim`.
    return: (H*W, D) for version (2, 0), otherwise (H, W, D)
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    first_half = get_1d_sincos_pos_embed_from_grid(half_dim, grid[0], version)
    second_half = get_1d_sincos_pos_embed_from_grid(half_dim, grid[1],
                                                    version)

    # Version (2, 0) works on flattened positions, so channels sit on axis 1;
    # the other layout keeps (H, W, D) and channels are the last axis.
    channel_axis = 1 if version == (2, 0) else -1
    return np.concatenate([first_half, second_half], axis=channel_axis)
def get_1d_sincos_pos_embed_from_grid(embed_dim: int,
                                      pos: np.ndarray,
                                      version: Tuple[int, int] = (2, 0)):
    """
    Sin-cos encode scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: positions to encode; flattened to (M,) for version (2, 0),
         expected as (H, W) otherwise
    out: (M, D) for version (2, 0), otherwise (H, W, D); the first D/2
         channels are sines and the last D/2 are cosines
    """
    assert embed_dim % 2 == 0

    # Frequencies 1 / 10000^(2i / D) for i in [0, D/2).
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    if version == (2, 0):
        pos = pos.reshape(-1)  # (M,)
        subscripts = "m,d->md"
    else:
        subscripts = "hw,d->hwd"

    # Outer product of positions and frequencies.
    angles = np.einsum(subscripts, pos, omega)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
class BaseResampler(nn.Module):
    """
    Shared pieces of the perceiver-style resamplers: `num_queries` learnable
    queries, an optional key/value projection, one multi-head attention
    layer, three layer norms and an output projection matrix.

    Subclasses implement `forward` and the position embedding handling.
    """

    def __init__(
        self,
        num_queries: int,
        embed_dim: int,
        num_heads: int,
        kv_dim: Optional[int] = None,
        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
    ) -> None:
        super().__init__()

        self.num_queries = num_queries
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
        trunc_normal_(self.query, std=0.02)

        needs_projection = kv_dim is not None and kv_dim != embed_dim
        if needs_projection:
            self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
        else:
            # Identity stand-in that returns an (output, None) pair so callers
            # can unpack it exactly like ReplicatedLinear.forward.
            self.kv_proj = lambda *args, **kwargs: (
                nn.Identity()(*args, **kwargs),
                None,
            )

        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.ln_q = norm_layer(embed_dim)
        self.ln_kv = norm_layer(embed_dim)
        self.ln_post = norm_layer(embed_dim)

        proj_scale = embed_dim**-0.5
        self.proj = nn.Parameter(
            proj_scale * torch.randn(embed_dim, embed_dim))

    def _init_weights(self, m: nn.Module) -> None:
        """Initialiser meant for `nn.Module.apply`.

        Linear layers get truncated-normal weights and zero bias; LayerNorm
        layers get zero bias and unit weight. Other modules are untouched.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _repeat(self, query, N: int):
        """Insert a new axis 1 into `query` and tile it N times along it."""
        return query[:, None, :].repeat(1, N, 1)
class Resampler2(BaseResampler):
    """Resampler with `grid_size**2` queries and a fixed query position grid.

    The key position embedding is either recomputed for each target size
    (`adaptive=True`) or interpolated from the stored grid.
    """

    def __init__(
        self,
        grid_size: int,
        embed_dim: int,
        num_heads: int,
        kv_dim: Optional[int] = None,
        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
        adaptive: bool = False,
    ) -> None:
        num_queries = grid_size**2
        super().__init__(num_queries, embed_dim, num_heads, kv_dim,
                         norm_layer)
        self.adaptive = adaptive

        query_grid = get_2d_sincos_pos_embed(embed_dim,
                                             grid_size,
                                             version=(2, 0))
        # Stored as a parameter but frozen.
        self.pos_embed = nn.Parameter(
            torch.from_numpy(query_grid).float()).requires_grad_(False)

        self.apply(self._init_weights)

    def forward(
        self,
        x: torch.Tensor,
        tgt_sizes: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
    ):
        """Cross-attend the learned queries over `x`.

        `tgt_sizes` is the (H, W) patch grid used for the key position
        embedding; `attn_mask` is passed straight to the attention layer.
        """
        if self.adaptive:
            key_grid = get_2d_sincos_pos_embed(self.embed_dim,
                                               tgt_sizes,
                                               version=(2, 0))
            pos_embed = torch.from_numpy(key_grid).to(device=x.device,
                                                      dtype=x.dtype)
        else:
            pos_embed = get_abs_pos(self.pos_embed, tgt_sizes)

        x, _ = self.kv_proj(x)
        x = self.ln_kv(x).permute(1, 0, 2)
        N = x.shape[1]

        q = self.ln_q(self.query)
        queries = self._repeat(q, N) + self.pos_embed.unsqueeze(1)
        keys = x + pos_embed.unsqueeze(1)
        attended = self.attn(queries, keys, x, attn_mask=attn_mask)[0]

        x = self.ln_post(attended.permute(1, 0, 2))
        return x @ self.proj
class Resampler2_5(BaseResampler):
    """Resampler backed by a cached 2D sin-cos position table.

    The table covers a `max_size` grid and is rebuilt with a larger size
    whenever a batch asks for more rows or columns than it holds.
    """

    def __init__(
        self,
        num_queries: int,
        embed_dim: int,
        num_heads: int,
        kv_dim: Optional[int] = None,
        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
        max_size: Tuple[int, int] = (70, 70),
    ) -> None:
        super().__init__(num_queries, embed_dim, num_heads, kv_dim, norm_layer)

        self.max_size = max_size
        self._set_2d_pos_cache(self.max_size)

        self.apply(self._init_weights)

    def _set_2d_pos_cache(self,
                          max_size: Tuple[int, int],
                          device: torch.types.Device = "cpu") -> None:
        """(Re)build the position table as a non-persistent buffer."""
        table = get_2d_sincos_pos_embed(self.embed_dim,
                                        max_size,
                                        version=(2, 5))
        self.register_buffer("pos_embed",
                             torch.from_numpy(table).float().to(device),
                             persistent=False)

    def _adjust_pos_cache(self, tgt_sizes: torch.Tensor,
                          device: torch.types.Device) -> None:
        """Grow the cached table if any target size exceeds it."""
        max_h = tgt_sizes[:, 0].max().item()
        max_w = tgt_sizes[:, 1].max().item()
        assert isinstance(max_h, int) and isinstance(max_w, int)

        cached_h = self.max_size[0]
        cached_w = self.max_size[1]
        if max_h <= cached_h and max_w <= cached_w:
            return

        self.max_size = (max(max_h, cached_h), max(max_w, cached_w))
        self._set_2d_pos_cache(self.max_size, device)

    def forward(self, x: torch.Tensor,
                tgt_sizes: torch.Tensor) -> torch.Tensor:
        """Resample patch features `x` (B * L * D) into B * Q * D."""
        assert x.shape[0] == tgt_sizes.shape[0]
        bs = x.shape[0]
        device, dtype = x.device, x.dtype

        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
        self._adjust_pos_cache(tgt_sizes, device=device)

        max_patch_len = patch_len.max().item()
        assert isinstance(max_patch_len, int)

        # True marks padded key positions that attention must ignore.
        key_padding_mask = torch.zeros((bs, max_patch_len),
                                       dtype=torch.bool,
                                       device=device)

        per_image_pos = []
        for i in range(bs):
            tgt_h, tgt_w = tgt_sizes[i].tolist()
            num_patches = tgt_h * tgt_w
            window = self.pos_embed[:tgt_h, :tgt_w, :]
            per_image_pos.append(
                window.reshape((num_patches, -1)).to(dtype))  # patches * D
            key_padding_mask[i, num_patches:] = True

        padded_pos = torch.nn.utils.rnn.pad_sequence(per_image_pos,
                                                     batch_first=True,
                                                     padding_value=0.0)
        pos_embed = padded_pos.permute(1, 0, 2)  # BLD => L * B * D

        x, _ = self.kv_proj(x)  # B * L * D
        x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D

        q = self.ln_q(self.query)  # Q * D
        attended = self.attn(
            self._repeat(q, bs),  # Q * B * D
            x + pos_embed,  # L * B * D + L * B * D
            x,
            key_padding_mask=key_padding_mask,
        )[0]  # Q * B * D

        x = self.ln_post(attended.permute(1, 0, 2))  # B * Q * D
        return x @ self.proj
def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
    """Return the MiniCPM-V version of `config` as a tuple of ints.

    A `version` attribute such as 2.5 becomes (2, 5). Without one, the
    version is guessed from `hidden_size` and `query_num`.
    """
    raw_version = getattr(config, "version", None)
    if raw_version is not None:
        return tuple(int(part) for part in str(raw_version).split("."))

    # The old configs do not include version number
    # TODO: Remove this after the HF repos are updated
    looks_like_v2_0 = config.hidden_size == 2304 and config.query_num == 64
    return (2, 0) if looks_like_v2_0 else (2, 5)
def get_max_minicpmv_image_tokens(ctx: InputContext):
    """Return `query_num` from the HF config, or 64 when it is absent."""
    return getattr(ctx.get_hf_config(PretrainedConfig), "query_num", 64)
def dummy_seq_data_for_minicpmv(seq_len: int):
    """Build placeholder sequence data made of `seq_len` zero token ids."""
    return SequenceData([0] * seq_len)
def dummy_image_for_minicpmv(hf_config: PretrainedConfig):
    """Build a square, all-zero RGB image whose side is `image_size`."""
    side = hf_config.image_size
    blank = Image.new("RGB", (side, side), color=0)
    return {"image": blank}
def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int):
    """Return the (sequence data, multi-modal data) pair of dummy inputs."""
    hf_config = ctx.get_hf_config(PretrainedConfig)
    return (
        dummy_seq_data_for_minicpmv(seq_len),
        dummy_image_for_minicpmv(hf_config),
    )
def input_processor_for_minicpmv(ctx: InputContext, llm_inputs: LLMInputs):
    """Expand image tags in the prompt into image-processor placeholders.

    Inputs without image data are returned unchanged. Otherwise each image
    tag in the prompt is replaced by the slice placeholder that the image
    processor produces for the matching image, and the prompt is
    re-tokenized. If the prompt contains no image tag, the original prompt
    and token ids are kept.

    Args:
        ctx: Input context giving access to the model config.
        llm_inputs: Incoming inputs; reads "prompt", "prompt_token_ids" and
            "multi_modal_data".

    Returns:
        The original `llm_inputs` when there is no image data, otherwise a
        new `LLMInputs` with the processed prompt and token ids.
    """
    multi_modal_data = llm_inputs.get("multi_modal_data")
    if multi_modal_data is None or "image" not in multi_modal_data:
        return llm_inputs

    model_config = ctx.model_config
    version = get_version_by_config(model_config.hf_config)
    tokenizer = cached_get_tokenizer(model_config.tokenizer,
                                     trust_remote_code=True)
    image_processor = cached_get_image_processor(model_config.tokenizer)

    def get_placeholder(image_size: Tuple[int, int], num_image: int):
        # Versions (2, 0) and (2, 5) take only the image size; later
        # versions also receive the image index.
        if version == (2, 0) or version == (2, 5):
            return image_processor. \
                get_slice_image_placeholder(image_size)
        return image_processor. \
            get_slice_image_placeholder(image_size, num_image)

    # Read the token ids unconditionally: the "no image tag" path below
    # needs them even when a text prompt was supplied. Previously they were
    # only bound when the prompt was missing, which raised
    # UnboundLocalError on that path.
    token_ids = llm_inputs.get("prompt_token_ids")
    prompt = llm_inputs.get("prompt")
    if prompt is None:
        prompt = tokenizer.decode(token_ids)

    # NOTE(review): `pattern` is used as a regular expression by re.findall
    # (the parentheses form a group and "." matches any character) but as a
    # literal string by str.split. The two agree only when the prompt holds
    # the literal text "(<image>./</image>)" - confirm this is intended.
    pattern = "(<image>./</image>)"
    images = multi_modal_data["image"]
    if isinstance(images, Image.Image):
        images = [images]
    image_tags = re.findall(pattern, prompt)

    if len(image_tags) == 0:
        new_token_ids = token_ids
        new_prompt = prompt
    else:
        text_chunks = prompt.split(pattern)
        new_prompt_chunks: List[str] = []
        for i in range(len(images)):
            new_prompt_chunks += [
                text_chunks[i],
                get_placeholder(images[i].size, i)
            ]
        new_prompt_chunks.append(text_chunks[-1])
        new_prompt = "".join(new_prompt_chunks)
        new_token_ids = tokenizer.encode(new_prompt)

    llm_inputs = LLMInputs(
        prompt_token_ids=new_token_ids,
        prompt=new_prompt,
        multi_modal_data=multi_modal_data,
    )
    return llm_inputs
class MiniCPMVBaseModel(nn.Module, SupportsMultiModal):
    """
    The abstract class of MiniCPMV can only be inherited, but cannot be
    instantiated.

    Subclasses provide the language model, vision module and resampler via
    the `init_*` hooks, and implement `get_vision_embedding`,
    `get_vision_hidden_states` and `is_default_weight_loading`; the versions
    defined here raise NotImplementedError.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        """Build the LLM, vision module, resampler and output head."""
        super().__init__()
        self.config = config
        self.multimodal_config = multimodal_config
        self.version = get_version_by_config(self.config)
        self.llm = self.init_llm(config, cache_config, quant_config)
        self.vpm = self.init_vision_module()
        param_dtype = torch.get_default_dtype()
        self.vpm.to(dtype=param_dtype)
        # Version (2, 0) reads the width from the vision module itself; other
        # versions read it from the module's `embeddings` submodule.
        self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else
                           self.vpm.embeddings.embed_dim)
        self.embed_dim = self.config.hidden_size
        self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
        # NOTE(review): the resampler is moved to "cuda" unconditionally.
        self.resampler.to(device="cuda", dtype=param_dtype)
        self.lm_head = ParallelLMHead(config.vocab_size,
                                      config.hidden_size,
                                      quant_config=quant_config)
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = Sampler()

    def get_embedding(
        self,
        input_ids: torch.Tensor,
        image_inputs: Optional[MiniCPMVImageInputs],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed `input_ids` and splice vision features into image slots.

        Returns the (possibly modified) token embeddings together with the
        vision hidden states; the latter is an empty tensor when there are
        no image inputs.
        """
        vlm_embedding: torch.Tensor = self.llm.embed_tokens(input_ids)
        if hasattr(self.config, "scale_emb"):
            vlm_embedding *= self.config.scale_emb

        if image_inputs is None:  # No image
            vision_hidden_states = torch.tensor([], device=input_ids.device)
        else:
            vision_hidden_states = self.get_vision_hidden_states(image_inputs)

            # See NOTE in _parse_and_validate_inputs
            image_bounds = image_inputs["image_bounds"]
            if len(image_bounds) > 0:
                # One row of token positions per (start, end) pair;
                # torch.stack requires every range to have the same length.
                image_indices = torch.stack([
                    torch.arange(start, end, dtype=torch.long)
                    for start, end in image_bounds.tolist()
                ]).to(vlm_embedding.device)
                # Overwrite the embedding rows at those positions with the
                # vision features, flattened to one row per position.
                vlm_embedding.scatter_(
                    0,
                    image_indices.view(-1, 1).repeat(1,
                                                     vlm_embedding.shape[-1]),
                    vision_hidden_states.view(-1,
                                              vision_hidden_states.shape[-1]),
                )

        return vlm_embedding, vision_hidden_states

    def _get_image_bounds(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Locate image spans in `input_ids` as rows of (start, end).

        `start` is the index right after an image/slice start token and
        `end` is the index of the matching end token.
        """
        tokenizer = cached_get_tokenizer(self.config._name_or_path,
                                         trust_remote_code=True)
        start_cond = input_ids == tokenizer.im_start_id
        end_cond = input_ids == tokenizer.im_end_id
        # Tokenizers that define slice markers contribute those as well.
        if hasattr(tokenizer, "slice_start_id"):
            start_cond |= (input_ids == tokenizer.slice_start_id)
            end_cond |= (input_ids == tokenizer.slice_end_id)

        image_start_tokens, = torch.where(start_cond)
        # Skip the start marker itself.
        image_start_tokens += 1
        image_end_tokens, = torch.where(end_cond)
        # NOTE(review): with `max`, the slices below never truncate anything;
        # if the two counts differ, hstack presumably fails on the size
        # mismatch - confirm whether `min` was intended.
        valid_image_nums = max(len(image_start_tokens), len(image_end_tokens))

        if valid_image_nums == 0:
            return torch.zeros((0, 2), device=input_ids.device)

        return torch.hstack([
            image_start_tokens[:valid_image_nums].unsqueeze(-1),
            image_end_tokens[:valid_image_nums].unsqueeze(-1),
        ])

    def _parse_and_validate_inputs(
        self,
        input_ids: torch.Tensor,
        **kwargs: object,
    ) -> Optional[MiniCPMVImageInputs]:
        """Extract and check the image keyword inputs.

        Reads `pixel_values` and `tgt_sizes` from `kwargs`, flattens them
        across the batch, and returns None when no image is present.

        Raises:
            ValueError: if either input has the wrong type or their lengths
                disagree (before or after flattening).
        """
        pixel_values = kwargs.pop("pixel_values", [])
        tgt_sizes = kwargs.pop("tgt_sizes", [])

        if not isinstance(pixel_values, (torch.Tensor, list)):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        if not isinstance(tgt_sizes, (torch.Tensor, list)):
            raise ValueError("Incorrect type of target sizes. "
                             f"Got type: {type(tgt_sizes)}")

        if len(pixel_values) != len(tgt_sizes):
            raise ValueError("Inconsistent batch lengths, found: "
                             f"{len(pixel_values)} vs. {len(tgt_sizes)}")

        # Concatenate the per-request entries into flat lists.
        pixel_values_flat: List[torch.Tensor] = []
        tgt_sizes_flat: List[torch.Tensor] = []
        for b in range(len(pixel_values)):
            pixel_values_flat += pixel_values[b]
            tgt_sizes_flat += tgt_sizes[b]

        # NOTE: Input IDs does not contain image tokens during memory profiling,
        # so we allow it to be empty
        if len(pixel_values_flat) != len(tgt_sizes_flat):
            raise ValueError("Inconsistent flattened lengths, found: "
                             f"{len(pixel_values_flat)} vs. "
                             f"{len(tgt_sizes_flat)}")

        if len(pixel_values_flat) == 0:
            return None

        return MiniCPMVImageInputs(
            image_bounds=self._get_image_bounds(input_ids),
            pixel_values=pixel_values_flat,
            tgt_sizes=torch.stack(tgt_sizes_flat),
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        **kwargs: Any,
    ) -> torch.Tensor:
        """Run the LLM on embeddings that already contain image features."""
        image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs)

        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)

        # The LLM receives embeddings only, hence input_ids=None.
        output = self.llm(
            input_ids=None,
            positions=positions,
            kv_caches=kv_caches,
            attn_metadata=attn_metadata,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=vlm_embeddings,
        )
        return output

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Project hidden states to logits through `lm_head`."""
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Sample next tokens from `logits` with the model's sampler."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint tensors into this model's parameters.

        Names are rewritten with `_KEYS_TO_MODIFY_MAPPING`, rotary-embedding
        caches are skipped, and q/k/v and gate/up shards are routed into
        their fused parameters unless the subclass requests default loading
        for that name.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        # The iterable is materialised before being handed to progress_bar.
        weights_list = list(weights)
        for name, loaded_weight in progress_bar(weights_list,
                                                desc="Loading modules..."):
            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                if key_to_modify in name:
                    name = name.replace(key_to_modify, new_key)
            if "rotary_emb.inv_freq" in name:
                continue
            if ("rotary_emb.cos_cached" in name
                    or "rotary_emb.sin_cached" in name):
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            use_default_weight_loading = False
            if self.is_default_weight_loading(name):
                use_default_weight_loading = True
            else:
                for param_name, weight_name, shard_id in stacked_params_mapping:
                    if weight_name not in name:
                        continue
                    # Shard of a fused parameter: load it into its slot.
                    param = params_dict[name.replace(weight_name, param_name)]
                    weight_loader = param.weight_loader
                    weight_loader(param, loaded_weight, shard_id)
                    break
                else:
                    # No stacked mapping matched: treat as a plain parameter.
                    use_default_weight_loading = True
            if use_default_weight_loading:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Hook: build the language model. Must be overridden."""
        raise NotImplementedError

    def init_vision_module(self) -> nn.Module:
        """Hook: build the vision module. Must be overridden."""
        raise NotImplementedError

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Hook: build the resampler. Must be overridden."""
        raise NotImplementedError

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Hook: embed pixel values. Must be overridden."""
        raise NotImplementedError

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Hook: turn parsed image inputs into vision hidden states."""
        raise NotImplementedError

    def is_default_weight_loading(self, name: str) -> bool:
        """Hook: whether `name` bypasses the stacked-parameter mapping."""
        raise NotImplementedError
class MiniCPMV2(MiniCPMVBaseModel):
    """MiniCPM-V version (2, 0): MiniCPM LLM, timm SigLIP ViT, Resampler2."""

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(config, multimodal_config, cache_config, quant_config)
        assert self.version == (2, 0)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Build the MiniCPM language model."""
        return MiniCPMModel(config,
                            cache_config=cache_config,
                            quant_config=quant_config)

    def init_vision_module(self) -> nn.Module:
        """Create the timm vision transformer, without pretrained weights.

        Raises:
            ImportError: if `timm` is not installed.
        """
        # TODO :refactor this vision model
        try:
            import timm
        except ImportError as e:
            # Chain from the caught exception so the real import failure
            # stays visible in the traceback.
            raise ImportError("Please install timm==0.9.10") from e
        with set_default_torch_dtype(torch.float16):
            model = timm.create_model(
                "vit_so400m_patch14_siglip_384.webli",
                pretrained=False,
                num_classes=0,
                dynamic_img_size=True,
                dynamic_img_pad=True,
            )

        # Replace attention pooling, when present, with a pass-through.
        if (isinstance(model, timm.models.VisionTransformer)
                and model.attn_pool is not None):
            model.attn_pool = torch.nn.Identity()

        if self.config.drop_vision_last_layer:
            model.blocks = model.blocks[:-1]

        return model

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Build the adaptive Resampler2 under a float16 default dtype."""
        with set_default_torch_dtype(torch.float16):
            resampler = Resampler2(
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                grid_size=int(math.sqrt(self.config.query_num)),
                kv_dim=vision_dim,
                adaptive=True,
            )

        return resampler

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode and resample each image, then stack the results.

        `patch_attn_mask` and `tgt_sizes` are accepted for interface
        compatibility and are not used by this version.
        """
        res = []
        dtype = self.vpm.pos_embed.data.dtype
        patch_h = self.vpm.patch_embed.patch_size[0]
        # Use the patch width for the width axis (the height entry was used
        # for both axes before; the two are equal for square patches).
        patch_w = self.vpm.patch_embed.patch_size[1]
        for pixel_value in pixel_values:
            H, W = pixel_value[0].shape[-2:]
            tgt_size = (
                math.ceil(H / patch_h),
                math.ceil(W / patch_w),
            )
            vision_embedding = self.vpm.forward_features(
                pixel_value.unsqueeze(0).type(dtype))
            # Drop prefix tokens so that only patch tokens remain.
            if (hasattr(self.vpm, "num_prefix_tokens")
                    and self.vpm.num_prefix_tokens > 0):
                vision_embedding = vision_embedding[:, self.vpm.
                                                    num_prefix_tokens:]
            res.append(self.resampler(vision_embedding, tgt_size))
        return torch.vstack(res)

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Compute vision features from the parsed pixel values."""
        pixel_values = data["pixel_values"]

        return self.get_vision_embedding(pixel_values)

    def is_default_weight_loading(self, name: str) -> bool:
        """Resampler and vision weights are loaded with the default loader."""
        return "resampler" in name or "vpm" in name
class MiniCPMV2_5(MiniCPMVBaseModel):
    """MiniCPM-V version (2, 5): Llama LLM, Idefics2 ViT, Resampler2_5."""

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(config, multimodal_config, cache_config, quant_config)
        assert self.version == (2, 5)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Build the Llama language model."""
        return LlamaModel(config,
                          cache_config=cache_config,
                          quant_config=quant_config)

    def init_vision_module(self) -> nn.Module:
        """Build the Idefics2 vision transformer from `vision_config`."""
        model = Idefics2VisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]
        return model

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Build the Resampler2_5 under a float16 default dtype."""
        with set_default_torch_dtype(torch.float16):
            resampler = Resampler2_5(
                num_queries=self.config.query_num,
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                kv_dim=vision_dim,
            )
        return resampler

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the vision module, then resample to the query tokens."""
        vision_embedding = self.vpm(pixel_values,
                                    patch_attention_mask=patch_attn_mask)
        vision_embedding = self.resampler(vision_embedding, tgt_sizes)
        return vision_embedding

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Pad the per-image patches into a batch and embed them.

        Builds a (B, 1, max_patches) boolean mask in which only the first
        `height * width` entries of each image are True, so padded patches
        are marked invalid.
        """
        pixel_values = data["pixel_values"]
        tgt_sizes = data["tgt_sizes"]

        device = self.vpm.embeddings.position_embedding.weight.device
        dtype = self.vpm.embeddings.position_embedding.weight.dtype
        # Flatten the first two dims of every image and put the remaining
        # (patch) axis first so pad_sequence pads along it.
        all_pixel_values_lst = [
            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
        ]

        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
        assert isinstance(max_patches, int)

        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
            all_pixel_values_lst, batch_first=True, padding_value=0.0)
        B, L, _ = all_pixel_values.shape
        all_pixel_values = all_pixel_values.permute(0, 2,
                                                    1).reshape(B, 3, -1, L)

        patch_attn_mask = torch.zeros((B, 1, max_patches),
                                      dtype=torch.bool,
                                      device=device)
        for i in range(B):
            # Index the singleton middle dimension explicitly. Slicing it
            # (`[i, :n]`) selected the whole row and marked every padded
            # patch as valid.
            patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True

        return self.get_vision_embedding(all_pixel_values.type(dtype),
                                         patch_attn_mask, tgt_sizes)

    def is_default_weight_loading(self, name: str) -> bool:
        """Only resampler weights are loaded with the default loader."""
        return "resampler" in name
# NOTE: Currently, information about this model is unavailable. We are
# temporarily using `MiniCPMVQwen2` as its name. The name may need
# to be modified in the future.
class MiniCPMVQwen2(MiniCPMVBaseModel):
    """MiniCPM-V variant pairing a Qwen2 LLM with a SigLIP vision module."""

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(config, multimodal_config, cache_config, quant_config)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Build the Qwen2 language model."""
        return Qwen2Model(config,
                          cache_config=cache_config,
                          quant_config=quant_config)

    def init_vision_module(self) -> nn.Module:
        """Build the SigLIP vision transformer from `vision_config`."""
        # A custom version of SiglipVisionTransformer, won't work with TP
        from aphrodite.modeling.models.na_vit import SiglipVisionTransformer

        # Only flash attention is forwarded; anything else (sdpa is not
        # supported) falls back to eager.
        use_flash = self.config._attn_implementation == "flash_attention_2"
        self.config.vision_config._attn_implementation = (
            "flash_attention_2" if use_flash else "eager")

        model = SiglipVisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]
        return model

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Build the Resampler2_5 under a float16 default dtype."""
        with set_default_torch_dtype(torch.float16):
            resampler = Resampler2_5(
                num_queries=self.config.query_num,
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                kv_dim=vision_dim,
            )
        return resampler

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return the vision module's last hidden state (no resampling)."""
        vision_output = self.vpm(
            pixel_values,
            patch_attention_mask=patch_attn_mask,
            tgt_sizes=tgt_sizes,
        )
        return vision_output.last_hidden_state

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Pad the per-image patches into a batch, embed and resample them."""
        pixel_values = data["pixel_values"]
        tgt_sizes = data["tgt_sizes"]

        pos_weight = self.vpm.embeddings.position_embedding.weight
        device, dtype = pos_weight.device, pos_weight.dtype

        # Flatten the first two dims of every image and put the remaining
        # (patch) axis first so pad_sequence pads along it.
        flat_patches = [
            image.flatten(end_dim=1).permute(1, 0) for image in pixel_values
        ]

        patch_counts = tgt_sizes[:, 0] * tgt_sizes[:, 1]
        max_patches = patch_counts.max().item()
        assert isinstance(max_patches, int)

        padded = torch.nn.utils.rnn.pad_sequence(flat_patches,
                                                 batch_first=True,
                                                 padding_value=0.0)
        B, L, _ = padded.shape
        padded = padded.permute(0, 2, 1).reshape(B, 3, -1, L)

        # True marks real patches; padded positions stay False.
        patch_attn_mask = torch.zeros((B, 1, max_patches),
                                      dtype=torch.bool,
                                      device=device)
        for i in range(B):
            patch_attn_mask[i, 0, :patch_counts[i]] = True

        vision_embedding = self.get_vision_embedding(padded.type(dtype),
                                                     patch_attn_mask,
                                                     tgt_sizes)
        return self.resampler(vision_embedding, tgt_sizes)

    def is_default_weight_loading(self, name: str) -> bool:
        """Resampler and vision weights are loaded with the default loader."""
        return "vpm" in name or "resampler" in name
@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv)
@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv)
class MiniCPMV(MiniCPMVBaseModel):
    """
    Different versions of MiniCPMV use different visual encoders and LLMs,
    which is not conducive to the current integration logic of LoRA and
    bitsandbytes in aphrodite. Therefore, it is necessary to separate them.

    Constructing `MiniCPMV(...)` returns an instance of the concrete class
    for the version found in `config`, not an instance of `MiniCPMV` itself.
    """

    def __new__(
        cls,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        # Reuse the shared helper so the dispatch always agrees with the
        # `self.version` computed in MiniCPMVBaseModel.__init__. It also
        # covers configs whose `version` attribute exists but is None.
        version = get_version_by_config(config)

        # Dispatch class based on version
        if version == (2, 0):
            instance_class = MiniCPMV2
        elif version == (2, 5):
            instance_class = MiniCPMV2_5
        else:
            instance_class = MiniCPMVQwen2
        return instance_class(config, multimodal_config, cache_config,
                              quant_config)
|