
VLM: use `SequenceData.from_token_counts` to create dummy data (#1093)

* VLM: use `SequenceData.from_token_counts` to create dummy data

* fix vision example
AlpinDale 1 month ago
parent
commit
651678d2df

+ 7 - 4
aphrodite/common/sequence.py

@@ -6,7 +6,7 @@ from array import array
 from collections import defaultdict
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Tuple, Union, cast
 
@@ -161,14 +161,17 @@ class SequenceData(msgspec.Struct,
     _mrope_position_delta: Optional[int] = None
 
     @staticmethod
-    def from_counts(counts_by_token: Mapping[int, int]) -> "SequenceData":
-        if len(counts_by_token) == 0:
+    def from_token_counts(*token_counts: Tuple[int, int]) -> "SequenceData":
+        if len(token_counts) == 0:
             return SequenceData.from_seqs([])
+
         arrs = [
             array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
-            for token_id, count in counts_by_token.items()
+            for token_id, count in token_counts
         ]
+
         return SequenceData(reduce(array.__add__, arrs))
+
     @staticmethod
     def from_seqs(
         prompt_token_ids: GenericSequence[int],
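
Note: the new helper expands each `(token_id, count)` pair into a run of repeated token IDs and concatenates the runs in the order given, replacing the old dict-based `from_counts`. A minimal usage sketch follows; the token IDs and lengths are illustrative placeholders, not values from this commit.

    # Sketch only: illustrative values, assuming the aphrodite package is importable.
    from aphrodite.common.sequence import SequenceData

    # Old call style: SequenceData.from_counts({0: 512})
    # New call style: variadic (token_id, count) pairs, concatenated in order.
    dummy = SequenceData.from_token_counts(
        (32000, 64),    # 64 copies of a placeholder token id
        (0, 512 - 64),  # pad the rest of the dummy prompt with token id 0
    )
    # Yields 64 placeholder tokens followed by 448 zeros.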

+ 1 - 1
aphrodite/inputs/registry.py

@@ -118,7 +118,7 @@ class InputRegistry:
         # Avoid circular import
         from aphrodite.common.sequence import SequenceData
 
-        dummy_seq_data = SequenceData.from_counts({0: seq_len})
+        dummy_seq_data = SequenceData.from_token_counts((0, seq_len))
         dummy_multi_modal_data = None
 
         return dummy_seq_data, dummy_multi_modal_data

+ 5 - 7
aphrodite/modeling/models/blip.py

@@ -1,6 +1,5 @@
 """Minimal implementation of BlipVisionModel intended to be only used 
 """Minimal implementation of BlipVisionModel intended to be only used 
 within a vision language model."""
 within a vision language model."""
-from array import array
 from typing import Optional, Union
 from typing import Optional, Union
 
 
 import torch
 import torch
@@ -11,7 +10,6 @@ from transformers.models.blip.modeling_blip import BlipAttention
 
 
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.sequence import SequenceData
 from aphrodite.common.sequence import SequenceData
-from aphrodite.constants import APHRODITE_TOKEN_ID_ARRAY_TYPE
 from aphrodite.distributed import divide, get_tensor_model_parallel_world_size
 from aphrodite.distributed import divide, get_tensor_model_parallel_world_size
 from aphrodite.inputs import LLMInputs
 from aphrodite.inputs import LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
@@ -54,6 +52,7 @@ def get_max_blip_image_tokens(
 def dummy_seq_data_for_blip(
 def dummy_seq_data_for_blip(
     hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
     hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
     seq_len: int,
     seq_len: int,
+    num_images: int,
     *,
     *,
     image_token_id: int,
     image_token_id: int,
     image_feature_size_override: Optional[int] = None,
     image_feature_size_override: Optional[int] = None,
@@ -63,11 +62,10 @@ def dummy_seq_data_for_blip(
     else:
     else:
         image_feature_size = image_feature_size_override
         image_feature_size = image_feature_size_override
 
 
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )
 
 
 
 
 def dummy_image_for_blip(
 def dummy_image_for_blip(
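
Note: the vision models below all follow the same pattern now: the dummy prompt is a run of `image_token_id` placeholders (one feature-sized block per image, so `dummy_seq_data_for_blip` gains a `num_images` parameter and scales by it, which the old code did not) followed by zero padding up to `seq_len`. A sketch of the resulting layout with made-up numbers:

    # Illustrative only: 2 images, 576 image tokens each, sequence length 2048.
    from aphrodite.common.sequence import SequenceData

    image_token_id, image_feature_size, num_images, seq_len = 32000, 576, 2, 2048
    seq_data = SequenceData.from_token_counts(
        (image_token_id, image_feature_size * num_images),  # 1152 placeholder tokens
        (0, seq_len - image_feature_size * num_images),     # 896 padding tokens
    )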

+ 4 - 7
aphrodite/modeling/models/blip2.py

@@ -1,4 +1,3 @@
-from array import array
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)
 
@@ -10,7 +9,6 @@ from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig,
 from aphrodite.attention import AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
 from aphrodite.common.sequence import IntermediateTensors, SequenceData
-from aphrodite.constants import APHRODITE_TOKEN_ID_ARRAY_TYPE
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.logits_processor import LogitsProcessor
@@ -428,11 +426,10 @@ def dummy_seq_data_for_blip2(
     else:
         image_feature_size = image_feature_size_override
 
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )
 
 
 def dummy_data_for_blip2(ctx: InputContext, seq_len: int,

+ 4 - 7
aphrodite/modeling/models/chameleon.py

@@ -1,4 +1,3 @@
-from array import array
 from functools import cached_property
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
                     Tuple, TypedDict)
@@ -13,7 +12,6 @@ from aphrodite.attention import Attention, AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
 from aphrodite.common.sequence import IntermediateTensors, SequenceData
 from aphrodite.common.utils import print_warning_once
-from aphrodite.constants import APHRODITE_TOKEN_ID_ARRAY_TYPE
 from aphrodite.distributed import get_tensor_model_parallel_world_size
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.activation import SiluAndMul
@@ -69,11 +67,10 @@ def dummy_seq_data_for_chameleon(
     else:
         image_feature_size = image_feature_size_override
 
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )
 
 
 def dummy_image_for_chameleon(

+ 5 - 8
aphrodite/modeling/models/clip.py

@@ -1,6 +1,5 @@
 """Minimal implementation of CLIPVisionModel intended to be only used
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
 within a vision language model."""
-from array import array
 from typing import Iterable, List, Optional, Tuple, Union
 from typing import Iterable, List, Optional, Tuple, Union
 
 
 import torch
 import torch
@@ -10,8 +9,7 @@ from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPSdpaAttention
 from transformers.models.clip.modeling_clip import CLIPSdpaAttention
 
 
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.config import ModelConfig
-from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                                       SequenceData)
+from aphrodite.common.sequence import SequenceData
 from aphrodite.distributed import divide, get_tensor_model_parallel_world_size
 from aphrodite.distributed import divide, get_tensor_model_parallel_world_size
 from aphrodite.inputs import LLMInputs
 from aphrodite.inputs import LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
@@ -63,11 +61,10 @@ def dummy_seq_data_for_clip(
     else:
     else:
         image_feature_size = image_feature_size_override
         image_feature_size = image_feature_size_override
 
 
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )
 
 
 
 
 def dummy_image_for_clip(
 def dummy_image_for_clip(

+ 2 - 5
aphrodite/modeling/models/minicpmv.py

@@ -23,7 +23,6 @@
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 import math
 import math
 import re
 import re
-from array import array
 from functools import partial
 from functools import partial
 from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
 from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
                     TypedDict)
                     TypedDict)
@@ -37,8 +36,7 @@ from transformers import PretrainedConfig
 
 
 from aphrodite.attention import AttentionMetadata
 from aphrodite.attention import AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                                       IntermediateTensors, SequenceData)
+from aphrodite.common.sequence import IntermediateTensors, SequenceData
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.linear import ReplicatedLinear
 from aphrodite.modeling.layers.linear import ReplicatedLinear
 from aphrodite.modeling.layers.logits_processor import LogitsProcessor
 from aphrodite.modeling.layers.logits_processor import LogitsProcessor
@@ -256,8 +254,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext):
 
 
 
 
 def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
 def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts((0, seq_len))
 
 
 
 
 def dummy_image_for_minicpmv(hf_config: PretrainedConfig, num_images: int):
 def dummy_image_for_minicpmv(hf_config: PretrainedConfig, num_images: int):

+ 5 - 9
aphrodite/modeling/models/pixtral.py

@@ -1,4 +1,3 @@
-from array import array
 from dataclasses import dataclass, fields
 from itertools import tee
 from typing import Iterable, List, Mapping, Optional, Tuple, Union
@@ -14,8 +13,7 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalMask
 
 from aphrodite.attention import AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                                       IntermediateTensors, SequenceData)
+from aphrodite.common.sequence import IntermediateTensors, SequenceData
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.layernorm import RMSNorm
 from aphrodite.modeling.layers.sampler import SamplerOutput
@@ -64,12 +62,10 @@ def dummy_data_for_pixtral(
 
     num_image_tokens = image_feature_size * num_images
 
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * num_image_tokens
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - num_image_tokens)
-
-    seq_data = SequenceData(token_ids)
+    seq_data = SequenceData.from_token_counts(
+        (image_token_id, num_image_tokens),
+        (0, seq_len - num_image_tokens),
+    )
     mm_data = {"image": num_images * [image]}
     return seq_data, mm_data
 

+ 5 - 6
aphrodite/modeling/models/qwen.py

@@ -7,7 +7,6 @@
 
 import math
 import re
-from array import array
 from functools import partial
 from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
                     Optional, Tuple, TypedDict, Union)
@@ -23,8 +22,7 @@ from transformers import PretrainedConfig
 
 from aphrodite.attention import Attention, AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                                       IntermediateTensors, SequenceData)
+from aphrodite.common.sequence import IntermediateTensors, SequenceData
 from aphrodite.common.utils import is_list_of
 from aphrodite.distributed import (get_pp_group,
                                    get_tensor_model_parallel_world_size)
@@ -819,8 +817,7 @@ def dummy_data_for_qwen(
     # The presence of a visual config indicates this is a multimodal model.
     # If we don't have it, the model is considered an LLM for warmup purposes.
     if not hasattr(hf_config, "visual"):
-        seq_data = SequenceData(array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                                      [0] * seq_len))
+        seq_data = SequenceData.from_token_counts((0, seq_len))
         mm_data = None
         return seq_data, mm_data
 
@@ -847,11 +844,13 @@ def dummy_data_for_qwen(
     if len(toks) < seq_len:
         toks += [0] * (seq_len - len(toks))
 
+    seq_data = SequenceData.from_seqs(toks)
+
     # Build the input images; width/height doesn't actually matter here since
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)
     mm_data = {"image": image if num_images == 1 else [image] * num_images}
-    return SequenceData(array(APHRODITE_TOKEN_ID_ARRAY_TYPE, toks)), mm_data
+    return seq_data, mm_data
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)

+ 6 - 16
aphrodite/modeling/models/qwen2_vl.py

@@ -23,7 +23,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
-from array import array
 from functools import lru_cache, partial
 from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
                     Union)
@@ -46,8 +45,7 @@ from aphrodite.attention.selector import (_Backend, backend_name_to_enum,
                                           get_global_forced_attn_backend)
 from aphrodite.common.config import CacheConfig, MultiModalConfig
 from aphrodite.common.logger import log_once
-from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                                       IntermediateTensors, SequenceData)
+from aphrodite.common.sequence import IntermediateTensors, SequenceData
 from aphrodite.distributed import parallel_state
 from aphrodite.distributed import utils as dist_utils
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
@@ -715,20 +713,12 @@ def dummy_data_for_qwen2_vl(
             "--limit-mm-per-prompt."
         )
     hf_config = ctx.get_hf_config(Qwen2VLConfig)
-    token_ids = array(
-        APHRODITE_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_start_token_id]
+    dummy_seqdata = SequenceData.from_token_counts(
+        (hf_config.vision_start_token_id, 1),
+        (hf_config.image_token_id, max_llm_image_tokens),
+        (hf_config.vision_end_token_id, 1),
+        (0, seq_len - max_llm_image_tokens - 2),
     )
-    token_ids += (
-        array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [hf_config.image_token_id])
-        * max_llm_image_tokens
-    )
-    token_ids += array(
-        APHRODITE_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_end_token_id]
-    )
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [0]) * (
-        seq_len - max_llm_image_tokens - 2
-    )
-    dummy_seqdata = SequenceData(token_ids)
     dummy_image = Image.new(
         "RGB", (max_resized_width, max_resized_height), color=0
     )
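
Note: for Qwen2-VL the dummy sequence has four segments (vision start, image placeholders, vision end, padding), which the variadic form expresses directly. A rough sketch with placeholder token IDs; in the diff above the real values come from `hf_config`.

    # Illustrative sketch only: the token IDs and lengths below are made up.
    from aphrodite.common.sequence import SequenceData

    vision_start_token_id, image_token_id, vision_end_token_id = 1001, 1002, 1003
    max_llm_image_tokens, seq_len = 1024, 4096
    dummy_seqdata = SequenceData.from_token_counts(
        (vision_start_token_id, 1),
        (image_token_id, max_llm_image_tokens),
        (vision_end_token_id, 1),
        (0, seq_len - max_llm_image_tokens - 2),  # remaining positions padded with 0
    )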

+ 4 - 7
aphrodite/modeling/models/siglip.py

@@ -2,7 +2,6 @@
 within a vision language model."""
 
 import math
-from array import array
 from typing import Iterable, List, Optional, Tuple, Union
 
 import torch
@@ -13,7 +12,6 @@ from transformers.models.siglip.modeling_siglip import SiglipSdpaAttention
 
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.sequence import SequenceData
-from aphrodite.constants import APHRODITE_TOKEN_ID_ARRAY_TYPE
 from aphrodite.distributed import divide, get_tensor_model_parallel_world_size
 from aphrodite.inputs import LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
@@ -68,11 +66,10 @@ def dummy_seq_data_for_siglip(
     else:
         image_feature_size = image_feature_size_override
 
-    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size
-    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )
 
 
 def dummy_image_for_siglip(

+ 20 - 8
aphrodite/modeling/models/ultravox.py

@@ -75,15 +75,11 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
     return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND)
 
 
-def dummy_data_for_ultravox(
+def dummy_seq_data_for_ultravox(
     ctx: InputContext,
     seq_len: int,
-    mm_counts: Mapping[str, int],
+    audio_count: int,
 ):
-    feature_extractor = whisper_feature_extractor(ctx)
-
-    audio_count = mm_counts["audio"]
-
     audio_placeholder = array(
         APHRODITE_TOKEN_ID_ARRAY_TYPE,
         [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx)
@@ -93,10 +89,26 @@ def dummy_data_for_ultravox(
     other_token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
                             [0]) * (seq_len - len(audio_token_ids))
 
+    return SequenceData(audio_token_ids + other_token_ids)
+
+def dummy_audio_for_ultravox(
+    ctx: InputContext,
+    audio_count: int,
+):
+    feature_extractor = whisper_feature_extractor(ctx)
     audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1)
-    mm_dict = {"audio": [audio_and_sr] * audio_count}
+    return {"audio": [audio_and_sr] * audio_count}
+
+def dummy_data_for_ultravox(
+    ctx: InputContext,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+):
+    audio_count = mm_counts["audio"]
+    seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count)
+    mm_dict = dummy_audio_for_ultravox(ctx, audio_count)
 
-    return (SequenceData(audio_token_ids + other_token_ids), mm_dict)
+    return (seq_data, mm_dict)
 
 
 def input_mapper_for_ultravox(ctx: InputContext, data: object):

+ 2 - 1
examples/vision/vision_example.py

@@ -148,6 +148,7 @@ def run_minicpmv(question):
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
+        max_model_len=8192,
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -342,7 +343,7 @@ def main(args):
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
+        description='Demo on using Aphrodite for offline inference with '
         'vision language models')
     parser.add_argument('--model-type',
                         '-m',