
fix vision towers

AlpinDale 4 months ago
Commit
2242874526

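Every model file below applies the same change: the dummy token-id sequences used for profiling multimodal inputs are built as typed array objects (array(APHRODITE_TOKEN_ID_ARRAY_TYPE, ...)) instead of plain Python lists, so SequenceData receives a compact typed array rather than a list of boxed ints. A minimal standalone sketch of the pattern; the typecode value and the helper name are assumptions for illustration (the real constant lives in aphrodite/common/sequence.py):

from array import array

# Assumption: APHRODITE_TOKEN_ID_ARRAY_TYPE is the array typecode used for
# token ids (likely "l", a signed C long) defined in
# aphrodite/common/sequence.py; "l" is used here only for illustration.
APHRODITE_TOKEN_ID_ARRAY_TYPE = "l"

def dummy_image_token_ids(image_token_id: int, image_feature_size: int,
                          num_images: int, seq_len: int) -> array:
    # Hypothetical stand-in for the dummy_seq_data_for_* helpers changed
    # below: repeat the image token for every image feature, then pad the
    # rest of the sequence with token id 0. array * int repeats elements
    # exactly like list * int, but the result stays a typed array.
    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
                      [image_token_id]) * image_feature_size * num_images
    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
                       [0]) * (seq_len - image_feature_size * num_images)
    return token_ids

# Example: one image with 576 feature tokens, padded to a 2048-token prompt.
ids = dummy_image_token_ids(32000, 576, 1, 2048)
assert len(ids) == 2048 and ids[0] == 32000 and ids[-1] == 0
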
+ 1 - 1
aphrodite/inputs/registry.py

@@ -34,7 +34,7 @@ class InputContext:
     model_config: "ModelConfig"
     """The configuration of the model."""
 
-    def get_hf_config(self, hf_config_type: Type[C]) -> C:
+    def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C:
         """
         Get the HuggingFace configuration
         (:class:`transformers.PretrainedConfig`) of the model,

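The registry.py change only adds a default for the type parameter, so vision-tower code can call ctx.get_hf_config() with no argument and get the config back typed as the base transformers.PretrainedConfig, while passing a subclass such as CLIPVisionConfig still narrows the return type. A rough sketch of how such a defaulted, type-checked accessor behaves, assuming the method validates the config type; the class below is an illustrative stand-in, not the real InputContext:

from typing import Type, TypeVar

from transformers import CLIPVisionConfig, PretrainedConfig

C = TypeVar("C", bound=PretrainedConfig)

class ContextSketch:
    """Illustrative stand-in for InputContext in aphrodite/inputs/registry.py."""

    def __init__(self, hf_config: PretrainedConfig):
        self.hf_config = hf_config

    def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C:
        # Reject configs of the wrong concrete type; the PretrainedConfig
        # default accepts any HuggingFace config.
        if not isinstance(self.hf_config, hf_config_type):
            raise TypeError(f"expected {hf_config_type.__name__}, "
                            f"got {type(self.hf_config).__name__}")
        return self.hf_config

ctx = ContextSketch(CLIPVisionConfig())
ctx.get_hf_config()                  # works now that the default exists
ctx.get_hf_config(CLIPVisionConfig)  # still narrows to the subclass
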
+ 7 - 3
aphrodite/modeling/models/blip.py

@@ -1,5 +1,6 @@
 """Minimal implementation of BlipVisionModel intended to be only used 
 """Minimal implementation of BlipVisionModel intended to be only used 
 within a vision language model."""
 within a vision language model."""
+from array import array
 from typing import Optional, Union
 from typing import Optional, Union
 
 
 import torch
 import torch
@@ -9,7 +10,8 @@ from transformers import Blip2VisionConfig, BlipVisionConfig
 from transformers.models.blip.modeling_blip import BlipAttention
 from transformers.models.blip.modeling_blip import BlipAttention
 
 
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.config import ModelConfig
-from aphrodite.common.sequence import SequenceData
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       SequenceData)
 from aphrodite.inputs import LLMInputs
 from aphrodite.inputs import LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
@@ -53,8 +55,10 @@ def dummy_seq_data_for_blip(
     else:
     else:
         image_feature_size = image_feature_size_override
         image_feature_size = image_feature_size_override
 
 
-    token_ids = [image_token_id] * image_feature_size
-    token_ids += [0] * (seq_len - image_feature_size)
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * image_feature_size
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size)
     return SequenceData(token_ids)
     return SequenceData(token_ids)
 
 
 
 

+ 7 - 3
aphrodite/modeling/models/blip2.py

@@ -1,3 +1,4 @@
+from array import array
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)
 
@@ -8,7 +9,8 @@ from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig,
 
 from aphrodite.attention import AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       IntermediateTensors, SamplerOutput,
                                        SequenceData)
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
@@ -427,8 +429,10 @@ def dummy_seq_data_for_blip2(
     else:
         image_feature_size = image_feature_size_override
 
-    token_ids = [image_token_id] * image_feature_size * num_images
-    token_ids += [0] * (seq_len - image_feature_size * num_images)
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * image_feature_size * num_images
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size * num_images)
     return SequenceData(token_ids)

+ 7 - 3
aphrodite/modeling/models/chameleon.py

@@ -1,3 +1,4 @@
+from array import array
 from functools import cached_property
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
                     Tuple, TypedDict)
@@ -10,7 +11,8 @@ from transformers import ChameleonConfig, ChameleonVQVAEConfig
 
 from aphrodite.attention import Attention, AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       IntermediateTensors, SamplerOutput,
                                        SequenceData)
 from aphrodite.common.utils import print_warning_once
 from aphrodite.distributed import get_tensor_model_parallel_world_size
@@ -68,8 +70,10 @@ def dummy_seq_data_for_chameleon(
     else:
         image_feature_size = image_feature_size_override
 
-    token_ids = [image_token_id] * image_feature_size * num_images
-    token_ids += [0] * (seq_len - image_feature_size * num_images)
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * image_feature_size * num_images
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size * num_images)
     return SequenceData(token_ids)

+ 7 - 3
aphrodite/modeling/models/clip.py

@@ -1,5 +1,6 @@
 """Minimal implementation of CLIPVisionModel intended to be only used 
 """Minimal implementation of CLIPVisionModel intended to be only used 
 within a vision language model."""
 within a vision language model."""
+from array import array
 from typing import Optional
 from typing import Optional
 
 
 import torch
 import torch
@@ -9,7 +10,8 @@ from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 from transformers.models.clip.modeling_clip import CLIPAttention
 
 
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.config import ModelConfig
-from aphrodite.common.sequence import SequenceData
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       SequenceData)
 from aphrodite.inputs import LLMInputs
 from aphrodite.inputs import LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
 from aphrodite.modeling.layers.linear import (ColumnParallelLinear,
@@ -52,8 +54,10 @@ def dummy_seq_data_for_clip(
     else:
     else:
         image_feature_size = image_feature_size_override
         image_feature_size = image_feature_size_override
 
 
-    token_ids = [image_token_id] * image_feature_size * num_images
-    token_ids += [0] * (seq_len - image_feature_size * num_images)
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * image_feature_size * num_images
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size * num_images)
     return SequenceData(token_ids)
     return SequenceData(token_ids)
 
 
 
 

+ 10 - 4
aphrodite/modeling/models/fuyu.py

@@ -16,6 +16,7 @@
 # limitations under the License.
 """ PyTorch Fuyu model."""
 import math
+from array import array
 from typing import Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict
 
 import torch
@@ -26,7 +27,8 @@ from transformers import FuyuConfig, FuyuImageProcessor
 
 from aphrodite.attention import AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       IntermediateTensors, SamplerOutput,
                                        SequenceData)
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.linear import ColumnParallelLinear
@@ -95,9 +97,13 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int):
     ncol, nrow = get_max_fuyu_image_feature_size()
     image_feature_size = get_max_fuyu_image_tokens(ctx)
 
-    image_token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow
-    token_ids = image_token_ids * num_images
-    token_ids += [0] * (seq_len - image_feature_size * num_images)
+    image_token_ids = (
+        array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol +
+        array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                      image_token_ids) * num_images
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size * num_images)
     return SequenceData(token_ids)

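fuyu.py is the one tower with a non-trivial layout: each dummy image is nrow rows of ncol image tokens, each row terminated by a newline token, and the whole grid is then repeated per image. A small sketch of that row construction with the same assumed typecode; the token-id constants here are placeholders, not necessarily the values defined in fuyu.py:

from array import array

APHRODITE_TOKEN_ID_ARRAY_TYPE = "l"  # assumed typecode, as in the sketch above
_IMAGE_TOKEN_ID = 71011              # placeholder ids for illustration only
_NEWLINE_TOKEN_ID = 71019

def fuyu_dummy_image_tokens(ncol: int, nrow: int) -> array:
    # One row is ncol image tokens plus a trailing newline token; the full
    # dummy image is that row repeated nrow times, mirroring the hunk above.
    row = (array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol +
           array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID]))
    return row * nrow

tokens = fuyu_dummy_image_tokens(ncol=3, nrow=2)
assert list(tokens) == [71011, 71011, 71011, 71019] * 2
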
+ 4 - 2
aphrodite/modeling/models/minicpmv.py

@@ -23,6 +23,7 @@
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 import math
 import math
 import re
 import re
+from array import array
 from functools import partial
 from functools import partial
 from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
 from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
                     TypedDict, Union)
                     TypedDict, Union)
@@ -38,7 +39,8 @@ from transformers.configuration_utils import PretrainedConfig
 
 
 from aphrodite.attention import AttentionMetadata
 from aphrodite.attention import AttentionMetadata
 from aphrodite.common.config import CacheConfig, MultiModalConfig
 from aphrodite.common.config import CacheConfig, MultiModalConfig
-from aphrodite.common.sequence import (IntermediateTensors, SamplerOutput,
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       IntermediateTensors, SamplerOutput,
                                        SequenceData)
                                        SequenceData)
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from aphrodite.modeling.layers.linear import ReplicatedLinear
 from aphrodite.modeling.layers.linear import ReplicatedLinear
@@ -409,7 +411,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext):
 
 
 
 
 def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
 def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
-    token_ids = [0] * seq_len
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len
     return SequenceData(token_ids)
     return SequenceData(token_ids)
 
 
 
 

+ 7 - 3
aphrodite/modeling/models/siglip.py

@@ -2,6 +2,7 @@
 within a vision language model."""
 within a vision language model."""
 
 
 import math
 import math
+from array import array
 from typing import Iterable, Optional, Tuple
 from typing import Iterable, Optional, Tuple
 
 
 import torch
 import torch
@@ -13,7 +14,8 @@ from transformers.models.siglip.modeling_siglip import SiglipAttention
 from xformers.ops import memory_efficient_attention
 from xformers.ops import memory_efficient_attention
 
 
 from aphrodite.common.config import ModelConfig
 from aphrodite.common.config import ModelConfig
-from aphrodite.common.sequence import SequenceData
+from aphrodite.common.sequence import (APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                                       SequenceData)
 from aphrodite.distributed import get_tensor_model_parallel_world_size
 from aphrodite.distributed import get_tensor_model_parallel_world_size
 from aphrodite.inputs import LLMInputs
 from aphrodite.inputs import LLMInputs
 from aphrodite.modeling.layers.activation import get_act_fn
 from aphrodite.modeling.layers.activation import get_act_fn
@@ -62,8 +64,10 @@ def dummy_seq_data_for_siglip(
     else:
     else:
         image_feature_size = image_feature_size_override
         image_feature_size = image_feature_size_override
 
 
-    token_ids = [image_token_id] * image_feature_size * num_images
-    token_ids += [0] * (seq_len - image_feature_size * num_images)
+    token_ids = array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * image_feature_size * num_images
+    token_ids += array(APHRODITE_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size * num_images)
     return SequenceData(token_ids)