|
@@ -51,6 +51,10 @@ _KEYS_TO_MODIFY_MAPPING = {
|
|
|
# Cannot find the following 2 numbers from hf config.
|
|
|
_IMAGE_TOKEN_ID = 32044
|
|
|
|
|
|
+# Image size that results in the max possible feature size (h:w = 16:1)
|
|
|
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000
|
|
|
+MAX_IMAGE_FEATURE_SIZE_WIDTH = 50
|
|
|
+
|
|
|
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
|
|
|
hidden_act="quick_gelu",
|
|
|
hidden_size=1024,
|
|
@@ -320,24 +324,16 @@ def get_phi3v_image_feature_size(
|
|
|
|
|
|
|
|
|
def get_max_phi3v_image_tokens(ctx: InputContext):
|
|
|
- # Result in the max possible feature size (h:w = 16:1)
|
|
|
- dummy_height, dummy_width = 8000, 50
|
|
|
|
|
|
return get_phi3v_image_feature_size(
|
|
|
ctx.get_hf_config(PretrainedConfig),
|
|
|
- input_height=dummy_height,
|
|
|
- input_width=dummy_width,
|
|
|
+ input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
|
|
|
+ input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
|
|
|
)
|
|
|
|
|
|
|
|
|
def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
|
|
|
- # Result in the max possible feature size (h:w = 16:1)
|
|
|
- dummy_height, dummy_width = 8000, 50
|
|
|
- image_feature_size = get_phi3v_image_feature_size(
|
|
|
- ctx.get_hf_config(PretrainedConfig),
|
|
|
- input_height=dummy_height,
|
|
|
- input_width=dummy_width,
|
|
|
- )
|
|
|
+ image_feature_size = get_max_phi3v_image_tokens(ctx)
|
|
|
|
|
|
seq_data = dummy_seq_data_for_clip(
|
|
|
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
|
|
@@ -347,8 +343,8 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
|
|
|
)
|
|
|
mm_data = dummy_image_for_clip(
|
|
|
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
|
|
|
- image_width_override=dummy_width,
|
|
|
- image_height_override=dummy_height,
|
|
|
+ image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
|
|
|
+ image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
|
|
|
)
|
|
|
|
|
|
return seq_data, mm_data
|