@@ -1,12 +1,18 @@
 import contextlib
+import enum
+import json
 import os
 from pathlib import Path
-from typing import Dict, Optional, Type, Union
+from typing import Any, Dict, Optional, Type, Union
 
+import huggingface_hub
+from huggingface_hub import (file_exists, hf_hub_download,
+                             try_to_load_from_cache)
 from loguru import logger
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from aphrodite.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                                   InternVLChatConfig,
@@ -22,6 +28,8 @@ if APHRODITE_USE_MODELSCOPE:
 else:
     from transformers import AutoConfig
 
+MISTRAL_CONFIG_NAME = "params.json"
+
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     "chatglm": ChatGLMConfig,
     "dbrx": DbrxConfig,
@@ -39,6 +47,35 @@ for name, cls in _CONFIG_REGISTRY.items():
     AutoConfig.register(name, cls)
 
 
+class ConfigFormat(str, enum.Enum):
+    AUTO = "auto"
+    HF = "hf"
+    MISTRAL = "mistral"
+
+
+def file_or_path_exists(model: Union[str, Path], config_name, revision,
+                        token) -> bool:
+    if Path(model).exists():
+        return (Path(model) / config_name).is_file()
+
+    # Offline mode support: Check if config file is cached already
+    cached_filepath = try_to_load_from_cache(repo_id=model,
+                                             filename=config_name,
+                                             revision=revision)
+    if isinstance(cached_filepath, str):
+        # The config file exists in cache - we can continue trying to load
+        return True
+
+    # NB: file_exists will only check for the existence of the config file on
+    # hf_hub. This will fail in offline mode.
+    try:
+        return file_exists(model, config_name, revision=revision, token=token)
+    except huggingface_hub.errors.OfflineModeIsEnabled:
+        # Don't raise in offline mode, all we know is that we don't have this
+        # file cached.
+        return False
+
+
 def get_config(
     model: Union[str, Path],
     trust_remote_code: bool,
@@ -46,38 +83,77 @@ def get_config(
     code_revision: Optional[str] = None,
     rope_scaling: Optional[dict] = None,
     rope_theta: Optional[float] = None,
+    config_format: ConfigFormat = ConfigFormat.AUTO,
     **kwargs,
 ) -> PretrainedConfig:
-
     # Separate model folder from file path for GGUF models
+
     is_gguf = check_gguf_file(model)
     if is_gguf:
         kwargs["gguf_file"] = Path(model).name
         model = Path(model).parent
 
-    try:
-        config = AutoConfig.from_pretrained(
-            model,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            code_revision=code_revision,
-            **kwargs)
-    except ValueError as e:
-        if (not trust_remote_code and
-                "requires you to execute the configuration file" in str(e)):
-            err_msg = (
-                "Failed to load the model config. If the model is a custom "
-                "model not yet available in the HuggingFace transformers "
-                "library, consider setting `trust_remote_code=True` in LLM "
-                "or using the `--trust-remote-code` flag in the CLI.")
-            raise RuntimeError(err_msg) from e
+    if config_format == ConfigFormat.AUTO:
+        if is_gguf or file_or_path_exists(model,
+                                          HF_CONFIG_NAME,
+                                          revision=revision,
+                                          token=kwargs.get("token")):
+            config_format = ConfigFormat.HF
+        elif file_or_path_exists(model,
+                                 MISTRAL_CONFIG_NAME,
+                                 revision=revision,
+                                 token=kwargs.get("token")):
+            config_format = ConfigFormat.MISTRAL
         else:
-            raise e
-    if config.model_type in _CONFIG_REGISTRY:
-        config_class = _CONFIG_REGISTRY[config.model_type]
-        config = config_class.from_pretrained(model,
-                                              revision=revision,
-                                              code_revision=code_revision)
+            # If we're in offline mode and found no valid config format, then
+            # raise an offline mode error to indicate to the user that they
+            # don't have files cached and may need to go online.
+            # This is conveniently triggered by calling file_exists().
+            file_exists(model,
+                        HF_CONFIG_NAME,
+                        revision=revision,
+                        token=kwargs.get("token"))
+
+            raise ValueError(f"No supported config format found in {model}")
+
+    if config_format == ConfigFormat.HF:
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model, revision=revision, code_revision=code_revision, **kwargs)
+
+        # Use custom model class if it's in our registry
+        model_type = config_dict.get("model_type")
+        if model_type in _CONFIG_REGISTRY:
+            config_class = _CONFIG_REGISTRY[model_type]
+            config = config_class.from_pretrained(model,
+                                                  revision=revision,
+                                                  code_revision=code_revision)
+        else:
+            try:
+                config = AutoConfig.from_pretrained(
+                    model,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    code_revision=code_revision,
+                    **kwargs,
+                )
+            except ValueError as e:
+                if (not trust_remote_code
+                        and "requires you to execute the configuration file"
+                        in str(e)):
+                    err_msg = (
+                        "Failed to load the model config. If the model "
+                        "is a custom model not yet available in the "
+                        "HuggingFace transformers library, consider setting "
+                        "`trust_remote_code=True` in LLM or using the "
+                        "`--trust-remote-code` flag in the CLI.")
+                    raise RuntimeError(err_msg) from e
+                else:
+                    raise e
+
+    elif config_format == ConfigFormat.MISTRAL:
+        config = load_params_config(model, revision)
+    else:
+        raise ValueError(f"Unsupported config format: {config_format}")
 
     # Special architecture mapping check for GGUF models
     if is_gguf:
@@ -86,13 +162,82 @@ def get_config(
             f"Can't get gguf config for {config.model_type}.")
         model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
         config.update({"architectures": [model_type]})
-    for key, value in [("rope_scaling", rope_scaling),
-                       ("rope_theta", rope_theta)]:
+
+    for key, value in [
+        ("rope_scaling", rope_scaling),
+        ("rope_theta", rope_theta),
+    ]:
         if value is not None:
-            logger.info(f"Updating {key} from "
-                        f"{getattr(config, key, None)} to {value}")
+            logger.info(
+                "Updating %s from %r to %r",
+                key,
+                getattr(config, key, None),
+                value,
+            )
             config.update({key: value})
+
+    return config
+
+
+def load_params_config(model, revision) -> PretrainedConfig:
+    # This function loads a params.json config which
+    # should be used when loading models in mistral format
+
+    config_file_name = "params.json"
+
+    config_path = Path(model) / config_file_name
+
+    if not config_path.is_file():
+        config_path = Path(
+            hf_hub_download(model, config_file_name, revision=revision))
+
+    with open(config_path, "r") as file:
+        config_dict = json.load(file)
+
+    config_mapping = {
+        "dim": "hidden_size",
+        "norm_eps": "rms_norm_eps",
+        "n_kv_heads": "num_key_value_heads",
+        "n_layers": "num_hidden_layers",
+        "n_heads": "num_attention_heads",
+        "hidden_dim": "intermediate_size",
+    }
+
+    def recurse_elems(elem: Any):
+        if isinstance(elem, dict):
+            config_dict = {}
+            for key, value in elem.items():
+                key = config_mapping.get(key, key)
+                config_dict[key] = recurse_elems(value)
+            return PretrainedConfig(**config_dict)
+        else:
+            return elem
+
+    config_dict["model_type"] = config_dict.get("model_type", "transformer")
+    config_dict["hidden_act"] = config_dict.get("activation", "silu")
+    config_dict["tie_word_embeddings"] = config_dict.get(
+        "tie_embeddings", False)
+    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
+    config_dict["max_position_embeddings"] = config_dict.get(
+        "max_position_embeddings", 128_000)
+
+    if config_dict.get("moe") is not None:
+        config_dict["architectures"] = ["MixtralForCausalLM"]
+    else:
+        config_dict["architectures"] = ["MistralForCausalLM"]
+
+    if config_dict.get("vision_encoder") is not None:
+        multimodal_config = config_dict.pop("vision_encoder")
+
+        config_dict = {
+            "text_config": config_dict,
+            "vision_config": multimodal_config
+        }
+        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
+        config_dict["model_type"] = "pixtral"
+
+    config = recurse_elems(config_dict)
     return config
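
Usage note (not part of the diff): a minimal sketch of how the new config_format parameter might be exercised, assuming this change lands in aphrodite.transformers_utils.config and that the placeholder path below points at a checkpoint directory containing a Mistral-format params.json:

    from aphrodite.transformers_utils.config import ConfigFormat, get_config

    # Placeholder local path; any directory shipping params.json instead of
    # config.json would do.
    config = get_config(
        "/models/mistral-7b-instruct",
        trust_remote_code=False,
        config_format=ConfigFormat.MISTRAL,  # bypass AUTO detection
    )
    print(config.architectures)  # e.g. ["MistralForCausalLM"]

With ConfigFormat.AUTO (the default), get_config first looks for a HuggingFace config.json and falls back to params.json, so passing MISTRAL explicitly should only be needed when both files are present or detection is to be skipped.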