|
@@ -12,14 +12,14 @@ import filelock
|
|
|
import huggingface_hub.constants
|
|
|
import numpy as np
|
|
|
import torch
|
|
|
-from huggingface_hub import HfFileSystem, snapshot_download
|
|
|
+from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
|
|
|
+from loguru import logger
|
|
|
from safetensors.torch import load_file, safe_open, save_file
|
|
|
from tqdm.auto import tqdm
|
|
|
-from loguru import logger
|
|
|
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
|
|
|
|
|
|
from aphrodite.common.config import LoadConfig, ModelConfig
|
|
|
-from aphrodite.quantization import (QuantizationConfig,
|
|
|
- get_quantization_config)
|
|
|
+from aphrodite.quantization import QuantizationConfig, get_quantization_config
|
|
|
from aphrodite.quantization.schema import QuantParamSchema
|
|
|
|
|
|
# use system-level temp directory for file locks, so that multiple users
|
|
@@ -215,6 +215,66 @@ def download_weights_from_hf(
|
|
|
return hf_folder
|
|
|
|
|
|
|
|
|
def download_safetensors_index_file_from_hf(
    model_name_or_path: str,
    cache_dir: Optional[str],
    revision: Optional[str] = None,
) -> None:
    """Download hf safetensors index file from Hugging Face Hub.

    Missing index files are expected for many models (only sharded
    checkpoints ship one), so a not-found error is logged and swallowed
    rather than raised.

    Args:
        model_name_or_path (str): The model name or path.
        cache_dir (Optional[str]): The cache directory to store the model
            weights. If None, will use HF defaults.
        revision (Optional[str]): The revision of the model.
    """
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model_name_or_path, cache_dir):
        try:
            # Download the safetensors index file.
            hf_hub_download(
                repo_id=model_name_or_path,
                filename=SAFE_WEIGHTS_INDEX_NAME,
                cache_dir=cache_dir,
                revision=revision,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            )
        # If file not found on remote or locally, we should not fail since
        # only some models will have SAFE_WEIGHTS_INDEX_NAME.
        # NOTE: LocalEntryNotFoundError subclasses EntryNotFoundError, so it
        # must be caught FIRST — with the opposite order the local-cache
        # branch is unreachable and a local miss logs the "remote" message.
        except huggingface_hub.utils.LocalEntryNotFoundError:
            logger.info(f"No {SAFE_WEIGHTS_INDEX_NAME} found in local cache.")
        except huggingface_hub.utils.EntryNotFoundError:
            logger.info(f"No {SAFE_WEIGHTS_INDEX_NAME} found in remote.")
|
|
|
+
|
|
|
# For models like Mistral-7B-v0.3, there are both sharded
# safetensors files and a consolidated safetensors file.
# Passing both of these to the weight loader functionality breaks.
# So, we use the SAFE_WEIGHTS_INDEX_NAME to
# look up which safetensors files should be used.
def filter_duplicate_safetensors_files(
        hf_weights_files: List[str],
        hf_folder: str,
        index_file: Optional[str] = None) -> List[str]:
    """Keep only the safetensors files referenced by the index file.

    Args:
        hf_weights_files (List[str]): Candidate safetensors file paths.
        hf_folder (str): Folder containing the checkpoint files.
        index_file (Optional[str]): Name of the index file relative to
            ``hf_folder``. Defaults to ``SAFE_WEIGHTS_INDEX_NAME``
            (``model.safetensors.index.json``).

    Returns:
        List[str]: The filtered file list, or the input list unchanged
        when no index file exists (single-file checkpoints).
    """
    if index_file is None:
        index_file = SAFE_WEIGHTS_INDEX_NAME
    # model.safetensors.index.json is a mapping from keys in the
    # torch state_dict to safetensors file holding that weight.
    index_file_name = os.path.join(hf_folder, index_file)
    if not os.path.isfile(index_file_name):
        return hf_weights_files

    # Collect the set of safetensors files the weight_map
    # (weight_name -> safetensors file) actually references.
    with open(index_file_name) as index_file_obj:
        weight_map = json.load(index_file_obj)["weight_map"]
    weight_files_in_index = {
        os.path.join(hf_folder, shard_name)
        for shard_name in weight_map.values()
    }
    # Filter out any files that are not found in the index file.
    return [f for f in hf_weights_files if f in weight_files_in_index]
|
|
|
+
|
|
|
+
|
|
|
def filter_files_not_needed_for_inference(
|
|
|
hf_weights_files: List[str]) -> List[str]:
|
|
|
"""
|