david
/
aphrodite-engine
mirror of https://github.com/PygmalionAI/aphrodite-engine


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
							from typing import Optional

import gguf
from transformers import AutoConfig, PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

from aphrodite.transformers_utils.configs import (BaiChuanConfig,
                                                  ChatGLMConfig, MPTConfig,
                                                  QWenConfig, RWConfig)

_CONFIG_REGISTRY = {
    "baichuan": BaiChuanConfig,
    "chatglm": ChatGLMConfig,
    "mpt": MPTConfig,
    "qwen": QWenConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
}


def extract_gguf_config(checkpoint):
    result = gguf.GGUFReader(checkpoint)
    architecture = result.fields['general.architecture']
    architecture = str(bytes(architecture.parts[architecture.data[0]]),
                       encoding='utf-8')
    # Only support llama so far
    if architecture != "llama":
        raise RuntimeError(f"Unsupported architecture {architecture}")

    # write config
    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
    context_length = int(result.fields['llama.context_length'].parts[-1])
    n_layer = int(result.fields['llama.block_count'].parts[-1])
    n_head = int(result.fields['llama.attention.head_count'].parts[-1])
    n_local_heads = int(
        result.fields['llama.attention.head_count_kv'].parts[-1])
    intermediate_size = int(
        result.fields['llama.feed_forward_length'].parts[-1])
    norm_eps = float(
        result.fields['llama.attention.layer_norm_rms_epsilon'].parts[-1])
    dim = int(result.fields['llama.embedding_length'].parts[-1])
    arch = "MixtralForCausalLM"
    if 'llama.expert_count' in result.fields:
        arch = "MixtralForCausalLM"
        name = "mixtral"
    else:
        arch = "LlamaForCausalLM"
        name = "llama"
    model_config = {
        "architectures": [arch],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": dim,
        "intermediate_size": intermediate_size,
        "max_position_embeddings": context_length,
        "model_type": name,
        "num_attention_heads": n_head,
        "num_hidden_layers": n_layer,
        "num_key_value_heads": n_local_heads,
        "rms_norm_eps": norm_eps,
        "torch_dtype": "float16",
        "vocab_size": vocab_size
    }
    if 'llama.rope.freq_base' in result.fields:
        model_config['rope_theta'] = float(
            result.fields['llama.rope.freq_base'].parts[-1])
    if 'llama.expert_count' in result.fields:
        model_config['num_local_experts'] = int(
            result.fields['llama.expert_count'].parts[-1])
        model_config['num_experts_per_tok'] = int(
            result.fields['llama.expert_used_count'].parts[-1])
    if name in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[name]
    else:
        config_class = CONFIG_MAPPING[name]
    hf_config = config_class.from_dict(model_config)
    return hf_config


def get_config(model: str,
               trust_remote_code: bool,
               revision: Optional[str] = None,
               code_revision: Optional[str] = None) -> PretrainedConfig:
    if model.endswith("gguf"):
        return extract_gguf_config(model)
    try:
        config = AutoConfig.from_pretrained(
            model,
            trust_remote_code=trust_remote_code,
            revision=revision,
            code_revision=code_revision)
    except ValueError as e:
        if (not trust_remote_code and
                "requires you to execute the configuration file" in str(e)):
            err_msg = (
                "Failed to load the model config. If the model is a custom "
                "model not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model,
                                              revision=revision,
                                              code_revision=code_revision)
    return config