from typing import Optional

import gguf
from transformers import AutoConfig, PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

from aphrodite.transformers_utils.configs import (BaiChuanConfig,
                                                  ChatGLMConfig, MPTConfig,
                                                  QWenConfig, RWConfig)
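
# Model types that need Aphrodite's custom config classes rather than the
# ones resolved by transformers' AutoConfig.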
_CONFIG_REGISTRY = {
    "baichuan": BaiChuanConfig,
    "chatglm": ChatGLMConfig,
    "mpt": MPTConfig,
    "qwen": QWenConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
}


def extract_gguf_config(checkpoint):
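    """Build a HuggingFace config from the metadata of a GGUF checkpoint.

    Only llama-architecture GGUF files (plain LLaMA and Mixtral) are
    supported; any other architecture raises a RuntimeError.
    """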
    result = gguf.GGUFReader(checkpoint)
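    # The architecture name is stored as raw bytes inside the field's parts
    # array; decode it to a string before checking it.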
    architecture = result.fields['general.architecture']
    architecture = str(bytes(architecture.parts[architecture.data[0]]),
                       encoding='utf-8')
    # Only support llama so far
    if architecture != "llama":
        raise RuntimeError(f"Unsupported architecture {architecture}")

    # write config
    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
    context_length = int(result.fields['llama.context_length'].parts[-1])
    n_layer = int(result.fields['llama.block_count'].parts[-1])
    n_head = int(result.fields['llama.attention.head_count'].parts[-1])
    n_local_heads = int(
        result.fields['llama.attention.head_count_kv'].parts[-1])
    intermediate_size = int(
        result.fields['llama.feed_forward_length'].parts[-1])
    norm_eps = float(
        result.fields['llama.attention.layer_norm_rms_epsilon'].parts[-1])
    dim = int(result.fields['llama.embedding_length'].parts[-1])
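    # A GGUF file that records an expert count describes a Mixtral-style MoE
    # model; otherwise treat it as a plain LLaMA model.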
- arch = "MixtralForCausalLM"
- if 'llama.expert_count' in result.fields:
- arch = "MixtralForCausalLM"
- name = "mixtral"
- else:
- arch = "LlamaForCausalLM"
- name = "llama"
    model_config = {
        "architectures": [arch],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": dim,
        "intermediate_size": intermediate_size,
        "max_position_embeddings": context_length,
        "model_type": name,
        "num_attention_heads": n_head,
        "num_hidden_layers": n_layer,
        "num_key_value_heads": n_local_heads,
        "rms_norm_eps": norm_eps,
        "torch_dtype": "float16",
        "vocab_size": vocab_size
    }
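    # Optional metadata: the RoPE frequency base and MoE expert counts are
    # only present in some checkpoints.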
    if 'llama.rope.freq_base' in result.fields:
        model_config['rope_theta'] = float(
            result.fields['llama.rope.freq_base'].parts[-1])
    if 'llama.expert_count' in result.fields:
        model_config['num_local_experts'] = int(
            result.fields['llama.expert_count'].parts[-1])
        model_config['num_experts_per_tok'] = int(
            result.fields['llama.expert_used_count'].parts[-1])
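    # Prefer Aphrodite's own config class when one is registered for this
    # model type; otherwise fall back to the transformers mapping.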
    if name in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[name]
    else:
        config_class = CONFIG_MAPPING[name]
    hf_config = config_class.from_dict(model_config)
    return hf_config


def get_config(model: str,
               trust_remote_code: bool,
               revision: Optional[str] = None,
               code_revision: Optional[str] = None) -> PretrainedConfig:
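    """Load the HF config for a model name or local path.

    GGUF checkpoints are handled by extract_gguf_config; model types listed
    in _CONFIG_REGISTRY are re-loaded with their custom config classes.
    """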
    if model.endswith("gguf"):
        return extract_gguf_config(model)
    try:
        config = AutoConfig.from_pretrained(
            model,
            trust_remote_code=trust_remote_code,
            revision=revision,
            code_revision=code_revision)
    except ValueError as e:
        if (not trust_remote_code and
                "requires you to execute the configuration file" in str(e)):
            err_msg = (
                "Failed to load the model config. If the model is a custom "
                "model not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model,
                                              revision=revision,
                                              code_revision=code_revision)
    return config
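

# Illustrative usage only (the model name and local path below are examples,
# not part of this module):
#
#   config = get_config("meta-llama/Llama-2-7b-hf", trust_remote_code=False)
#   config = get_config("/models/mixtral-8x7b.Q4_K_M.gguf",
#                       trust_remote_code=False)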