config.py

import os
from typing import Optional

from transformers import AutoConfig, PretrainedConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
from loguru import logger

from aphrodite.transformers_utils.configs import (BaiChuanConfig, DbrxConfig,
                                                  ChatGLMConfig, MPTConfig,
                                                  QWenConfig, RWConfig)
from aphrodite.quantization.gguf_utils import GGUFReader

_CONFIG_REGISTRY = {
    "baichuan": BaiChuanConfig,
    "chatglm": ChatGLMConfig,
    "dbrx": DbrxConfig,
    "mpt": MPTConfig,
    "qwen": QWenConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
}


def extract_gguf_config(checkpoint):
    """Build an HF-compatible config from a GGUF checkpoint file or dir."""
    if os.path.isfile(checkpoint):
        result = GGUFReader(checkpoint)
    elif os.path.isdir(checkpoint):
        try:
            return AutoConfig.from_pretrained(checkpoint)
        except Exception:
            pass
        all_gguf_files = sorted([
            file for file in os.listdir(checkpoint)
            if os.path.splitext(file)[-1].lower() == ".gguf"
        ])
        # Assume the config is always in the first shard.
        result = GGUFReader(os.path.join(checkpoint, all_gguf_files[0]))
    else:
        raise RuntimeError(f"Cannot find any model config with `{checkpoint}`")

    logger.info("Extracting config from GGUF...")
    architecture = result.fields['general.architecture']
    architecture = str(bytes(architecture.parts[architecture.data[0]]),
                       encoding='utf-8')
    # Only llama is supported so far.
    if architecture != "llama":
        raise RuntimeError(f"Unsupported architecture {architecture}, "
                           "only llama is supported.")

    # Build the HF-style config dict from the GGUF metadata fields.
    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
    context_length = int(result.fields['llama.context_length'].parts[-1])
    n_layer = int(result.fields['llama.block_count'].parts[-1])
    n_head = int(result.fields['llama.attention.head_count'].parts[-1])
    n_local_heads = int(
        result.fields['llama.attention.head_count_kv'].parts[-1])
    intermediate_size = int(
        result.fields['llama.feed_forward_length'].parts[-1])
    norm_eps = float(
        result.fields['llama.attention.layer_norm_rms_epsilon'].parts[-1])
    dim = int(result.fields['llama.embedding_length'].parts[-1])
    if 'llama.expert_count' in result.fields:
        arch = "MixtralForCausalLM"
        name = "mixtral"
    else:
        arch = "LlamaForCausalLM"
        name = "llama"
    model_config = {
        "architectures": [arch],
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": dim,
        "intermediate_size": intermediate_size,
        "max_position_embeddings": context_length,
        "model_type": name,
        "num_attention_heads": n_head,
        "num_hidden_layers": n_layer,
        "num_key_value_heads": n_local_heads,
        "rms_norm_eps": norm_eps,
        "torch_dtype": "float16",
        "vocab_size": vocab_size
    }
    if 'llama.rope.freq_base' in result.fields:
        model_config['rope_theta'] = float(
            result.fields['llama.rope.freq_base'].parts[-1])
    if 'llama.expert_count' in result.fields:
        model_config['num_local_experts'] = int(
            result.fields['llama.expert_count'].parts[-1])
        model_config['num_experts_per_tok'] = int(
            result.fields['llama.expert_used_count'].parts[-1])
    if name in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[name]
    else:
        config_class = CONFIG_MAPPING[name]
    hf_config = config_class.from_dict(model_config)
    return hf_config
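
# Illustrative usage sketch for `extract_gguf_config` (the path below is a
# hypothetical placeholder, not a file shipped with this repository): given a
# single GGUF file or a directory of GGUF shards, it returns a transformers
# config built from the GGUF metadata.
#
#     cfg = extract_gguf_config("/models/llama-2-7b.Q4_K_M.gguf")
#     print(cfg.num_hidden_layers, cfg.num_key_value_heads)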

def get_config(model: str,
               trust_remote_code: bool,
               revision: Optional[str] = None,
               code_revision: Optional[str] = None) -> PretrainedConfig:
    """Load the config for `model`, falling back to Aphrodite's own config
    classes for architectures not registered in transformers."""
    if model.endswith("gguf"):
        return extract_gguf_config(model)
    try:
        config = AutoConfig.from_pretrained(
            model,
            trust_remote_code=trust_remote_code,
            revision=revision,
            code_revision=code_revision)
    except ValueError as e:
        if (not trust_remote_code and
                "requires you to execute the configuration file" in str(e)):
            err_msg = (
                "Failed to load the model config. If the model is a custom "
                "model not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model,
                                              revision=revision,
                                              code_revision=code_revision)
    return config

def get_hf_text_config(config: PretrainedConfig):
    """Get the text sub-config relevant to multimodal models.

    No-op for text-only models.
    """
    if hasattr(config, "text_config"):
        # The code operates under the assumption that `text_config` has
        # `num_attention_heads` (among others). Assert here to fail early
        # if the transformers config doesn't align with that assumption.
        assert hasattr(config.text_config, "num_attention_heads")
        return config.text_config
    else:
        return config
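
# A minimal usage sketch, assuming this file lives at
# aphrodite/transformers_utils/config.py and is imported from a script or
# REPL; the model name is only an example:
#
#     from aphrodite.transformers_utils.config import (get_config,
#                                                       get_hf_text_config)
#
#     config = get_config("meta-llama/Llama-2-7b-hf", trust_remote_code=False)
#     print(get_hf_text_config(config).num_attention_heads)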