import json
import os
import torch
import gguf
from sentencepiece import sentencepiece_model_pb2
from safetensors.torch import save_file as safe_save_file
from transformers.modeling_utils import shard_checkpoint
from transformers.utils import (WEIGHTS_NAME, WEIGHTS_INDEX_NAME,
                                SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME)


def _decode_utf8(part):
    """Decode one GGUF field part (a byte array) into a ``str``."""
    return str(bytes(part), encoding='utf-8')


def _int_field(fields, key, default):
    """Return the last part of ``fields[key]`` as an int, or *default* if absent."""
    if key in fields:
        return int(fields[key].parts[-1])
    return default


def _write_tokenizer(fields, save_dir):
    """Write ``tokenizer.model`` and ``tokenizer_config.json`` into *save_dir*.

    The vocabulary is rebuilt as a sentencepiece ``ModelProto`` from the
    ``tokenizer.ggml.*`` metadata fields.

    Returns:
        The vocabulary size (number of entries in ``tokenizer.ggml.token_type``).
    """
    tokens = fields['tokenizer.ggml.tokens']
    scores = fields['tokenizer.ggml.scores']
    types = fields['tokenizer.ggml.token_type']
    vocab_size = len(types.data)

    vocab = sentencepiece_model_pb2.ModelProto()
    vocab.trainer_spec.model_type = 2  # BPE
    vocab.trainer_spec.vocab_size = vocab_size
    vocab.trainer_spec.byte_fallback = True
    vocab.normalizer_spec.remove_extra_whitespaces = False
    for i in range(vocab_size):
        piece = vocab.SentencePiece()
        piece.piece = _decode_utf8(tokens.parts[tokens.data[i]])
        piece.score = scores.parts[scores.data[i]]
        # llama.cpp token type is the same as the sentencepiece token type
        piece.type = int(types.parts[types.data[i]])
        vocab.pieces.append(piece)
    with open(os.path.join(save_dir, "tokenizer.model"), 'wb') as f:
        f.write(vocab.SerializeToString())

    tokenizer_config = {
        "tokenizer_class": "LlamaTokenizer",
        "legacy": False,
        "clean_up_tokenization_spaces": False,
    }
    # Special tokens are stored as ids in the GGUF metadata; the tokenizer
    # config wants the corresponding piece strings.
    special_tokens = (
        ('tokenizer.ggml.bos_token_id', "bos_token"),
        ('tokenizer.ggml.eos_token_id', "eos_token"),
        ('tokenizer.ggml.padding_token_id', "pad_token"),
        ('tokenizer.ggml.unknown_token_id', "unk_token"),
    )
    for gguf_key, config_key in special_tokens:
        if gguf_key in fields:
            token_id = int(fields[gguf_key].parts[-1])
            tokenizer_config[config_key] = vocab.pieces[token_id].piece
    for flag in ("add_bos_token", "add_eos_token"):
        gguf_key = f'tokenizer.ggml.{flag}'
        if gguf_key in fields:
            tokenizer_config[flag] = bool(fields[gguf_key].parts[-1])
    if 'tokenizer.chat_template' in fields:
        tokenizer_config["chat_template"] = _decode_utf8(
            fields['tokenizer.chat_template'].parts[-1])
    with open(os.path.join(save_dir, "tokenizer_config.json"), 'w',
              encoding="utf-8") as f:
        json.dump(tokenizer_config, f, indent=2)
    return vocab_size


def _build_model_config(fields, vocab_size):
    """Build the ``config.json`` dictionary from the ``llama.*`` metadata fields."""
    is_moe = 'llama.expert_count' in fields
    model_config = {
        "architectures": ["MixtralForCausalLM" if is_moe else "LlamaForCausalLM"],
        # Use the ids recorded in the checkpoint; 1 / 2 are the fallbacks
        # used when the metadata does not carry them.
        "bos_token_id": _int_field(fields, 'tokenizer.ggml.bos_token_id', 1),
        "eos_token_id": _int_field(fields, 'tokenizer.ggml.eos_token_id', 2),
        "hidden_act": "silu",
        "hidden_size": int(fields['llama.embedding_length'].parts[-1]),
        "intermediate_size": int(
            fields['llama.feed_forward_length'].parts[-1]),
        "max_position_embeddings": int(
            fields['llama.context_length'].parts[-1]),
        "model_type": "mixtral" if is_moe else "llama",
        "num_attention_heads": int(
            fields['llama.attention.head_count'].parts[-1]),
        "num_hidden_layers": int(fields['llama.block_count'].parts[-1]),
        "num_key_value_heads": int(
            fields['llama.attention.head_count_kv'].parts[-1]),
        "rms_norm_eps": float(
            fields['llama.attention.layer_norm_rms_epsilon'].parts[-1]),
        "torch_dtype": "float16",
        "vocab_size": vocab_size
    }
    if 'llama.rope.freq_base' in fields:
        model_config['rope_theta'] = float(
            fields['llama.rope.freq_base'].parts[-1])
    if is_moe:
        model_config['num_local_experts'] = int(
            fields['llama.expert_count'].parts[-1])
        model_config['num_experts_per_tok'] = int(
            fields['llama.expert_used_count'].parts[-1])
    return model_config


def _build_tensor_mapping(model_config):
    """Map GGUF tensor name prefixes to ``(torch name prefix, output_dim)``.

    ``output_dim`` is the size of the first dimension the tensor data is
    reshaped to; ``-1`` means the data is stored without reshaping.
    """
    vocab_size = model_config["vocab_size"]
    dim = model_config["hidden_size"]
    intermediate_size = model_config["intermediate_size"]
    n_head = model_config["num_attention_heads"]
    n_local_heads = model_config["num_key_value_heads"]
    n_experts = model_config.get('num_local_experts', 1)
    kv_dim = dim // n_head * n_local_heads

    templates = {
        "token_embd": ("model.embed_tokens", vocab_size),
        "output": ("lm_head", vocab_size),
        "output_norm": ("model.norm", -1),
        "blk.{bid}.attn_norm": ("model.layers.{bid}.input_layernorm", -1),
        "blk.{bid}.attn_q": ("model.layers.{bid}.self_attn.q_proj", dim),
        "blk.{bid}.attn_k": ("model.layers.{bid}.self_attn.k_proj", kv_dim),
        "blk.{bid}.attn_v": ("model.layers.{bid}.self_attn.v_proj", kv_dim),
        "blk.{bid}.attn_output": ("model.layers.{bid}.self_attn.o_proj", dim),
        "blk.{bid}.attn_rot_embd": ("model.layers.{bid}.self_attn.rotary_emb.inv_freq", -1),
        "blk.{bid}.ffn_norm": ("model.layers.{bid}.post_attention_layernorm", -1),
        "blk.{bid}.ffn_up": ("model.layers.{bid}.mlp.up_proj", intermediate_size),
        "blk.{bid}.ffn_down": ("model.layers.{bid}.mlp.down_proj", dim),
        "blk.{bid}.ffn_gate": ("model.layers.{bid}.mlp.gate_proj", intermediate_size),
        "blk.{bid}.ffn_up.{xid}": ("model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", intermediate_size),
        "blk.{bid}.ffn_down.{xid}": ("model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", dim),
        "blk.{bid}.ffn_gate.{xid}": ("model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", intermediate_size),
        "blk.{bid}.ffn_gate_inp": ("model.layers.{bid}.block_sparse_moe.gate", n_experts),
    }

    # 200 blocks / 8 experts are the historical limits; keep them as floors
    # so the generated mapping is never smaller than before, and grow them
    # when the checkpoint itself declares more layers or experts.
    max_block_num = max(200, model_config["num_hidden_layers"])
    max_expert_num = max(8, n_experts)

    mapping = {}
    for template, (target, output_dim) in templates.items():
        # Only expand over the placeholders a template actually contains.
        block_ids = range(max_block_num) if "{bid}" in template else (0,)
        expert_ids = range(max_expert_num) if "{xid}" in template else (0,)
        for bid in block_ids:
            for xid in expert_ids:
                mapping[template.format(bid=bid, xid=xid)] = (
                    target.format(bid=bid, xid=xid), output_dim)
    return mapping


def _save_state_dict(state_dict, save_dir, max_shard_size, safe_serialization):
    """Write *state_dict* to *save_dir*, sharded unless *max_shard_size* is "0"."""

    def save_tensors(tensors, file_name):
        path = os.path.join(save_dir, file_name)
        if safe_serialization:
            safe_save_file(tensors, path, metadata={"format": "pt"})
        else:
            torch.save(tensors, path)

    weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
    if max_shard_size == "0":
        save_tensors(state_dict, weights_name)
        return

    shards, index = shard_checkpoint(state_dict, max_shard_size, weights_name)
    for shard_file, shard in shards.items():
        save_tensors(shard, shard_file)
    if index is not None:
        index_name = (SAFE_WEIGHTS_INDEX_NAME if safe_serialization
                      else WEIGHTS_INDEX_NAME)
        # Save the index as well
        with open(os.path.join(save_dir, index_name), "w",
                  encoding="utf-8") as f:
            f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")


def convert_to_state_dict(checkpoint, save_dir, max_shard_size,
                          safe_serialization):
    """Convert a GGUF checkpoint into tokenizer, config and weight files.

    Writes ``tokenizer.model``, ``tokenizer_config.json``, ``config.json``
    and the weight file(s) into *save_dir*, creating the directory if needed.
    Tensor data is copied as stored in the GGUF file; for tensors whose GGUF
    tensor type is greater than 1 an extra ``*.weight_type`` entry holding
    that type id is stored next to the data.

    Args:
        checkpoint: Path to the GGUF file, passed to ``gguf.GGUFReader``.
        save_dir: Output directory.
        max_shard_size: ``"0"`` to write a single weight file; any other
            value is forwarded to ``shard_checkpoint``.
        safe_serialization: If true, save with safetensors; otherwise use
            ``torch.save``.

    Returns:
        None. If the checkpoint architecture is not ``"llama"`` a message is
        printed and nothing else is written.

    Raises:
        KeyError: If a required metadata field is missing or a tensor name
            has no known mapping.
    """
    os.makedirs(save_dir, exist_ok=True)

    reader = gguf.GGUFReader(checkpoint)
    fields = reader.fields
    arch_field = fields['general.architecture']
    architecture = _decode_utf8(arch_field.parts[arch_field.data[0]])
    if architecture != "llama":
        print(f"Unsupported architecture {architecture}")
        return

    # write vocab
    vocab_size = _write_tokenizer(fields, save_dir)

    # write config
    model_config = _build_model_config(fields, vocab_size)
    with open(os.path.join(save_dir, "config.json"), 'w',
              encoding="utf-8") as f:
        json.dump(model_config, f, indent=2)

    # write tensor
    mapping = _build_tensor_mapping(model_config)
    state_dict = {}
    for ts in reader.tensors:
        weight_type = torch.tensor(int(ts.tensor_type), dtype=torch.int)
        layer, suffix = ts.name.rsplit(".", 1)
        try:
            new_key, output_dim = mapping[layer]
        except KeyError:
            raise KeyError(
                f"No torch mapping for GGUF tensor {ts.name!r}") from None
        new_key += f".{suffix}"
        data = torch.tensor(ts.data)
        if output_dim != -1:
            data = data.view(output_dim, -1)
        if weight_type > 1:
            # Record the GGUF tensor type alongside the raw data.
            state_dict[new_key.replace("weight", "weight_type")] = weight_type
        state_dict[new_key] = data

    _save_state_dict(state_dict, save_dir, max_shard_size, safe_serialization)


def main():
    """Parse command-line arguments and run the conversion."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert GGUF checkpoints to torch')
    # Both paths are mandatory: without them the conversion cannot start.
    parser.add_argument('--input', type=str, required=True,
                        help='The path to GGUF file')
    parser.add_argument('--output', type=str, required=True,
                        help='The path to output directory')
    parser.add_argument(
        '--max-shard-size',
        default="0",
        type=str,
        help='Shard the model in specified shard size, e.g. 10GB. 0 to disable'
    )
    parser.add_argument('--safetensors',
                        action='store_true',
                        help='Save in .safetensors format')
    args = parser.parse_args()
    convert_to_state_dict(args.input, args.output, args.max_shard_size,
                          args.safetensors)


if __name__ == '__main__':
    main()