Skip to content

Commit bd72ba0

Browse files
committed
Feature: support the Baichuan series of models. For now this includes Baichuan-7B and Baichuan-13B; in the future, we will support more Baichuan models
1 parent e4386f4 commit bd72ba0

File tree

3 files changed

+800
-3
lines changed

3 files changed

+800
-3
lines changed

convert-baichuan-hf-to-gguf.py

+308
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
#!/usr/bin/env python3
2+
# HF baichuan --> gguf conversion
3+
4+
from __future__ import annotations
5+
6+
import argparse
7+
import json
8+
import os
9+
import struct
10+
import sys
11+
from pathlib import Path
12+
from typing import TYPE_CHECKING, Any
13+
import itertools
14+
import gguf
15+
import numpy as np
16+
import torch
17+
from sentencepiece import SentencePieceProcessor # type: ignore[import]
18+
19+
20+
if TYPE_CHECKING:
21+
from typing import TypeAlias
22+
23+
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
24+
25+
# reverse HF permute back to original pth layout
26+
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
27+
28+
29+
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
    """Undo the per-head row permutation applied by the HF LLaMA weight converter.

    The first axis of ``weights`` is split into equal per-head groups. Within each
    group the rows of the first and second half are interleaved
    (row ``j`` of half ``h`` moves to position ``2 * j + h``). Trailing axes are
    left untouched.

    Args:
        weights: array-like supporting ``reshape``/``swapaxes`` whose first axis
            length is divisible by twice the number of groups.
        n_head: number of attention (query) heads.
        n_kv_head: number of key/value heads. When given and different from
            ``n_head``, ``weights`` is treated as holding ``n_kv_head`` groups.

    Returns:
        The permuted weights, with the same shape as the input.
    """
    if n_kv_head is not None and n_head != n_kv_head:
        # A K projection under grouped-query attention contains n_kv_head heads,
        # so that is the group count to use. (n_head // n_kv_head only happens to
        # be right when n_head == n_kv_head ** 2.)
        n_head = n_kv_head

    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))
36+
37+
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
    """Slice one third out of ``weights`` along axis 0 and reverse its HF permutation.

    Args:
        weights: array whose first axis is treated as three equal consecutive parts.
        n_part: zero-based index of the part to extract.
        n_head: head count forwarded to ``reverse_hf_permute``.
        n_head_kv: key/value head count forwarded to ``reverse_hf_permute``.

    Returns:
        The selected part after ``reverse_hf_permute`` has been applied to it.
    """
    part_rows = weights.shape[0] // 3
    first_row = part_rows * n_part
    selected = weights[first_row : first_row + part_rows, ...]
    return reverse_hf_permute(selected, n_head, n_head_kv)
40+
41+
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
    """Return the ``n_part``-th third of ``weights`` along its first axis.

    The first axis is divided into three consecutive parts of
    ``weights.shape[0] // 3`` rows each; no permutation is applied.
    """
    part_rows = weights.shape[0] // 3
    first_row = part_rows * n_part
    last_row = first_row + part_rows
    return weights[first_row:last_row, ...]
44+
45+
def count_model_parts(dir_model: str) -> int:
    """Count the entries in ``dir_model`` whose name starts with ``pytorch_model-``.

    Prints a short summary when at least one such entry is found.

    Args:
        dir_model: directory to scan (passed to ``os.listdir``).

    Returns:
        The number of matching entries; 0 when there are none.
    """
    num_parts = sum(
        1 for entry in os.listdir(dir_model) if entry.startswith("pytorch_model-")
    )

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")

    return num_parts
56+
57+
58+
59+
def parse_args() -> argparse.Namespace:
    """Parse the converter's command-line arguments from ``sys.argv``.

    Returns:
        A namespace with ``vocab_only`` (bool), ``outfile`` (``Path`` or ``None``),
        ``model`` (``Path``) and ``ftype`` (0 for float32, 1 for float16; 1 when
        omitted).
    """
    parser = argparse.ArgumentParser(description="Convert a HuggingFace Baichuan model to a GGUF compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    # nargs="?" is what makes the positional optional; without it argparse
    # requires the argument and the default is never used.
    parser.add_argument("ftype", type=int, choices=[0, 1], nargs="?", default=1,
                        help="output format - use 0 for float32, 1 for float16")
    return parser.parse_args()
66+
67+
# Parse the command line once; the rest of the script reads these module-level values.
args = parse_args()

ftype = args.ftype
dir_model = args.model
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# Indexed by ftype; each entry is the tag used in the default output file name:
#   ftype == 0 -> float32
#   ftype == 1 -> float16
ftype_str = ["f32", "f16"]

# Without --outfile, the result is written into the model directory.
fname_out = args.outfile
if fname_out is None:
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
87+
88+
print("gguf: loading model "+dir_model.name)

# Read the HuggingFace model configuration that drives the rest of the conversion.
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# Baichuan checkpoints spell the architecture name with either capitalisation
# ("BaiChuanForCausalLM" for Baichuan-7B, "BaichuanForCausalLM" for Baichuan-13B),
# so both are accepted.
model_architecture = hparams["architectures"][0]
if model_architecture not in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
    # An unsupported model is a failure: report on stderr and exit non-zero.
    print("Model architecture not supported: " + model_architecture, file=sys.stderr)
    sys.exit(1)
97+
98+
# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]

# Fall back to the attention head count when config.json has no separate KV head count.
head_count_kv = hparams.get("num_key_value_heads", head_count)

# The source repository is optional metadata; an empty string is written when absent.
hf_repo = hparams.get("_name_or_path", "")

# "max_sequence_length" takes precedence over "max_position_embeddings".
if "max_sequence_length" in hparams:
    ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
    ctx_length = hparams["max_position_embeddings"]
else:
    # A missing context length is a failure: report on stderr and exit non-zero.
    print("gguf: can not find ctx length parameter.", file=sys.stderr)
    sys.exit(1)

gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

# Only linear RoPE scaling with an explicit factor is recorded.
rope_scaling = hparams.get("rope_scaling")
if (
    rope_scaling is not None
    and "factor" in rope_scaling
    and "type" in rope_scaling
    and rope_scaling["type"] == "linear"
):
    gguf_writer.add_rope_scale_linear(rope_scaling["factor"])
145+
146+
147+
# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
    sys.exit(1)

# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")

tokenizer = SentencePieceProcessor(str(tokenizer_model_file))

# Checks are applied in this order and a later match overrides an earlier one.
# Type 4 (user-defined) is reserved for tokens from added_tokens.json.
toktype_checks = (
    (tokenizer.is_unknown, 2),
    (tokenizer.is_control, 3),
    (tokenizer.is_unused, 5),
    (tokenizer.is_byte, 6),
)

for token_id in range(tokenizer.vocab_size()):
    piece_bytes = tokenizer.id_to_piece(token_id).encode("utf-8")
    piece_score = tokenizer.get_score(token_id)

    toktype = 1 # default to normal token type
    for matches, matched_type in toktype_checks:
        if matches(token_id):
            toktype = matched_type

    tokens.append(piece_bytes)
    scores.append(piece_score)
    toktypes.append(toktype)

added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
    with open(added_tokens_file, "r", encoding="utf-8") as f:
        addtokens_json = json.load(f)

    print("gguf: get added tokens")

    # Added tokens are appended after the sentencepiece vocab, in file order,
    # with a fixed score and the user-defined token type (4).
    added_count = len(addtokens_json)
    tokens.extend(key.encode("utf-8") for key in addtokens_json)
    scores.extend([-1000.0] * added_count)
    toktypes.extend([4] * added_count)


gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
210+
211+
# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

# A single-file checkpoint is "pytorch_model.bin"; sharded checkpoints are
# numbered from 1 to num_parts.
if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    # NOTE(review): torch.load unpickles the checkpoint, which can execute
    # arbitrary code. Only convert checkpoints from a trusted source.
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    # Visit every layer index rather than stopping at the first one missing
    # from this part: a shard of a multi-part checkpoint need not start at layer 0.
    for i in range(block_count):
        q_name = f"model.layers.{i}.self_attn.q_proj.weight"
        k_name = f"model.layers.{i}.self_attn.k_proj.weight"
        v_name = f"model.layers.{i}.self_attn.v_proj.weight"
        w_pack_name = f"model.layers.{i}.self_attn.W_pack.weight"

        if q_name in model_part:
            # Separate Q/K projections: only the permutation has to be reversed.
            print(f"Permuting layer {i}")
            model_part[q_name] = reverse_hf_permute(model_part[q_name], head_count, head_count)
            model_part[k_name] = reverse_hf_permute(model_part[k_name], head_count, head_count_kv)
        elif w_pack_name in model_part:
            # Packed projection: split it into Q, K and V thirds, reverse the
            # permutation on Q and K, and drop the packed tensor.
            print(f"Unpacking and permuting layer {i}")
            w_pack = model_part.pop(w_pack_name)
            model_part[q_name] = reverse_hf_permute_part(w_pack, 0, head_count, head_count)
            model_part[k_name] = reverse_hf_permute_part(w_pack, 1, head_count, head_count_kv)
            model_part[v_name] = reverse_hf_part(w_pack, 2)

    for name, data in model_part.items():
        # we don't need these
        if name.endswith(".rotary_emb.inv_freq"):
            continue

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            # An unmappable tensor is a failure: report on stderr and exit non-zero.
            print("Can not map tensor '" + name + "'", file=sys.stderr)
            sys.exit(1)

        n_dims = len(data.shape)
        # Captured before any conversion so the three checks below all test
        # the tensor's dtype as loaded.
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)
295+
296+
297+
# Write the header and the key/value metadata, then the tensor data unless
# --vocab-only was given, and finally close the writer.
for message, write_stage in (
    ("gguf: write header", gguf_writer.write_header_to_file),
    ("gguf: write metadata", gguf_writer.write_kv_data_to_file),
):
    print(message)
    write_stage()

include_tensors = not args.vocab_only
if include_tensors:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")

gguf-py/gguf/gguf.py

+24-2
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
class MODEL_ARCH(IntEnum):
8080
LLAMA : int = auto()
8181
FALCON : int = auto()
82+
BAICHUAN:int = auto()
8283
GPT2 : int = auto()
8384
GPTJ : int = auto()
8485
GPTNEOX: int = auto()
@@ -108,6 +109,7 @@ class MODEL_TENSOR(IntEnum):
108109
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
109110
MODEL_ARCH.LLAMA: "llama",
110111
MODEL_ARCH.FALCON: "falcon",
112+
MODEL_ARCH.BAICHUAN:"baichuan",
111113
MODEL_ARCH.GPT2: "gpt2",
112114
MODEL_ARCH.GPTJ: "gptj",
113115
MODEL_ARCH.GPTNEOX: "gptneox",
@@ -153,6 +155,22 @@ class MODEL_TENSOR(IntEnum):
153155
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
154156
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
155157
},
158+
MODEL_ARCH.BAICHUAN: {
159+
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
160+
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
161+
MODEL_TENSOR.OUTPUT: "output",
162+
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
163+
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
164+
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
165+
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
166+
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
167+
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
168+
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
169+
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
170+
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
171+
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
172+
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
173+
},
156174
MODEL_ARCH.GPT2: {
157175
# TODO
158176
},
@@ -165,6 +183,10 @@ class MODEL_TENSOR(IntEnum):
165183
MODEL_TENSOR.ROPE_FREQS,
166184
MODEL_TENSOR.ATTN_ROT_EMBD,
167185
],
186+
MODEL_ARCH.BAICHUAN: [
187+
MODEL_TENSOR.ROPE_FREQS,
188+
MODEL_TENSOR.ATTN_ROT_EMBD,
189+
],
168190
}
169191

170192

@@ -187,15 +209,15 @@ class TensorNameMap:
187209
# Output
188210
MODEL_TENSOR.OUTPUT: (
189211
"embed_out", # gptneox
190-
"lm_head", # gpt2 mpt falcon llama-hf
212+
"lm_head", # gpt2 mpt falcon llama-hf baichuan
191213
"output", # llama-pth
192214
),
193215

194216
# Output norm
195217
MODEL_TENSOR.OUTPUT_NORM: (
196218
"gpt_neox.final_layer_norm", # gptneox
197219
"transformer.ln_f", # gpt2 falcon
198-
"model.norm", # llama-hf
220+
"model.norm", # llama-hf baichuan
199221
"norm", # llama-pth
200222
),
201223

0 commit comments

Comments
 (0)