
Commit 196407a

cebtenzzre authored and yusiwen committed
convert : fix vocab size when not defined in hparams (ggml-org#3421)
1 parent: eb87a57 · commit: 196407a

3 files changed: +15, -34 lines

convert-falcon-hf-to-gguf.py (+5, -12)
@@ -134,26 +134,19 @@ def parse_args() -> argparse.Namespace:
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
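
For context, the pattern these hunks introduce can be sketched in isolation. The snippet below is a hypothetical, self-contained version (the model path is a placeholder and the config.json loading is assumed); it is not code from the commit itself:

import json
from pathlib import Path

from transformers import AutoTokenizer

# Placeholder path to a local Hugging Face model directory (illustrative only).
dir_model = Path("path/to/model")
hparams = json.loads((dir_model / "config.json").read_text(encoding="utf-8"))

tokenizer = AutoTokenizer.from_pretrained(dir_model)

# Prefer the vocab size declared by the model; fall back to the tokenizer's
# own vocabulary size when config.json does not define one.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))

# Fail early if any token id falls outside the declared vocab size, instead of
# letting it surface later as mismatched tensor shapes at inference time.
assert max(tokenizer.vocab.values()) < vocab_size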

convert-gptneox-hf-to-gguf.py (+5, -10)
@@ -131,24 +131,19 @@ def parse_args() -> argparse.Namespace:
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-vocab_size = len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}

convert-starcoder-hf-to-gguf.py (+5, -12)
@@ -118,26 +118,19 @@ def parse_args() -> argparse.Namespace:
 
 tokens: list[bytearray] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
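
The comments in each hunk mention mismatched tensor sizes downstream. A toy illustration of why the new assert matters (illustrative only, not code from this commit): token lists of this kind are built index by index up to vocab_size, so every token id must stay below it, and ids the tokenizer does not use get placeholder entries.

# Toy vocabulary with a gap: id 2 is unused, the highest id is 3.
toy_vocab = {"hello": 0, "world": 1, "!": 3}
vocab_size = 5                                # as if declared by hparams["vocab_size"]
assert max(toy_vocab.values()) < vocab_size   # the guard added by this commit

reverse_vocab = {tok_id: tok for tok, tok_id in toy_vocab.items()}
tokens = [
    reverse_vocab.get(i, f"[PAD{i}]").encode("utf-8")  # fill unused ids with placeholders
    for i in range(vocab_size)
]
assert len(tokens) == vocab_size  # must match the embedding tensor's first dimension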
