@@ -289,6 +289,58 @@ def _set_vocab_gpt2(self):
289
289
special_vocab = gguf .SpecialVocab (dir_model , load_merges = True )
290
290
special_vocab .add_to_gguf (self .gguf_writer )
291
291
292
def _set_vocab_qwen(self):
    """Write a Qwen-style BPE vocabulary to ``self.gguf_writer``.

    The tokenizer is loaded from ``self.dir_model`` through
    ``AutoTokenizer``. Its ``mergeable_ranks`` table is used both to build
    the string vocabulary and to reconstruct the merge list, and its
    ``special_tokens`` are merged in on top. One token is emitted for every
    id in ``range(self.hparams["vocab_size"])``; ids the tokenizer does not
    define are filled with ``[PAD{i}]`` placeholders.

    Raises:
        AssertionError: if the tokenizer defines an id that is not below
            ``vocab_size``, or if a multi-byte token does not split into
            exactly two parts.
    """
    dir_model = self.dir_model
    # Holds the str vocab entries as well as bytearray padding placeholders.
    tokens: list[str | bytearray] = []
    toktypes: list[int] = []

    from transformers import AutoTokenizer
    # NOTE(review): trust_remote_code=True lets the model directory supply
    # its own tokenizer implementation; only convert models you trust.
    tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
    vocab_size = self.hparams["vocab_size"]
    assert max(tokenizer.get_vocab().values()) < vocab_size

    merges = []
    vocab = {}
    mergeable_ranks = tokenizer.mergeable_ranks
    for token, rank in mergeable_ranks.items():
        vocab[QwenModel.token_bytes_to_string(token)] = rank
        if len(token) == 1:
            # single-byte tokens are base symbols and have no merge rule
            continue
        # Re-run BPE capped at this token's own rank to recover the two
        # parts it was merged from.
        merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
        assert len(merged) == 2
        merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

    # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
    added_vocab = tokenizer.special_tokens
    reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}

    for i in range(vocab_size):
        encoded_tok = reverse_vocab.get(i)
        if encoded_tok is None:
            # id not defined by the tokenizer: emit a placeholder
            tokens.append(bytearray(f"[PAD{i}]".encode("utf-8")))
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif encoded_tok in added_vocab:
            tokens.append(encoded_tok)
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            tokens.append(encoded_tok)
            toktypes.append(gguf.TokenType.NORMAL)

    self.gguf_writer.add_tokenizer_model("gpt2")
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
    special_vocab.merges = merges
    # "<|endoftext|>" serves as bos, eos and unk alike; look it up once.
    endoftext_id = tokenizer.special_tokens["<|endoftext|>"]
    # only add special tokens when they were not already loaded from config.json
    if not special_vocab.special_token_ids:
        special_vocab._set_special_token("bos", endoftext_id)
        special_vocab._set_special_token("eos", endoftext_id)
    # this one is usually not in config.json anyway
    special_vocab._set_special_token("unk", endoftext_id)
    special_vocab.add_to_gguf(self.gguf_writer)
343
+
292
344
def _set_vocab_sentencepiece (self ):
293
345
from sentencepiece import SentencePieceProcessor
294
346
@@ -877,6 +929,13 @@ def write_tensors(self):
877
929
878
930
879
931
class StableLMModel (Model ):
932
def set_vocab(self):
    """Pick the vocab writer that matches the files in ``self.dir_model``.

    When ``tokenizer.json`` exists in the model directory the GPT-2 style
    writer is used; otherwise the Qwen-style writer is used.
    """
    has_tokenizer_json = (self.dir_model / "tokenizer.json").is_file()
    # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
    write_vocab = self._set_vocab_gpt2 if has_tokenizer_json else self._set_vocab_qwen
    write_vocab()
938
+
880
939
def set_gguf_parameters (self ):
881
940
hparams = self .hparams
882
941
block_count = hparams ["num_hidden_layers" ]
@@ -922,52 +981,7 @@ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None =
922
981
return parts
923
982
924
983
def set_vocab(self):
    """Write the vocabulary by delegating to ``self._set_vocab_qwen()``.

    The vocab-building logic is not repeated here; it lives in the shared
    ``_set_vocab_qwen`` helper so that other callers can use it as well.
    """
    self._set_vocab_qwen()
971
985
972
986
def set_gguf_parameters (self ):
973
987
self .gguf_writer .add_name ("Qwen" )
0 commit comments