
Commit 91f6499

Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.
* Respect add_bos_token GGUF metadata value
* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
1 parent 8da4627 commit 91f6499

12 files changed, +85 -29 lines

common/common.cpp (+6)

@@ -1072,6 +1072,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }
 
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int add_bos = llama_add_bos_token(model);
+
+    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+}
+
 //
 // YAML utils
 //

common/common.h (+4)

@@ -200,6 +200,10 @@ std::string llama_detokenize_bpe(
                          llama_context * ctx,
         const std::vector<llama_token> & tokens);
 
+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);
+
 //
 // YAML utils
 //
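The helper is intended to be queried once after the model is loaded, with its result passed straight to the tokenizer. A minimal usage sketch (not part of this commit), assuming the llama_tokenize wrapper declared in common.h and an already-loaded model/context pair:

#include "common.h"
#include "llama.h"

// Usage sketch: hypothetical caller, not part of the commit.
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const llama_model * model, const std::string & prompt) {
    // Query the GGUF metadata once; falls back to the SPM heuristic when
    // tokenizer.ggml.add_bos_token is absent from the model file.
    const bool add_bos = llama_should_add_bos_token(model);

    return ::llama_tokenize(ctx, prompt, add_bos);
}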

examples/infill/infill.cpp (+1, -1)

@@ -230,7 +230,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         LOG_TEE("%s\n", get_system_info(params).c_str());
     }
-    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = llama_should_add_bos_token(model);
     LOG("add_bos: %d\n", add_bos);
 
     bool suff_rm_leading_spc = params.escape;

examples/llava/llava-cli.cpp (+2, -1)

@@ -208,9 +208,10 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     int n_past = 0;
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
     // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
+    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);

examples/main/main.cpp (+1, -1)

@@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
         }
     }
 
-    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = llama_should_add_bos_token(model);
     LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> embd_inp;

examples/perplexity/perplexity.cpp (+3, -5)

@@ -149,8 +149,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    const bool add_bos = is_spm;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@@ -288,8 +287,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    const bool add_bos = is_spm;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
@@ -481,7 +479,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
     // This is needed as usual for LLaMA models
-    const bool add_bos = is_spm;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     // Number of tasks to use when computing the score
     if ( params.hellaswag_tasks < hs_task_count ) {

examples/server/server.cpp (+6, -3)

@@ -501,6 +501,7 @@ struct llama_server_context
     bool multimodal         = false;
     bool clean_kv_cache     = true;
     bool all_slots_are_idle = false;
+    bool add_bos_token      = true;
 
     int32_t id_gen;
     int32_t n_ctx; // total context for all clients / slots
@@ -573,6 +574,8 @@ struct llama_server_context
 
         n_ctx = llama_n_ctx(ctx);
 
+        add_bos_token = llama_should_add_bos_token(model);
+
         return true;
     }
@@ -864,7 +867,7 @@ struct llama_server_context
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
         llama_batch_clear(batch);
@@ -1552,7 +1555,7 @@ struct llama_server_context
                     }
                     else
                     {
-                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
                     }
 
                     slot.num_prompt_tokens = prompt_tokens.size();
@@ -1629,7 +1632,7 @@ struct llama_server_context
                     const bool has_images = process_images(slot);
 
                     // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
+                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
                     for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
                     {
                         llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
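The server now has two tokenization paths that must agree on where the single BOS token goes: the system prompt (when set) receives it in update_system_prompt(), and per-slot prompts receive it only when no system prompt is configured. A condensed sketch of that rule (hypothetical helper, written here only to illustrate the decision, not part of the server code):

// Illustration only: where the BOS token ends up after this change.
// add_bos_token mirrors llama_should_add_bos_token(model).
static bool prompt_gets_bos(bool add_bos_token, bool has_system_prompt) {
    // With a system prompt, BOS is already part of system_tokens,
    // so the per-slot prompt must not add a second one.
    return add_bos_token && !has_system_prompt;
}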

gguf-py/gguf/vocab.py (+15, -10)

@@ -117,24 +117,29 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
 
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
-        if not tokenizer_file.is_file():
-            return False
-        with open(tokenizer_file, encoding = 'utf-8') as f:
-            tokenizer = json.load(f)
-        if self.load_merges:
-            merges = tokenizer.get('model', {}).get('merges')
-            if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                self.merges = merges
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, encoding = 'utf-8') as f:
+                tokenizer = json.load(f)
+            if self.load_merges:
+                merges = tokenizer.get('model', {}).get('merges')
+                if isinstance(merges, list) and merges and isinstance(merges[0], str):
+                    self.merges = merges
+            added_tokens = tokenizer.get('added_tokens', {})
+        else:
+            added_tokens = {}
         tokenizer_config_file = path / 'tokenizer_config.json'
-        added_tokens = tokenizer.get('added_tokens')
-        if added_tokens is None or not tokenizer_config_file.is_file():
+        if not tokenizer_config_file.is_file():
             return True
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
         for typ in self.special_token_types:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
                 self.add_special_token[typ] = add_entry
+            if not added_tokens:
+                # We will need this to get the content for the token, so if it's empty
+                # may as well just give up.
+                continue
             entry = tokenizer_config.get(f'{typ}_token')
             if isinstance(entry, str):
                 tc_content = entry

gguf-py/pyproject.toml (+1, -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.5.2"
+version = "0.5.3"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <[email protected]>"]
 packages = [

gguf-py/scripts/gguf-dump.py (+8, -7)

@@ -86,13 +86,14 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
             curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
         else:
             curr["value"] = field.parts[-1].tolist()[0]
-    for idx, tensor in enumerate(reader.tensors):
-        tensors[tensor.name] = {
-            "index": idx,
-            "shape": tensor.shape.tolist(),
-            "type": tensor.tensor_type.name,
-            "offset": tensor.field.offset,
-        }
+    if not args.no_tensors:
+        for idx, tensor in enumerate(reader.tensors):
+            tensors[tensor.name] = {
+                "index": idx,
+                "shape": tensor.shape.tolist(),
+                "type": tensor.tensor_type.name,
+                "offset": tensor.field.offset,
+            }
     json.dump(result, sys.stdout)

llama.cpp (+32)

@@ -255,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -303,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID,  "tokenizer.ggml.unknown_token_id"   },
     { LLM_KV_TOKENIZER_SEP_ID,  "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID,  "tokenizer.ggml.padding_token_id"   },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token"      },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token"      },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json"        },
     { LLM_KV_TOKENIZER_RWKV,    "tokenizer.rwkv.world"              },
 };
@@ -1276,6 +1280,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id       = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -2388,6 +2395,23 @@ static void llm_load_vocab(
                     __func__, key.c_str(), id, old_id);
                 id = old_id;
             }
+
+        }
+
+        // Handle add_bos_token and add_eos_token
+        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+        int kid = gguf_find_key(ctx, key.c_str());
+        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        }
+        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+        kid = gguf_find_key(ctx, key.c_str());
+        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
         }
     }
@@ -9288,6 +9312,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
 
+int llama_add_bos_token(const struct llama_model * model) {
+    return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+    return model->vocab.special_add_eos;
+}
+
 llama_token llama_token_prefix(const struct llama_model * model) {
     return model->vocab.special_prefix_id;
 }
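The two metadata reads above follow the same pattern: look the key up, accept it only if it is a GGUF bool, and record -1 for "unknown" otherwise. The pattern could be factored into a small helper; a sketch using the public gguf API from ggml.h (the helper name is hypothetical, the commit keeps the reads inline):

#include "ggml.h"

// Sketch: read an optional GGUF bool key as a tri-state value:
// -1 = missing or wrong type, 0 = false, 1 = true.
static int gguf_get_opt_bool(const struct gguf_context * ctx, const char * key) {
    const int kid = gguf_find_key(ctx, key);
    if (kid < 0) {
        return -1; // key not present in the file
    }
    if (gguf_get_kv_type(ctx, kid) != GGUF_TYPE_BOOL) {
        return -1; // present but not a bool - caller may warn and ignore it
    }
    return gguf_get_val_bool(ctx, kid) ? 1 : 0;
}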

llama.h (+6)

@@ -517,6 +517,12 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
     // codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
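Callers that link only against llama.h (without the common helpers) can interpret the tri-state values themselves. A sketch, assuming a loaded llama_model; treating "unknown" as "do not append EOS" is an assumption made for the example, not something this commit prescribes:

#include "llama.h"

// Sketch: interpret the tri-state metadata values exposed in llama.h.
static bool wants_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model); // -1 unknown, 0 false, 1 true
    // Fall back to the SPM heuristic when the metadata is absent,
    // mirroring common's llama_should_add_bos_token().
    return add_bos != -1 ? add_bos != 0 : llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
}

static bool wants_eos(const struct llama_model * model) {
    const int add_eos = llama_add_eos_token(model); // -1 unknown, 0 false, 1 true
    return add_eos == 1; // assumption: treat "unknown" as "do not append"
}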
