
Commit ff5a3f0

goerch and ggerganov authored
Work on the BPE tokenizer (#3252)
* Work on the BPE tokenizer: tokenizer tests work for Falcon-7B
* Try to fix build problem
* Fix debug assertion failure
* Fix MSVC Unicode BOM problem
* Cleanup and an improvement
* Fix compiler warning
* Cleanup
* Test doesn't work over the full range of Unicodes
* Update .gitignore and Makefile
* Another Makefile rule
* Testing Aquila
* Moving byte decoding back to `token_to_piece` ... because everyone is using it
* Guarding some unusable code paths
* Streamlining code and adding some more assertions. Important change: I'm classifying added tokens as control tokens now for BPE
* Adding a comment
* Adding another assertion
* Fixed vocabulary guarding assertions
* Fix PR for recent change
* Fix PR for recent change
* Fix for compiler warning
* Fix PR for recent change
* Fix PR for recent change
* Fix PR for recent change
* Fix for compiler warning
* Fixes for more compiler warnings
* Remove unused code
* Fix initialization of static maps
* Add scores and token types back, adapt gptneox
* Update llama.cpp (Co-authored-by: Georgi Gerganov <[email protected]>)
* Update unicode.h (Co-authored-by: Georgi Gerganov <[email protected]>)
* Update unicode.h (Co-authored-by: Georgi Gerganov <[email protected]>)
* Ported Starcoder and added some assertions
* Fix coding style
* Apply @jploski's fix for missing tokens

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1c84003 commit ff5a3f0

15 files changed: +852, -227 lines

.gitignore (+2, -1)

@@ -91,4 +91,5 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe

Makefile (+7, -2)

@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -62,9 +62,11 @@ test: $(TEST_TARGETS)
 		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			continue; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+			continue; \
 		else \
 			echo "Running test $$test_target..."; \
 			./$$test_target; \

@@ -670,6 +672,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
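(Note on the new target: like tests/test-tokenizer-1-llama, the new tests/test-tokenizer-1-bpe binary is built but still skipped by the `make test` loop above. Judging from the tokenizer-0 targets, it is presumably run by hand against a BPE vocabulary, e.g. `./tests/test-tokenizer-1-bpe models/ggml-vocab-falcon.gguf`; the exact command line of the new test is an assumption, as it is not shown in this diff.)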

common/common.cpp (+1)

@@ -923,6 +923,7 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
         result += piece;
     }

+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return result;
 }
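(For context: "decoding bytes after collecting the pieces" refers to GPT-2's reversible byte-to-unicode table, the same one built by the `bytes_to_unicode()` helper that the converter scripts below delete. A minimal Python sketch of that decode step, for illustration only and not code from this commit: BPE pieces live in the printable-unicode alphabet, and the reference tokenizer maps them back to raw bytes only after the whole string has been assembled.)

    def bytes_to_unicode() -> dict[int, str]:
        # GPT-2's reversible byte <-> unicode table (same construction as the removed helper)
        bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
        cs = bs[:]
        n = 0
        for b in range(2**8):
            if b not in bs:
                bs.append(b)
                cs.append(2**8 + n)
                n += 1
        return dict(zip(bs, (chr(c) for c in cs)))

    # invert the table: unicode char -> original byte value
    byte_decoder = {v: k for k, v in bytes_to_unicode().items()}

    def decode_pieces(pieces: list[str]) -> str:
        # concatenate the pieces first, then map every char back to its byte
        text = "".join(pieces)
        return bytes(byte_decoder[c] for c in text).decode("utf-8", errors="replace")

    print(decode_pieces(["Hello", "Ġworld"]))  # -> "Hello world" ('Ġ' encodes the space byte)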

convert-falcon-hf-to-gguf.py (+7, -40)

@@ -20,28 +20,6 @@
 import gguf


-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):

@@ -133,6 +111,8 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []

 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

@@ -148,28 +128,15 @@
 assert max(tokenizer.vocab.values()) < vocab_size

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i])
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
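(The practical effect of this loop, sketched with a toy vocabulary -- hypothetical data, not from the commit: the converter now writes the GPT-2-encoded token text into the GGUF verbatim, and byte decoding is deferred to `token_to_piece` in llama.cpp.)

    vocab = {"Hello": 0, "Ġworld": 1}                    # toy stand-in for tokenizer.vocab
    reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}

    tokens = [reverse_vocab[i] for i in range(len(vocab))]
    print(tokens)  # ['Hello', 'Ġworld'] -- the encoded text is stored as-is;
                   # the old code would have stored the decoded bytes b' world' instead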

convert-gptneox-hf-to-gguf.py (+7, -41)

@@ -19,29 +19,6 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-
-
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-

 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0

@@ -130,6 +107,8 @@
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []

 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

@@ -145,28 +124,15 @@
 assert max(tokenizer.vocab.values()) < vocab_size

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
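(One nuance versus the Falcon script above: this loop tolerates holes in the vocabulary and pads them with placeholder tokens. A toy illustration with hypothetical ids, not data from the commit:)

    reverse_vocab = {0: "Hello", 2: "Ġworld"}  # id 1 is missing (hypothetical)
    vocab_size = 3

    tokens = [reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]" for i in range(vocab_size)]
    print(tokens)  # ['Hello', '[PAD1]', 'Ġworld']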

convert-starcoder-hf-to-gguf.py (+7, -40)

@@ -20,28 +20,6 @@
 import gguf


-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):

@@ -117,6 +95,8 @@
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []

 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

@@ -132,28 +112,15 @@
 assert max(tokenizer.vocab.values()) < vocab_size

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

convert.py (+5, -19)

@@ -338,29 +338,15 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
         from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
-        byte_encoder = tokenization_gpt2.bytes_to_unicode()
-        byte_decoder = {v: k for k, v in byte_encoder.items()}
-        score = 0.0
-        for i, item in enumerate(tokenizer):
-            text: bytes = item.encode("utf-8")
-            # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
-            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
-                if i == 0 and text == b'<unk>':
-                    toktype = gguf.TokenType.UNKNOWN
-                elif i == 1 or i == 2:
-                    toktype = gguf.TokenType.CONTROL
-                elif i >= 3 and text.startswith(b'<0x'):
-                    toktype = gguf.TokenType.BYTE
-                else:
-                    toktype = gguf.TokenType.NORMAL
-            else:
-                toktype = gguf.TokenType.NORMAL
-            yield text, score, toktype
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+
+        for i, _ in enumerate(tokenizer):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.bpe_tokens()
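(Taken together, the BPE path now emits every base-vocabulary token as NORMAL with a dummy score, and every added token as CONTROL rather than USER_DEFINED, matching the commit note "I'm classifying added tokens as control tokens now for BPE". A condensed, self-contained sketch of the resulting token stream, with toy data standing in for `self.bpe_tokenizer` and `self.added_tokens_list`; the TokenType values below are assumed to mirror gguf-py's enum and should be checked against gguf's constants:)

    from enum import IntEnum

    class TokenType(IntEnum):
        # assumption: mirrors gguf.TokenType / llama_token_type
        NORMAL = 1
        UNKNOWN = 2
        CONTROL = 3
        USER_DEFINED = 4
        UNUSED = 5
        BYTE = 6

    bpe_vocab = {"Hello": 0, "Ġworld": 1}   # toy stand-in for self.bpe_tokenizer
    added_tokens = [">>TITLE<<"]            # toy stand-in for self.added_tokens_list

    reverse_vocab = {tok_id: tok for tok, tok_id in bpe_vocab.items()}

    def all_tokens():
        for i in range(len(bpe_vocab)):
            yield reverse_vocab[i], 0.0, TokenType.NORMAL            # dummy score
        for text in added_tokens:
            yield text.encode("utf-8"), -1000.0, TokenType.CONTROL   # was USER_DEFINED

    for tok in all_tokens():
        print(tok)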
