
Commit 12ec0a7

ggerganov, jaggzh, and dragnil1 authored and committed
llama : fix BPE pre-tokenization (ggml-org#6920)
* merged the changes from deepseeker models to main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder pr
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* convert : add convert-hf-to-gguf-update.py ggml-ci
* lint : update
* convert : add falcon ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions ggml-ci
* lint : fix
* cmake : refactor test targets
* tests : refactor vocab tests ggml-ci
* tests : add more vocabs and tests ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete ggml-ci
* tests : use faster bpe test ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness ggml-ci

---------

Co-authored-by: Jaggzh <[email protected]>
Co-authored-by: Kazim Abrar Mahi <[email protected]>
1 parent 4b19b01 commit 12ec0a7
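
The central change described in the commit message is that BPE models converted to GGUF now carry a "tokenizer.ggml.pre" metadata key naming their pre-tokenizer, and llama.cpp selects a model-specific pre-tokenization regex from that key instead of silently applying the GPT-2 default. The sketch below is only an illustration of that dispatch, with hypothetical names; in the actual commit the mapping lives in the vocab-loading code and the patterns are applied through unicode_regex_split() in unicode.cpp.

// Illustration only (not the committed code): map the "tokenizer.ggml.pre"
// value read from GGUF metadata to a pre-tokenizer type, falling back to the
// default when the model predates this commit.
#include <map>
#include <string>
#include <vector>

enum class pre_type { DEFAULT, LLAMA3, DEEPSEEK_CODER, FALCON /* , ... */ };

static pre_type pre_type_from_name(const std::string & name) {
    static const std::map<std::string, pre_type> k_map = {
        // names are illustrative; the real strings are written by the convert scripts
        { "default",        pre_type::DEFAULT        },
        { "llama3",         pre_type::LLAMA3         },
        { "deepseek-coder", pre_type::DEEPSEEK_CODER },
        { "falcon",         pre_type::FALCON         },
    };
    const auto it = k_map.find(name);
    // Old BPE GGUFs have no "tokenizer.ggml.pre" key; the real code logs a
    // prominent warning and asks the user to re-convert the model.
    return it == k_map.end() ? pre_type::DEFAULT : it->second;
}

// Each type selects the regex list handed to unicode_regex_split(); the default
// case uses (roughly) the GPT-2 pattern:
static const std::vector<std::string> k_default_regexes = {
    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
};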


59 files changed: +2909 −644 lines

.github/workflows/python-lint.yml (+1, -1)

@@ -21,4 +21,4 @@ jobs:
       uses: py-actions/flake8@v2
       with:
         ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-        exclude: "examples/*,examples/*/**,*/**/__init__.py"
+        exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"

.gitignore (+15)

@@ -108,3 +108,18 @@ examples/server/*.mjs.hpp
 poetry.lock
 poetry.toml
 nppBackup
+
+# Test binaries
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-spm
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops

Makefile (+29, -15)

@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-    tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-    tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-    tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-    tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-    tests/test-json-schema-to-grammar tests/test-grammar-integration
+    tests/test-autorelease \
+    tests/test-backend-ops \
+    tests/test-double-float \
+    tests/test-grad0 \
+    tests/test-grammar-integration \
+    tests/test-grammar-parser \
+    tests/test-json-schema-to-grammar \
+    tests/test-llama-grammar \
+    tests/test-model-load-cancel \
+    tests/test-opt \
+    tests/test-quantize-fns \
+    tests/test-quantize-perf \
+    tests/test-rope \
+    tests/test-sampling \
+    tests/test-tokenizer-0 \
+    tests/test-tokenizer-1-bpe \
+    tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -60,11 +72,17 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
     @failures=0; \
     for test_target in $(TEST_TARGETS); do \
-        if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-            ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-        elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+        if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
             ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-        elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+            ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+        elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
             continue; \
         elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
             continue; \

@@ -982,19 +1000,15 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
common/common.cpp (+12)

@@ -1693,6 +1693,18 @@ std::vector<std::string> string_split(std::string input, char separator) {
     return parts;
 }
 
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
+
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
         {"top_k", llama_sampler_type::TOP_K},

common/common.h (+1)

@@ -196,6 +196,7 @@ bool validate_file_name(const std::string & filename);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //
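
The new string_strip() helper trims whitespace from both ends of a string, presumably for cleaning lines read by the reworked tokenizer tests. A minimal, hypothetical usage sketch (not part of the diff):

// Hypothetical usage of the new helper declared in common/common.h.
#include <cassert>
#include <string>

#include "common.h"

int main() {
    // string_strip removes leading/trailing whitespace but keeps interior spaces.
    assert(string_strip("  \tHello world \n") == "Hello world");
    assert(string_strip("   ").empty());   // all-whitespace input becomes empty
    return 0;
}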
