#include "common.h"
#include "llama.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_LINES(const C & ctx, const T & tokens)
{
    std::stringstream buf;

    for (const auto & token : tokens)
    {
        auto detokenized = llama_token_to_piece(ctx, token);

        detokenized.erase(
            std::remove_if(
                detokenized.begin(),
                detokenized.end(),
                [](const unsigned char c) { return !std::isprint(c); }),
            detokenized.end());

        buf << std::to_string(token) << "=" << detokenized << '\n';
    }

    return buf.str();
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (argc != 3 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH PROMPT\n", argv[0]);
        return 1;
    }

    params.model  = argv[1];
    params.prompt = argv[2];
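
    // Initialize the llama.cpp backend; the NUMA flag comes from gpt_params.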
    llama_backend_init(params.numa);

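    // Only the vocabulary is needed for tokenization, so skip loading the model weights.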
    llama_model_params model_params = llama_model_default_params();
    model_params.vocab_only = true;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr, "error: failed to load vocabulary from '%s'\n", params.model.c_str());
        return 1;
    }
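
    // A context is needed by llama_token_to_piece(); default parameters suffice here.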
    llama_context_params ctx_params = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
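
    // This example always prepends a BOS token; whether that is appropriate depends on the model.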
    const bool add_bos = true;

    std::vector<llama_token> tokens;

    tokens = ::llama_tokenize(model, params.prompt, add_bos, true);
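
    // Print one "<id>=<piece>" pair per line.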
    std::cout << LOG_TOKENS_TOSTR_LINES(ctx, tokens);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
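
A minimal usage sketch, assuming the example is built as a standalone binary named "tokenize" (the actual target name depends on how this file is wired into the build) and given the path to some GGUF model file:

    ./tokenize path/to/model.gguf "Hello world"

This prints one token per line in the form "<id>=<piece>".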