
Commit 07f8fe9

Author: llmixer (committed)

Merge branch 'main' into DRY

2 parents: eda526d + 7ecdd94

File tree

7 files changed: +61 -29 lines


CHANGELOG.md

+4
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.3.2]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@74d73dc85cc2057446bf63cc37ff649ae7cebd80
+
 ## [0.3.1]

 - feat: Update llama.cpp to ggerganov/llama.cpp@c919d5db39c8a7fcb64737f008e4b105ee0acd20

CMakeLists.txt

+20 -1
@@ -6,6 +6,10 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python

 option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)

 function(llama_cpp_python_install_target target)
+    if(NOT TARGET ${target})
+        return()
+    endif()
+
     install(
         TARGETS ${target}
         LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
@@ -75,7 +79,22 @@ if (LLAMA_BUILD)
     add_subdirectory(vendor/llama.cpp)
     llama_cpp_python_install_target(llama)
     llama_cpp_python_install_target(ggml)
-
+
+    llama_cpp_python_install_target(ggml-base)
+
+    llama_cpp_python_install_target(ggml-amx)
+    llama_cpp_python_install_target(ggml-blas)
+    llama_cpp_python_install_target(ggml-can)
+    llama_cpp_python_install_target(ggml-cpu)
+    llama_cpp_python_install_target(ggml-cuda)
+    llama_cpp_python_install_target(ggml-hip)
+    llama_cpp_python_install_target(ggml-kompute)
+    llama_cpp_python_install_target(ggml-metal)
+    llama_cpp_python_install_target(ggml-musa)
+    llama_cpp_python_install_target(ggml-rpc)
+    llama_cpp_python_install_target(ggml-sycl)
+    llama_cpp_python_install_target(ggml-vulkan)
+
     # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
     if (WIN32)
         install(

llama_cpp/__init__.py

+1 -1
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.3.1"
+__version__ = "0.3.2"

llama_cpp/_internals.py

-14
@@ -362,13 +362,6 @@ def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: i
             self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
         )

-    def sample_tail_free(
-        self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int
-    ):
-        llama_cpp.llama_sample_tail_free(
-            self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep
-        )
-
     def sample_typical(
         self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
     ):
@@ -685,9 +678,6 @@ def sample(
             ctx_main.sample_top_k(
                 token_data_array, self.params.top_k, min_keep=min_keep
             )
-            ctx_main.sample_tail_free(
-                token_data_array, self.params.tfs_z, min_keep=min_keep
-            )
             ctx_main.sample_typical(
                 token_data_array, self.params.typical_p, min_keep=min_keep
             )
@@ -776,10 +766,6 @@ def add_min_p(self, p: float, min_keep: int):
         sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep)
         self._add_sampler(sampler)

-    def add_tail_free(self, z: float, min_keep: int):
-        sampler = llama_cpp.llama_sampler_init_tail_free(z, min_keep)
-        self._add_sampler(sampler)
-
     def add_typical(self, p: float, min_keep: int):
         sampler = llama_cpp.llama_sampler_init_typical(p, min_keep)
         self._add_sampler(sampler)

llama_cpp/llama.py

-1
@@ -753,7 +753,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
         min_keep = max(1, n_probs)
         sampler.add_dry(self._model, dry_multiplier, dry_base, dry_allowed_length, dry_range, dry_seq_breakers)
         sampler.add_top_k(top_k)
-        sampler.add_tail_free(tfs_z, min_keep)
         sampler.add_typical(typical_p, min_keep)
         sampler.add_top_p(top_p, min_keep)
         sampler.add_min_p(min_p, min_keep)
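Note: the add_dry wrapper that this chain calls is not shown in the diff. Purely as a reference point, a rough sketch of such a method, assuming the same _add_sampler pattern seen in _internals.py above and the llama_sampler_init_dry binding added below, could look like this (the names and the raw-pointer handling are assumptions, not code from this commit):

    # Hypothetical sketch only, not part of this commit. Assumes `ctypes` and the
    # low-level `llama_cpp` bindings are imported as in the surrounding module,
    # and that `model` is the raw llama_model pointer.
    def add_dry(
        self,
        model,
        dry_multiplier: float,
        dry_base: float,
        dry_allowed_length: int,
        dry_penalty_last_n: int,
        seq_breakers,
    ):
        # The C API expects a char** plus an explicit element count.
        arr = (ctypes.c_char_p * len(seq_breakers))(
            *(s.encode("utf-8") for s in seq_breakers)
        )
        sampler = llama_cpp.llama_sampler_init_dry(
            model,
            dry_multiplier,
            dry_base,
            dry_allowed_length,
            dry_penalty_last_n,
            arr,
            len(seq_breakers),
        )
        self._add_sampler(sampler)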

llama_cpp/llama_cpp.py

+35 -11
@@ -3191,17 +3191,6 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p:
     ...


-# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-# LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-@ctypes_function(
-    "llama_sampler_init_tail_free",
-    [ctypes.c_float, ctypes.c_size_t],
-    llama_sampler_p_ctypes,
-)
-def llama_sampler_init_tail_free(z: float, min_keep: int) -> llama_sampler_p:
-    ...
-
-
 # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
 # LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
 @ctypes_function(
@@ -3375,6 +3364,41 @@ def llama_sampler_init_penalties(
     ...


+# /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+# LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+#         const struct llama_model * model,
+#         float dry_multiplier,
+#         float dry_base,
+#         int32_t dry_allowed_length,
+#         int32_t dry_penalty_last_n,
+#         const char ** seq_breakers,
+#         size_t num_breakers);
+@ctypes_function(
+    "llama_sampler_init_dry",
+    [
+        llama_model_p_ctypes,
+        ctypes.c_float,
+        ctypes.c_float,
+        ctypes.c_int32,
+        ctypes.c_int32,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_dry(
+    model: llama_model_p,
+    dry_multiplier: float,
+    dry_base: float,
+    dry_allowed_length: int,
+    dry_penalty_last_n: int,
+    seq_breakers: CtypesArray[bytes],
+    num_breakers: int,
+    /,
+) -> llama_sampler_p:
+    ...
+
+
 # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
 #         int32_t   n_vocab,
 #         int32_t   n_logit_bias,
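As a usage reference for the new binding, a minimal sketch follows. The helper name, the breaker strings, and the numeric values are illustrative assumptions (commonly cited DRY settings), not defaults taken from this commit, and `model` is assumed to be a llama_model pointer obtained elsewhere (for example via llama_load_model_from_file):

import ctypes

import llama_cpp


def make_dry_sampler(model, breakers=(b"\n", b":", b'"', b"*")):
    # llama_sampler_init_dry expects a char** plus an explicit count for the
    # sequence breakers, so marshal them into a ctypes array of c_char_p.
    arr = (ctypes.c_char_p * len(breakers))(*breakers)
    return llama_cpp.llama_sampler_init_dry(
        model,           # llama_model_p
        0.8,             # dry_multiplier (0.0 disables the DRY penalty)
        1.75,            # dry_base
        2,               # dry_allowed_length
        -1,              # dry_penalty_last_n (-1 = whole context)
        arr,             # seq_breakers
        len(breakers),   # num_breakers
    )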

vendor/llama.cpp (submodule update)
