This repository was archived by the owner on Feb 6, 2024. It is now read-only.

Commit 638c0ff

Allow returning probs w/ greedy sampling (negative temp)

* ggml-org/llama.cpp#3813

1 parent: b2f1d2c

File tree: 3 files changed, +8 −2 lines

Diff for: Sources/llmfarm_core_cpp/ggml/common.cpp (+1)

@@ -52,6 +52,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
         } else if (arg == "--temp") {
             params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
+            params.temp = std::max(params.temp, 0.0f);
         } else if (arg == "--repeat-last-n") {
             params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
         } else if (arg == "--repeat-penalty") {
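
This clamp applies only to the command-line parser: a negative value passed via --temp is raised to 0.0, so CLI users always get the plain greedy path, and the negative-temperature behavior added in sampling.cpp below remains reachable only by setting params.temp programmatically. A minimal standalone sketch of the clamp's effect (illustrative, not the project's actual parser):

#include <algorithm>
#include <cstdio>
#include <string>

int main() {
    // Simulate what gpt_params_parse now does with `--temp -1`:
    float temp = std::stof("-1");   // value as supplied on the command line
    temp = std::max(temp, 0.0f);    // new clamp: the CLI never goes below 0
    std::printf("effective temp = %.1f\n", temp);  // prints 0.0
    return 0;
}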

Diff for: Sources/llmfarm_core_cpp/ggml/sampling.cpp (+6 −2)

@@ -121,8 +121,12 @@ llama_token llama_sampling_sample(
             llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
         }
 
-        if (temp <= 0) {
-            // Greedy sampling
+        if (temp < 0.0) {
+            // greedy sampling, with probs
+            llama_sample_softmax(ctx_main, &cur_p);
+            id = cur_p.data[0].id;
+        } else if (temp == 0.0) {
+            // greedy sampling, no probs
             id = llama_sample_token_greedy(ctx, &cur_p);
         } else {
             if (mirostat == 1) {
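
The old code treated every temp <= 0 as plain greedy sampling. The new code splits that into two cases: temp < 0 runs llama_sample_softmax first, which sorts the candidates by probability (descending) and fills in their .p fields, then takes the head of the array, while temp == 0 keeps the cheaper llama_sample_token_greedy, an argmax over logits that computes no probabilities. Both branches return the same token; only the probability bookkeeping differs. A minimal sketch of the dispatch, assuming a candidate array already filled with the current logits (sample_with_temp is a hypothetical helper, and the temp > 0 branch stands in for the unchanged stochastic path):

#include "llama.h"

// Hypothetical helper: dispatch on temperature the way the patched
// llama_sampling_sample does. `ctx` and `cur_p` must already be
// populated by the caller.
static llama_token sample_with_temp(llama_context * ctx,
                                    llama_token_data_array & cur_p,
                                    float temp) {
    if (temp < 0.0f) {
        // Greedy, with probs: softmax sorts candidates by probability
        // (descending) and fills each .p field, so the full distribution
        // stays available to the caller afterwards.
        llama_sample_softmax(ctx, &cur_p);
        return cur_p.data[0].id;
    } else if (temp == 0.0f) {
        // Greedy, no probs: argmax over logits, .p fields stay unset.
        return llama_sample_token_greedy(ctx, &cur_p);
    } else {
        // Stochastic path (unchanged by this commit): scale logits by
        // the temperature, then sample from the distribution.
        llama_sample_temp(ctx, &cur_p, temp);
        return llama_sample_token(ctx, &cur_p);
    }
}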

Diff for: Sources/llmfarm_core_cpp/spm-headers/llama.h (+1)

@@ -661,6 +661,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
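
The amended header comment warns that llama_sample_token_greedy never fills the candidates' .p fields; callers who want the winning token together with its probability should run llama_sample_softmax and read the head of the sorted array instead. A sketch contrasting the two options (pick_top and cands are illustrative names, not part of the API):

#include "llama.h"
#include <cstdio>

// Illustrative only: `ctx` and `cands` are assumed to be prepared by the
// caller with the current logits.
static void pick_top(llama_context * ctx, llama_token_data_array & cands) {
    // 1) Fast path: argmax over raw logits; probabilities are NOT computed.
    llama_token greedy_id = llama_sample_token_greedy(ctx, &cands);

    // 2) With probs: softmax normalizes and sorts (descending), so the
    //    head of the array is the same argmax token, now with a valid .p.
    llama_sample_softmax(ctx, &cands);
    std::printf("token %d, p = %f (greedy picked %d)\n",
                cands.data[0].id, cands.data[0].p, greedy_id);
}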
