
Add support for stopping generation during a stream. #36

Merged 4 commits on Mar 30, 2025

23 changes: 17 additions & 6 deletions README.md
@@ -227,17 +227,20 @@ You can use a streaming generation to bind a callback function that is invoked e

```C++

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
// Print the token received
std::cout << response << std::flush;

// The server will set "done" to true for the last response
if (response.as_json()["done"]==true) std::cout << std::endl;

// Return true to continue streaming this response; return false to stop immediately.
return true;
}

// This function will be called every token
std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;

// Bind the callback to the generation
ollama::generate("llama3:8b", "Why is the sky blue?", response_callback);
@@ -251,16 +251,19 @@ You can launch a streaming call in a thread if you don't want it to block the pr

std::atomic<bool> done{false};

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
std::cout << response << std::flush;

if (response.as_json()["done"]==true) { done=true; std::cout << std::endl;}

// Return true to continue streaming this response; return false to stop immediately.
return !done;
}

// Use std::function to define a callback from an existing function
// You can also use a lambda with an equivalent signature
std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;

// You can launch the generation in a thread with a callback to use it asynchronously.
std::thread new_thread( [response_callback]{
@@ -270,6 +276,8 @@ std::thread new_thread( [response_callback]{
while (!done) { std::this_thread::sleep_for(std::chrono::microseconds(100) ); }
new_thread.join();
```
The callback's return value determines whether streaming continues or stops: return true to keep receiving tokens, or false to end the generation immediately instead of waiting for the entire response to complete.
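For example, the callback can enforce a token budget and stop the stream once the budget is exhausted. The following is a minimal sketch, not part of this change: the 32-token budget, the prompt, and the model name are arbitrary illustrations.

```C++
#include <functional>
#include <iostream>

#include "ollama.hpp"

int main()
{
    int tokens_received = 0;

    // Stop the stream after an arbitrary budget of 32 tokens.
    std::function<bool(const ollama::response&)> limited_callback =
        [&tokens_received](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;
        ++tokens_received;

        // Returning false cancels the underlying request and ends the stream immediately.
        return tokens_received < 32;
    };

    ollama::generate("llama3:8b", "Why is the sky blue?", limited_callback);
    return 0;
}
```
Because the stop condition lives entirely inside the callback, no extra synchronization is needed for this case.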

### Using Images
Generations can include images for vision-enabled models such as `llava`. The `ollama::image` class can load an image from a file and encode it as a [base64](https://en.wikipedia.org/wiki/Base64) string.

@@ -352,14 +360,17 @@ The default chat generation does not stream tokens and will return the entire re

```C++

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
std::cout << response << std::flush;

if (response.as_json()["done"]==true) std::cout << std::endl;

// Return true to continue streaming, or false to stop immediately
return true;
}

std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;

ollama::message message("user", "Why is the sky blue?");

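The same early-stop mechanism applies to streaming chat. The sketch below is illustrative rather than part of this diff: it stops once the accumulated reply exceeds a rough 200-character budget, assumes each streamed chunk exposes its text under `message.content` in the response JSON, and assumes a single `ollama::message` is accepted where `ollama::messages` is expected, as in the README example above.

```C++
#include <functional>
#include <iostream>
#include <string>

#include "ollama.hpp"

int main()
{
    std::string accumulated;

    // Stop the chat stream once roughly 200 characters have been received.
    std::function<bool(const ollama::response&)> chat_callback =
        [&accumulated](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;

        // Assumes the chat API places streamed text under message.content.
        const auto chunk = response.as_json();
        if (chunk.contains("message") && chunk["message"].contains("content"))
            accumulated += chunk["message"]["content"].get<std::string>();

        return accumulated.size() < 200;
    };

    ollama::message message("user", "Why is the sky blue?");
    ollama::chat("llama3:8b", message, chat_callback);
    return 0;
}
```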
7 changes: 5 additions & 2 deletions examples/main.cpp
@@ -12,11 +12,13 @@ using json = nlohmann::json;

std::atomic<bool> done{false};

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
std::cout << response << std::flush;

if (response.as_json()["done"]==true) { done=true; std::cout << std::endl;}

return !done; // Return true to continue streaming this response; return false to stop immediately.
}

// Install ollama, llama3, and llava first to run this demo
@@ -130,13 +132,14 @@ int main()
// Perform a simple generation which includes model options.
std::cout << ollama::generate("llama3:8b", "Why is the sky green?", options) << std::endl;

std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;
ollama::generate("llama3:8b", "Why is the sky orange?", response_callback);

// You can launch the generation in a thread with a callback to use it asynchronously.
std::thread new_thread( [response_callback]{ ollama::generate("llama3:8b", "Why is the sky gray?", response_callback); } );

// Prevent the main thread from exiting while we wait for an asynchronous response.
// Alternatively, we can set done=true to stop this thread immediately.
while (!done) { std::this_thread::sleep_for(std::chrono::microseconds(100) ); }
new_thread.join();

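The example above lets the callback decide when to stop. The sketch below shows the opposite direction of control, where the main thread requests cancellation of an asynchronous stream; it is illustrative only, and the `stop_requested` flag, the two-second delay, and the model name are arbitrary choices.

```C++
#include <atomic>
#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

#include "ollama.hpp"

// Set by the main thread to request cancellation; checked by the streaming callback.
std::atomic<bool> stop_requested{false};

bool on_receive_response(const ollama::response& response)
{
    std::cout << response << std::flush;

    // Returning false ends the stream as soon as the main thread asks for it.
    return !stop_requested;
}

int main()
{
    std::function<bool(const ollama::response&)> response_callback = on_receive_response;

    // Run the streaming generation asynchronously.
    std::thread generation_thread( [response_callback]{
        ollama::generate("llama3:8b", "Why is the sky blue?", response_callback); } );

    // Let it stream for two seconds, then request a stop.
    std::this_thread::sleep_for(std::chrono::seconds(2));
    stop_requested = true;

    generation_thread.join();
    return 0;
}
```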
38 changes: 22 additions & 16 deletions include/ollama.hpp
@@ -419,22 +419,22 @@ class Ollama
return response;
}

bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<bool(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, true, images);
if ( context.as_json().contains("context") ) request["context"] = context.as_json()["context"];
return generate(request, on_receive_token);
}

bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
bool generate(const std::string& model,const std::string& prompt, std::function<bool(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
bool generate(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_token)
{
request["stream"] = true;

@@ -446,22 +446,25 @@ class Ollama
auto stream_callback = [on_receive_token, partial_responses](const char *data, size_t data_length)->bool{

std::string message(data, data_length);
bool continue_stream = true;

if (ollama::log_replies) std::cout << message << std::endl;
try
{
partial_responses->push_back(message);
std::string total_response = std::accumulate(partial_responses->begin(), partial_responses->end(), std::string(""));
ollama::response response(total_response);
partial_responses->clear();
on_receive_token(response);
continue_stream = on_receive_token(response);
}
catch (const ollama::invalid_json_exception& e) { /* Partial response was received. Will do nothing and attempt to concatenate with the next response. */ }

return true;
return continue_stream;
};

if (auto res = this->cli->Post("/api/generate", request_string, "application/json", stream_callback)) { return true; }
else { if (ollama::use_exceptions) throw ollama::exception( "No response from server returned at URL"+this->server_url+" Error: "+httplib::to_string( res.error() ) ); }
else if (res.error()==httplib::Error::Canceled) { /* Request cancelled by user. */ return true; }
else { if (ollama::use_exceptions) throw ollama::exception( "No response from server returned at URL "+this->server_url+" Error: "+httplib::to_string( res.error() ) ); }

return false;
}
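From the caller's side, the change above means that returning false from the token callback causes the httplib content receiver to return false, the POST is cancelled, and the resulting `httplib::Error::Canceled` is treated as a successful call. A minimal sketch of that behavior follows; the callback name and prompt are illustrative.

```C++
#include <functional>
#include <iostream>

#include "ollama.hpp"

int main()
{
    // Stop after the very first streamed token.
    std::function<bool(const ollama::response&)> stop_immediately =
        [](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;
        return false;   // The content receiver returns false and httplib cancels the request.
    };

    // The cancelled request is reported as httplib::Error::Canceled,
    // which generate() now treats as success and returns true.
    bool ok = ollama::generate("llama3:8b", "Why is the sky blue?", stop_immediately);
    std::cout << std::endl << "generate returned: " << std::boolalpha << ok << std::endl;
    return 0;
}
```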
@@ -498,14 +501,14 @@ class Ollama
return response;
}

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
bool chat(const std::string& model, const ollama::messages& messages, std::function<bool(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
bool chat(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_token)
{
ollama::response response;
request["stream"] = true;
@@ -518,6 +521,8 @@ class Ollama
auto stream_callback = [on_receive_token, partial_responses](const char *data, size_t data_length)->bool{

std::string message(data, data_length);
bool continue_stream = true;

if (ollama::log_replies) std::cout << message << std::endl;
try
{
@@ -527,14 +532,15 @@ class Ollama
partial_responses->clear();

if ( response.has_error() ) { if (ollama::use_exceptions) throw ollama::exception("Ollama response returned error: "+response.get_error() ); }
on_receive_token(response);
continue_stream = on_receive_token(response);
}
catch (const ollama::invalid_json_exception& e) { /* Partial response was received. Will do nothing and attempt to concatenate with the next response. */ }

return true;
return continue_stream;
};

if (auto res = this->cli->Post("/api/chat", request_string, "application/json", stream_callback)) { return true; }
else if (res.error()==httplib::Error::Canceled) { /* Request cancelled by user. */ return true; }
else { if (ollama::use_exceptions) throw ollama::exception( "No response from server returned at URL"+this->server_url+" Error: "+httplib::to_string( res.error() ) ); }

return false;
@@ -872,7 +878,7 @@ class Ollama
private:

/*
bool send_request(const ollama::request& request, std::function<void(const ollama::response&)> on_receive_response=nullptr)
bool send_request(const ollama::request& request, std::function<bool(const ollama::response&)> on_receive_response=nullptr)
{

return true;
@@ -910,17 +916,17 @@ namespace ollama
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
inline bool generate(const std::string& model,const std::string& prompt, std::function<bool(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
inline bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<bool(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, context, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
inline bool generate(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}
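The context-aware overload above combines with early stopping in the same way. The sketch below is illustrative only: it assumes the two-argument, non-streaming `ollama::generate(model, prompt)` overload from the project README to obtain an initial context, and the 64-token budget is arbitrary.

```C++
#include <functional>
#include <iostream>

#include "ollama.hpp"

int main()
{
    // Run a non-streaming generation first and keep its response as context.
    // Assumes the two-argument overload documented in the README.
    ollama::response context = ollama::generate("llama3:8b", "Why is the sky blue?");

    // Stream a follow-up generation that continues from that context,
    // stopping early once a small token budget is exhausted.
    int tokens = 0;
    std::function<bool(const ollama::response&)> callback =
        [&tokens](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;
        return ++tokens < 64;
    };

    ollama::generate("llama3:8b", "Summarize that in one sentence.", context, callback);
    return 0;
}
```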
@@ -935,12 +941,12 @@ namespace ollama
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
inline bool chat(const std::string& model, const ollama::messages& messages, std::function<bool(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
inline bool chat(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}