
Add support for stopping generation during a stream. #36

Merged 4 commits on Mar 30, 2025

23 changes: 17 additions & 6 deletions README.md
@@ -227,17 +227,20 @@ You can use a streaming generation to bind a callback function that is invoked e

```C++

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
// Print the token received
std::cout << response << std::flush;

// The server will set "done" to true for the last response
if (response.as_json()["done"]==true) std::cout << std::endl;

// Return true to continue streaming this response; return false to stop immediately.
return true;
}

// This function will be called every token
std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;

// Bind the callback to the generation
ollama::generate("llama3:8b", "Why is the sky blue?", response_callback);
@@ -251,16 +251,19 @@ You can launch a streaming call in a thread if you don't want it to block the pr

std::atomic<bool> done{false};

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
std::cout << response << std::flush;

if (response.as_json()["done"]==true) { done=true; std::cout << std::endl;}

// Return true to continue streaming this response; return false to stop immediately.
return !done;
}

// Use std::function to define a callback from an existing function
// You can also use a lambda with an equivalent signature
std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;

// You can launch the generation in a thread with a callback to use it asynchronously.
std::thread new_thread( [response_callback]{
@@ -270,6 +276,8 @@ std::thread new_thread( [response_callback]{
while (!done) { std::this_thread::sleep_for(std::chrono::microseconds(100) ); }
new_thread.join();
```
The callback's return value determines whether streaming continues or stops: return true to keep receiving tokens, or false to end the generation immediately instead of waiting for the entire response to complete.
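For example, the callback can enforce a token budget and stop the stream once the budget is exhausted. The following is a minimal sketch, not part of this change: the 32-token budget, the prompt, and the model name are arbitrary illustrations.

```C++
#include <functional>
#include <iostream>

#include "ollama.hpp"

int main()
{
    int tokens_received = 0;

    // Stop the stream after an arbitrary budget of 32 tokens.
    std::function<bool(const ollama::response&)> limited_callback =
        [&tokens_received](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;
        ++tokens_received;

        // Returning false cancels the underlying request and ends the stream immediately.
        return tokens_received < 32;
    };

    ollama::generate("llama3:8b", "Why is the sky blue?", limited_callback);
    return 0;
}
```
Because the stop condition lives entirely inside the callback, no extra synchronization is needed for this case.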

### Using Images
Generations can include images for vision-enabled models such as `llava`. The `ollama::image` class can load an image from a file and encode it as a [base64](https://en.wikipedia.org/wiki/Base64) string.

@@ -352,14 +360,17 @@ The default chat generation does not stream tokens and will return the entire re

```C++

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
std::cout << response << std::flush;

if (response.as_json()["done"]==true) std::cout << std::endl;

// Return true to continue streaming, or false to stop immediately
return true;
}

std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;

ollama::message message("user", "Why is the sky blue?");

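The same early-stop mechanism applies to streaming chat. The sketch below is illustrative rather than part of this diff: it stops once the accumulated reply exceeds a rough 200-character budget, assumes each streamed chunk exposes its text under `message.content` in the response JSON, and assumes a single `ollama::message` is accepted where `ollama::messages` is expected, as in the README example above.

```C++
#include <functional>
#include <iostream>
#include <string>

#include "ollama.hpp"

int main()
{
    std::string accumulated;

    // Stop the chat stream once roughly 200 characters have been received.
    std::function<bool(const ollama::response&)> chat_callback =
        [&accumulated](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;

        // Assumes the chat API places streamed text under message.content.
        const auto chunk = response.as_json();
        if (chunk.contains("message") && chunk["message"].contains("content"))
            accumulated += chunk["message"]["content"].get<std::string>();

        return accumulated.size() < 200;
    };

    ollama::message message("user", "Why is the sky blue?");
    ollama::chat("llama3:8b", message, chat_callback);
    return 0;
}
```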
7 changes: 5 additions & 2 deletions examples/main.cpp
@@ -12,11 +12,13 @@ using json = nlohmann::json;

std::atomic<bool> done{false};

void on_receive_response(const ollama::response& response)
bool on_receive_response(const ollama::response& response)
{
std::cout << response << std::flush;

if (response.as_json()["done"]==true) { done=true; std::cout << std::endl;}

return !done; // Return true to continue streaming this response; return false to stop immediately.
}

// Install ollama, llama3, and llava first to run this demo
@@ -130,13 +132,14 @@ int main()
// Perform a simple generation which includes model options.
std::cout << ollama::generate("llama3:8b", "Why is the sky green?", options) << std::endl;

std::function<void(const ollama::response&)> response_callback = on_receive_response;
std::function<bool(const ollama::response&)> response_callback = on_receive_response;
ollama::generate("llama3:8b", "Why is the sky orange?", response_callback);

// You can launch the generation in a thread with a callback to use it asynchronously.
std::thread new_thread( [response_callback]{ ollama::generate("llama3:8b", "Why is the sky gray?", response_callback); } );

// Prevent the main thread from exiting while we wait for an asynchronous response.
// Alternatively, we can set done=true to stop this thread immediately.
while (!done) { std::this_thread::sleep_for(std::chrono::microseconds(100) ); }
new_thread.join();

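The example above lets the callback decide when to stop. The sketch below shows the opposite direction of control, where the main thread requests cancellation of an asynchronous stream; it is illustrative only, and the `stop_requested` flag, the two-second delay, and the model name are arbitrary choices.

```C++
#include <atomic>
#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

#include "ollama.hpp"

// Set by the main thread to request cancellation; checked by the streaming callback.
std::atomic<bool> stop_requested{false};

bool on_receive_response(const ollama::response& response)
{
    std::cout << response << std::flush;

    // Returning false ends the stream as soon as the main thread asks for it.
    return !stop_requested;
}

int main()
{
    std::function<bool(const ollama::response&)> response_callback = on_receive_response;

    // Run the streaming generation asynchronously.
    std::thread generation_thread( [response_callback]{
        ollama::generate("llama3:8b", "Why is the sky blue?", response_callback); } );

    // Let it stream for two seconds, then request a stop.
    std::this_thread::sleep_for(std::chrono::seconds(2));
    stop_requested = true;

    generation_thread.join();
    return 0;
}
```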
38 changes: 22 additions & 16 deletions include/ollama.hpp
@@ -419,22 +419,22 @@ class Ollama
return response;
}

bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<bool(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, true, images);
if ( context.as_json().contains("context") ) request["context"] = context.as_json()["context"];
return generate(request, on_receive_token);
}

bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
bool generate(const std::string& model,const std::string& prompt, std::function<bool(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
bool generate(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_token)
{
request["stream"] = true;

@@ -446,22 +446,25 @@ class Ollama
auto stream_callback = [on_receive_token, partial_responses](const char *data, size_t data_length)->bool{

std::string message(data, data_length);
bool continue_stream = true;

if (ollama::log_replies) std::cout << message << std::endl;
try
{
partial_responses->push_back(message);
std::string total_response = std::accumulate(partial_responses->begin(), partial_responses->end(), std::string(""));
ollama::response response(total_response);
partial_responses->clear();
on_receive_token(response);
continue_stream = on_receive_token(response);
}
catch (const ollama::invalid_json_exception& e) { /* Partial response was received. Will do nothing and attempt to concatenate with the next response. */ }

return true;
return continue_stream;
};

if (auto res = this->cli->Post("/api/generate", request_string, "application/json", stream_callback)) { return true; }
else { if (ollama::use_exceptions) throw ollama::exception( "No response from server returned at URL"+this->server_url+" Error: "+httplib::to_string( res.error() ) ); }
else if (res.error()==httplib::Error::Canceled) { /* Request cancelled by user. */ return true; }
else { if (ollama::use_exceptions) throw ollama::exception( "No response from server returned at URL "+this->server_url+" Error: "+httplib::to_string( res.error() ) ); }

return false;
}
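From the caller's side, the change above means that returning false from the token callback causes the httplib content receiver to return false, the POST is cancelled, and the resulting `httplib::Error::Canceled` is treated as a successful call. A minimal sketch of that behavior follows; the callback name and prompt are illustrative.

```C++
#include <functional>
#include <iostream>

#include "ollama.hpp"

int main()
{
    // Stop after the very first streamed token.
    std::function<bool(const ollama::response&)> stop_immediately =
        [](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;
        return false;   // The content receiver returns false and httplib cancels the request.
    };

    // The cancelled request is reported as httplib::Error::Canceled,
    // which generate() now treats as success and returns true.
    bool ok = ollama::generate("llama3:8b", "Why is the sky blue?", stop_immediately);
    std::cout << std::endl << "generate returned: " << std::boolalpha << ok << std::endl;
    return 0;
}
```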
@@ -498,14 +501,14 @@ class Ollama
return response;
}

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
bool chat(const std::string& model, const ollama::messages& messages, std::function<bool(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
bool chat(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_token)
{
ollama::response response;
request["stream"] = true;
@@ -518,6 +521,8 @@ class Ollama
auto stream_callback = [on_receive_token, partial_responses](const char *data, size_t data_length)->bool{

std::string message(data, data_length);
bool continue_stream = true;

if (ollama::log_replies) std::cout << message << std::endl;
try
{
@@ -527,14 +532,15 @@ class Ollama
partial_responses->clear();

if ( response.has_error() ) { if (ollama::use_exceptions) throw ollama::exception("Ollama response returned error: "+response.get_error() ); }
on_receive_token(response);
continue_stream = on_receive_token(response);
}
catch (const ollama::invalid_json_exception& e) { /* Partial response was received. Will do nothing and attempt to concatenate with the next response. */ }

return true;
return continue_stream;
};

if (auto res = this->cli->Post("/api/chat", request_string, "application/json", stream_callback)) { return true; }
else if (res.error()==httplib::Error::Canceled) { /* Request cancelled by user. */ return true; }
else { if (ollama::use_exceptions) throw ollama::exception( "No response from server returned at URL"+this->server_url+" Error: "+httplib::to_string( res.error() ) ); }

return false;
@@ -872,7 +878,7 @@ class Ollama
private:

/*
bool send_request(const ollama::request& request, std::function<void(const ollama::response&)> on_receive_response=nullptr)
bool send_request(const ollama::request& request, std::function<bool(const ollama::response&)> on_receive_response=nullptr)
{

return true;
@@ -910,17 +916,17 @@ namespace ollama
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
inline bool generate(const std::string& model,const std::string& prompt, std::function<bool(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
inline bool generate(const std::string& model,const std::string& prompt, ollama::response& context, std::function<bool(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, context, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
inline bool generate(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}
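The context-aware overload above combines with early stopping in the same way. The sketch below is illustrative only: it assumes the two-argument, non-streaming `ollama::generate(model, prompt)` overload from the project README to obtain an initial context, and the 64-token budget is arbitrary.

```C++
#include <functional>
#include <iostream>

#include "ollama.hpp"

int main()
{
    // Run a non-streaming generation first and keep its response as context.
    // Assumes the two-argument overload documented in the README.
    ollama::response context = ollama::generate("llama3:8b", "Why is the sky blue?");

    // Stream a follow-up generation that continues from that context,
    // stopping early once a small token budget is exhausted.
    int tokens = 0;
    std::function<bool(const ollama::response&)> callback =
        [&tokens](const ollama::response& response) -> bool
    {
        std::cout << response << std::flush;
        return ++tokens < 64;
    };

    ollama::generate("llama3:8b", "Summarize that in one sentence.", context, callback);
    return 0;
}
```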
@@ -935,12 +941,12 @@ namespace ollama
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
inline bool chat(const std::string& model, const ollama::messages& messages, std::function<bool(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
inline bool chat(ollama::request& request, std::function<bool(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}