
Add support for manual requests for generate, chat, and embedding endpoints. #20

Merged · 1 commit · Aug 13, 2024

16 changes: 15 additions & 1 deletion README.md
@@ -56,6 +56,7 @@ The test cases do a good job of providing discrete examples for each of the API
- [Chat with Images](#chat-with-images)
- [Embedding Generation](#embedding-generation)
- [Debug Information](#debug-information)
- [Manual Requests](#manual-requests)
- [Single-header vs Separate Headers](#single-header-vs-separate-headers)
- [About this software:](#about-this-software)
- [License](#license)
@@ -389,7 +390,20 @@ Debug logging for requests and replies to the server can easily be turned on and
ollama::show_requests(true);
ollama::show_replies(true);
```


### Manual Requests
For those looking for greater control over the requests sent to the ollama server, manual requests can be created through the `ollama::request` class. This class extends `nlohmann::json` and can be treated as a standard JSON object.

```C++
ollama::request request(ollama::message_type::generation);
request["model"] = "mistral";
request["prompt"] = "Why is the sky blue?";
request["stream"] = false;
request["system"] = "Talk like a pirate for the next reply.";
std::cout << ollama::generate(request) << std::endl;
```
This approach provides the greatest degree of customization over the request. Users should take care to supply valid fields; otherwise, an exception is likely to be thrown when the response is received. Manual requests can be made for the generate, chat, and embedding endpoints, as shown in the sketch below.
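
The chat and embedding endpoints accept manual requests in the same way. Below is a minimal sketch that mirrors the patterns used in the test suite; the `mistral` model name is only an illustration and assumes the model has been pulled locally.

```C++
// Manual chat request: build the message history and attach it as JSON.
ollama::request chat_request(ollama::message_type::chat);
chat_request["model"] = "mistral";
ollama::messages messages = { ollama::message("user", "Why is the sky blue?") };
chat_request["messages"] = messages.to_json();
chat_request["stream"] = false;
std::cout << ollama::chat(chat_request) << std::endl;

// Manual embedding request: the embedding endpoint takes an "input" field.
ollama::request embedding_request(ollama::message_type::embedding);
embedding_request["model"] = "mistral";
embedding_request["input"] = "Why is the sky blue?";
std::cout << ollama::generate_embeddings(embedding_request) << std::endl;
```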

## Single-header vs Separate Headers
For convenience, ollama-hpp includes a single-header version of the library in `singleheader/ollama.hpp`, which bundles the core ollama.hpp code with single-header versions of nlohmann json, httplib, and base64.h. Each of these libraries is available under the MIT license, and their respective licenses are included.
The single-header include can be regenerated from these standalone files by running `./make_single_header.sh`.
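
As a quick sketch of how either option looks from user code, assuming the repository root is on the compiler's include path and an Ollama server is running locally with a pulled model (`mistral` here is only an example):

```C++
#include <iostream>

// Option 1: single header — nlohmann json, httplib, and base64.h are bundled.
#include "singleheader/ollama.hpp"

// Option 2: separate headers — include the core header from include/ instead:
// #include "include/ollama.hpp"

int main() {
    // Assumes an Ollama server at the default http://localhost:11434.
    std::cout << ollama::generate("mistral", "Why is the sky blue?") << std::endl;
    return 0;
}
```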
66 changes: 58 additions & 8 deletions include/ollama.hpp
@@ -377,12 +377,16 @@ class Ollama
Ollama(): Ollama("http://localhost:11434") {}
~Ollama() { delete this->cli; }

// Generate a non-streaming reply as a string.
ollama::response generate(const std::string& model,const std::string& prompt, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, false, images);
return generate(request);
}

// Generate a non-streaming reply as a string.
ollama::response generate(const ollama::request& request)
{
ollama::response response;
ollama::request request(model, prompt, options, false, images);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -400,14 +404,19 @@ class Ollama
if (ollama::use_exceptions) throw ollama::exception("No response returned from server "+this->server_url+". Error was: "+httplib::to_string( res.error() ));
}

return response;
}

// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{

ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -428,12 +437,17 @@ class Ollama
return false;
}

// Generate a non-streaming reply as a string.
ollama::response chat(const std::string& model, const ollama::messages& messages, json options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, false, format, keep_alive_duration);
return chat(request);
}


// Generate a non-streaming reply as a string.
ollama::response chat(ollama::request& request)
{
ollama::response response;
ollama::request request(model, messages, options, false, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -456,9 +470,14 @@

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{
ollama::response response;
ollama::request request(model, messages, options, true, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -719,6 +738,12 @@ class Ollama
ollama::response generate_embeddings(const std::string& model, const std::string& input, const json& options=nullptr, bool truncate = true, const std::string& keep_alive_duration="5m")
{
ollama::request request = ollama::request::from_embedding(model, input, options, truncate, keep_alive_duration);
return generate_embeddings(request);
}


ollama::response generate_embeddings(ollama::request& request)
{
ollama::response response;

std::string request_string = request.dump();
@@ -806,21 +831,41 @@ namespace ollama
return ollama.generate(model, prompt, options, images);
}

inline ollama::response generate(const ollama::request& request)
{
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}

inline ollama::response chat(const std::string& model, const ollama::messages& messages, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, options, format, keep_alive_duration);
}

inline ollama::response chat(ollama::request& request)
{
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}

inline bool create(const std::string& modelName, const std::string& modelFile, bool loadFromFile=true)
{
return ollama.create_model(modelName, modelFile, loadFromFile);
@@ -891,6 +936,11 @@ namespace ollama
return ollama.generate_embeddings(model, input, options, truncate, keep_alive_duration);
}

inline ollama::response generate_embeddings(ollama::request& request)
{
return ollama.generate_embeddings(request);
}

inline void setReadTimeout(const int& seconds)
{
ollama.setReadTimeout(seconds);
66 changes: 58 additions & 8 deletions singleheader/ollama.hpp
@@ -35167,12 +35167,16 @@ class Ollama
Ollama(): Ollama("http://localhost:11434") {}
~Ollama() { delete this->cli; }

// Generate a non-streaming reply as a string.
ollama::response generate(const std::string& model,const std::string& prompt, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, false, images);
return generate(request);
}

// Generate a non-streaming reply as a string.
ollama::response generate(const ollama::request& request)
{
ollama::response response;
ollama::request request(model, prompt, options, false, images);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35190,14 +35194,19 @@ class Ollama
if (ollama::use_exceptions) throw ollama::exception("No response returned from server "+this->server_url+". Error was: "+httplib::to_string( res.error() ));
}

return response;
}

// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{

ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35218,12 +35227,17 @@ class Ollama
return false;
}

// Generate a non-streaming reply as a string.
ollama::response chat(const std::string& model, const ollama::messages& messages, json options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, false, format, keep_alive_duration);
return chat(request);
}


// Generate a non-streaming reply as a string.
ollama::response chat(ollama::request& request)
{
ollama::response response;
ollama::request request(model, messages, options, false, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35246,9 +35260,14 @@

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{
ollama::response response;
ollama::request request(model, messages, options, true, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35509,6 +35528,12 @@ class Ollama
ollama::response generate_embeddings(const std::string& model, const std::string& input, const json& options=nullptr, bool truncate = true, const std::string& keep_alive_duration="5m")
{
ollama::request request = ollama::request::from_embedding(model, input, options, truncate, keep_alive_duration);
return generate_embeddings(request);
}


ollama::response generate_embeddings(ollama::request& request)
{
ollama::response response;

std::string request_string = request.dump();
@@ -35596,21 +35621,41 @@ namespace ollama
return ollama.generate(model, prompt, options, images);
}

inline ollama::response generate(const ollama::request& request)
{
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}

inline ollama::response chat(const std::string& model, const ollama::messages& messages, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, options, format, keep_alive_duration);
}

inline ollama::response chat(ollama::request& request)
{
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}

inline bool create(const std::string& modelName, const std::string& modelFile, bool loadFromFile=true)
{
return ollama.create_model(modelName, modelFile, loadFromFile);
@@ -35681,6 +35726,11 @@ namespace ollama
return ollama.generate_embeddings(model, input, options, truncate, keep_alive_duration);
}

inline ollama::response generate_embeddings(ollama::request& request)
{
return ollama.generate_embeddings(request);
}

inline void setReadTimeout(const int& seconds)
{
ollama.setReadTimeout(seconds);
30 changes: 30 additions & 0 deletions test/test.cpp
@@ -241,6 +241,36 @@ TEST_SUITE("Ollama Tests") {
CHECK(response.as_json().contains("embeddings") == true);
}

TEST_CASE("Manual Requests") {

ollama::request request(ollama::message_type::generation);
request["model"] = test_model;
request["prompt"] = "Why is the sky blue?";
request["stream"] = false;
ollama::response response = ollama::generate(request);

CHECK(response.as_json().contains("response") == true);

request = ollama::request(ollama::message_type::chat);
request["model"] = test_model;
ollama::messages messages = { ollama::message("user","Why is the sky blue?") };
request["messages"] = messages.to_json();
request["stream"] = false;
response = ollama::chat(request);

CHECK(response.as_json().contains("message") == true);

request = ollama::request(ollama::message_type::embedding);
request["model"] = test_model;
request["input"] = "Why is the sky blue?";
request["stream"] = false;
response = ollama::generate_embeddings(request);

CHECK(response.as_json().contains("embeddings") == true);
}



TEST_CASE("Enable Debug Logging") {

ollama::show_requests(true);