
Commit 5e31828

ggml : add RPC backend (ggml-org#6829)
* ggml : add RPC backend

  The RPC backend proxies all operations to a remote server which runs a regular backend (CPU, CUDA, Metal, etc.).

* set TCP_NODELAY
* add CI workflows
* Address review comments
* fix warning
* implement llama_max_devices() for RPC
* Address review comments
* Address review comments
* wrap sockfd into a struct
* implement get_alignment and get_max_size
* add get_device_memory
* fix warning
* win32 support
* add README
* readme : trim trailing whitespace
* Address review comments
* win32 fix
* Address review comments
* fix compile warnings on macos
1 parent 5416002 commit 5e31828

12 files changed (+1395, -98 lines)
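The client half of this change lives in `ggml-rpc.h` and `ggml-rpc.cpp`, which are among the 12 changed files but are not shown in the excerpt below. As a rough, hypothetical sketch of how client code would use the new backend (the init function name and signature are assumptions, not taken from this excerpt):

```cpp
// Sketch only: ggml_backend_rpc_init() and its exact signature are assumptions,
// since ggml-rpc.h is not part of this excerpt of the commit.
#include "ggml-rpc.h"
#include <stdio.h>

int main() {
    // connect to a remote rpc-server; the returned handle is used like any other ggml backend
    ggml_backend_t backend = ggml_backend_rpc_init("192.168.88.10:50052");
    if (!backend) {
        fprintf(stderr, "failed to connect to rpc-server\n");
        return 1;
    }
    // ... allocate buffers and run graphs on the remote backend as usual ...
    ggml_backend_free(backend);
    return 0;
}
```

The server half is the `rpc-server` example added below, which wraps a regular local backend (CPU, CUDA or Metal) and exposes it over TCP via `start_rpc_server`.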

.github/workflows/build.yml (+32)

@@ -340,6 +340,36 @@ jobs:
           cd build
           ctest -L main --verbose

+  ubuntu-latest-cmake-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_RPC=ON ..
+          cmake --build . --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
   ubuntu-22-cmake-vulkan:
     runs-on: ubuntu-22.04

@@ -663,6 +693,8 @@ jobs:
     strategy:
       matrix:
         include:
+          - build: 'rpc'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'noavx'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2'
CMakeLists.txt (+13)

@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
     endif()
 endif()

+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    if (WIN32)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}

common/common.cpp (+10)

@@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
         printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
         printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+    printf(" --rpc SERVERS comma separated list of RPC servers\n");
     printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf(" -gan N, --grp-attn-n N\n");
@@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
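The `--rpc` value is stored verbatim in `gpt_params::rpc_servers` and passed on to `llama_model_params.rpc_servers` as a C string; splitting the comma separated list into individual endpoints happens inside `llama.cpp`, which is not shown in this excerpt. A minimal sketch of that kind of split, using a hypothetical helper name rather than the actual implementation:

```cpp
#include <sstream>
#include <string>
#include <vector>

// hypothetical helper: turn "192.168.88.10:50052,192.168.88.11:50052" into separate endpoints
static std::vector<std::string> split_rpc_servers(const std::string & servers) {
    std::vector<std::string> endpoints;
    std::stringstream ss(servers);
    std::string endpoint;
    while (std::getline(ss, endpoint, ',')) {
        if (!endpoint.empty()) {
            endpoints.push_back(endpoint);
        }
    }
    return endpoints;
}
```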

common/common.h (+1)

@@ -82,6 +82,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers = ""; // comma separated list of RPC servers

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

examples/CMakeLists.txt (+3)

@@ -49,4 +49,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()

examples/rpc/CMakeLists.txt (new file, +2)

@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)

examples/rpc/README.md (new file, +74)

@@ -0,0 +1,74 @@
+## Overview
+
+The `rpc-server` allows running `ggml` backend on a remote host.
+The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
+This can be used for distributed LLM inference with `llama.cpp` in the following way:
+
+```mermaid
+flowchart TD
+    rpcb---|TCP|srva
+    rpcb---|TCP|srvb
+    rpcb-.-|TCP|srvn
+    subgraph hostn[Host N]
+    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hostb[Host B]
+    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hosta[Host A]
+    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph host[Main Host]
+    ggml[llama.cpp]---rpcb[RPC backend]
+    end
+    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
+```
+
+Each host can run a different backend, e.g. one with CUDA and another with Metal.
+You can also run multiple `rpc-server` instances on the same host, each with a different backend.
+
+## Usage
+
+On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
+For example, to build the CUDA backend with RPC support:
+
+```bash
+mkdir build-rpc-cuda
+cd build-rpc-cuda
+cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
+cmake --build . --config Release
+```
+
+Then, start the `rpc-server` with the backend:
+
+```bash
+$ bin/rpc-server 0.0.0.0 50052
+create_backend: using CUDA backend
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
+Starting RPC server on 0.0.0.0:50052
+```
+
+When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
+```bash
+$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
+```
+This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
+
+
+On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
+
+```bash
+mkdir build-rpc
+cd build-rpc
+cmake .. -DLLAMA_RPC=ON
+cmake --build . --config Release
+```
+
+Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
+
+```bash
+$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+```

examples/rpc/rpc-server.cpp (new file, +70)

@@ -0,0 +1,70 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#include <string>
+#include <stdio.h>
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#else
+    // TODO: implement for other backends
+    *free_mem = 1;
+    *total_mem = 1;
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
+        return 1;
+    }
+    const char * host = argv[1];
+    int port = std::stoi(argv[2]);
+    if (port <= 0 || port > 65535) {
+        fprintf(stderr, "Invalid port number: %d\n", port);
+        return 1;
+    }
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    printf("Starting RPC server on %s:%d\n", host, port);
+    size_t free_mem, total_mem;
+    get_backend_memory(&free_mem, &total_mem);
+    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
