
Commit 5656d10

mpi : add support for distributed inference via MPI (ggml-org#2099)
* MPI support, first cut
* fix warnings, update README
* fixes
* wrap includes
* PR comments
* Update CMakeLists.txt
* Add GH workflow, fix test
* Add info to README
* mpi : trying to move more MPI stuff into ggml-mpi (WIP) (ggml-org#2099)
* mpi : add names for layer inputs + prep ggml_mpi_graph_compute()
* mpi : move all MPI logic into ggml-mpi (not tested yet)
* mpi : various fixes - communication now works but results are wrong
* mpi : fix output tensor after MPI compute (still not working)
* mpi : fix inference
* mpi : minor
* Add OpenMPI to GH action
* [mpi] continue-on-error: true
* mpi : fix after master merge
* [mpi] Link MPI C++ libraries to fix OpenMPI
* tests : fix new llama_backend API
* [mpi] use MPI_INT32_T
* mpi : factor out recv / send in functions and reuse
* mpi : extend API to allow usage with outer backends (e.g. Metal)

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1d16309 commit 5656d10

18 files changed: +460 -35 lines
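The change splits a model's layers across MPI processes, with each process evaluating its share and passing the intermediate results on to the next one (see the README section added below). The ggml-mpi internals are not reproduced on this page, so the following is only an illustrative, self-contained C++ sketch of that send/receive pipeline pattern; the buffer and the `+= 1.0f` "work" are stand-ins, not code from the commit.

```cpp
// Illustrative pipeline-parallel sketch (not the ggml-mpi implementation):
// each rank "evaluates" its slice of the layers, then forwards the
// activations to the next rank; the last rank holds the final result.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // stand-in for the activations a real backend would pass between layer groups
    std::vector<float> activations(8, 0.0f);

    if (rank > 0) {
        // receive the activations produced by the previous rank in the pipeline
        MPI_Recv(activations.data(), (int) activations.size(), MPI_FLOAT,
                 rank - 1, /*tag=*/0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    // pretend to evaluate this rank's share of the layers
    for (float & x : activations) {
        x += 1.0f;
    }

    if (rank < size - 1) {
        // hand the result to the next rank
        MPI_Send(activations.data(), (int) activations.size(), MPI_FLOAT,
                 rank + 1, /*tag=*/0, MPI_COMM_WORLD);
    } else {
        std::printf("rank %d holds the final result: %.1f\n", rank, activations[0]);
    }

    MPI_Finalize();
    return 0;
}
```

Built with `mpicxx pipeline.cpp -o pipeline` and launched with `mpirun -n 3 ./pipeline`, the last rank prints 3.0 (one increment per rank), which is the same shape of data flow the commit sets up between hosts.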

.github/workflows/build.yml

+34

@@ -104,6 +104,40 @@ jobs:
           cd build
           ctest --verbose --timeout 900

+  ubuntu-latest-cmake-mpi:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_MPI=ON ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
   macOS-latest-make:
     runs-on: macos-latest
.gitignore

+1

@@ -20,6 +20,7 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
+build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
CMakeLists.txt

+24

@@ -75,6 +75,7 @@ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -308,6 +309,28 @@ if (LLAMA_METAL)
         )
 endif()

+if (LLAMA_MPI)
+    cmake_minimum_required(VERSION 3.10)
+    find_package(MPI)
+    if (MPI_C_FOUND)
+        message(STATUS "MPI found")
+        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        add_compile_definitions(GGML_USE_MPI)
+        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
+        set(c_flags ${c_flags} -Wno-cast-qual)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+        # Even if you're only using the C header, C++ programs may bring in MPI
+        # C++ functions, so more linkage is needed
+        if (MPI_CXX_FOUND)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
+        endif()
+    else()
+        message(WARNING "MPI not found")
+    endif()
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)

@@ -476,6 +499,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_MPI}
             ${GGML_SOURCES_EXTRA}
             )

Makefile

+9

@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE

+ifdef LLAMA_MPI
+    CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+    CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+    OBJS += ggml-mpi.o
+
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+    $(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_OPENBLAS
     CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
     LDFLAGS += -lopenblas

README.md

+39

@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
 ```

+### MPI Build
+
+MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
+
+Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON
+  ```
+
+Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:2
+malvolio.local:1
+```
+
+The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
+
+Finally, you're ready to run a computation using `mpirun`:
+
+```bash
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+```
+
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
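One detail worth spelling out about the `mpirun` example added above: every process started from the hostfile runs the same `./main` binary, and the processes coordinate by rank. The snippet below is a generic, hedged illustration of that model (it is not taken from llama.cpp): rank 0 is the natural place for user-facing I/O, while the remaining ranks only contribute compute.

```cpp
// Generic MPI illustration (not llama.cpp code): mpirun starts N copies of the
// same program; each copy learns its rank and the world size, and typically
// only rank 0 talks to the user.
#include <mpi.h>
#include <cstdio>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0) {
        std::printf("running with %d MPI processes\n", size); // matches -n 3 above
    }

    // ... each rank's share of the work would go here ...

    MPI_Finalize();
    return 0;
}
```

With the example hostfile and `-n 3`, ranks 0 and 1 typically land on 192.168.0.1 and rank 2 on malvolio.local, filling the slots in hostfile order.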

examples/embd-input/embd-input-lib.cpp

+1 -1

@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

examples/embedding/embedding.cpp

+3 -1

@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }
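The same two-part change repeats in every example below: `llama_init_backend()` becomes `llama_backend_init()`, and a matching `llama_backend_free()` is added before `return`. As a reference, here is a minimal sketch of the resulting lifecycle, using only the calls visible in these hunks; model loading and inference are deliberately left as comments rather than guessed at.

```cpp
// Sketch of the init/teardown pairing these diffs introduce; the elided middle
// is a placeholder, not an attempt to reproduce the examples' real code.
#include "llama.h"

int main(int argc, char ** argv) {
    (void) argc; (void) argv;

    llama_backend_init(false /* numa */);  // renamed from llama_init_backend()

    // ... load a model, create a context, run inference, then:
    // llama_free(ctx);
    // llama_free_model(model);

    llama_backend_free();                  // new: called once per process at exit
    return 0;
}
```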

examples/main/main.cpp

+3 -1

@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -671,5 +671,7 @@
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/perplexity/perplexity.cpp

+3 -1

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -172,5 +172,7 @@
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/quantize/quantize.cpp

+3 -1

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

-    llama_init_backend(false);
+    llama_backend_init(false);

     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];

@@ -257,5 +257,7 @@
         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
     }

+    llama_backend_free();
+
     return 0;
 }

examples/server/server.cpp

+3 -1

@@ -1079,7 +1079,7 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     LOG_INFO("build info", {{"build", BUILD_NUMBER},
                             {"commit", BUILD_COMMIT}});

@@ -1309,5 +1309,7 @@
         return 1;
     }

+    llama_backend_free();
+
     return 0;
 }

examples/simple/simple.cpp

+3 -1

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -173,6 +173,8 @@
     llama_free( ctx );
     llama_free_model( model );

+    llama_backend_free();
+
     return 0;
 }

ggml-metal.m

+1

@@ -450,6 +450,7 @@ void ggml_metal_graph_compute(
                 //}

                 switch (dst->op) {
+                    case GGML_OP_NONE:
                     case GGML_OP_RESHAPE:
                     case GGML_OP_VIEW:
                     case GGML_OP_TRANSPOSE:
