
Commit 5656d10

mpi : add support for distributed inference via MPI (ggml-org#2099)
* MPI support, first cut
* fix warnings, update README
* fixes
* wrap includes
* PR comments
* Update CMakeLists.txt
* Add GH workflow, fix test
* Add info to README
* mpi : trying to move more MPI stuff into ggml-mpi (WIP) (ggml-org#2099)
* mpi : add names for layer inputs + prep ggml_mpi_graph_compute()
* mpi : move all MPI logic into ggml-mpi (not tested yet)
* mpi : various fixes - communication now works but results are wrong
* mpi : fix output tensor after MPI compute (still not working)
* mpi : fix inference
* mpi : minor
* Add OpenMPI to GH action
* [mpi] continue-on-error: true
* mpi : fix after master merge
* [mpi] Link MPI C++ libraries to fix OpenMPI
* tests : fix new llama_backend API
* [mpi] use MPI_INT32_T
* mpi : factor out recv / send in functions and reuse
* mpi : extend API to allow usage with outer backends (e.g. Metal)

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1d16309 commit 5656d10

18 files changed: +460 -35 lines
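The change splits a model's layers across MPI processes, with each process evaluating its share and passing the intermediate results on to the next one (see the README section added below). The ggml-mpi internals are not reproduced on this page, so the following is only an illustrative, self-contained C++ sketch of that send/receive pipeline pattern; the buffer and the `+= 1.0f` "work" are stand-ins, not code from the commit.

```cpp
// Illustrative pipeline-parallel sketch (not the ggml-mpi implementation):
// each rank "evaluates" its slice of the layers, then forwards the
// activations to the next rank; the last rank holds the final result.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // stand-in for the activations a real backend would pass between layer groups
    std::vector<float> activations(8, 0.0f);

    if (rank > 0) {
        // receive the activations produced by the previous rank in the pipeline
        MPI_Recv(activations.data(), (int) activations.size(), MPI_FLOAT,
                 rank - 1, /*tag=*/0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    // pretend to evaluate this rank's share of the layers
    for (float & x : activations) {
        x += 1.0f;
    }

    if (rank < size - 1) {
        // hand the result to the next rank
        MPI_Send(activations.data(), (int) activations.size(), MPI_FLOAT,
                 rank + 1, /*tag=*/0, MPI_COMM_WORLD);
    } else {
        std::printf("rank %d holds the final result: %.1f\n", rank, activations[0]);
    }

    MPI_Finalize();
    return 0;
}
```

Built with `mpicxx pipeline.cpp -o pipeline` and launched with `mpirun -n 3 ./pipeline`, the last rank prints 3.0 (one increment per rank), which is the same shape of data flow the commit sets up between hosts.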

.github/workflows/build.yml

+34

@@ -104,6 +104,40 @@ jobs:
           cd build
           ctest --verbose --timeout 900

+  ubuntu-latest-cmake-mpi:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_MPI=ON ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
   macOS-latest-make:
     runs-on: macos-latest
.gitignore

+1

@@ -20,6 +20,7 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
+build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
CMakeLists.txt

+24

@@ -75,6 +75,7 @@ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -308,6 +309,28 @@ if (LLAMA_METAL)
         )
 endif()

+if (LLAMA_MPI)
+    cmake_minimum_required(VERSION 3.10)
+    find_package(MPI)
+    if (MPI_C_FOUND)
+        message(STATUS "MPI found")
+        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        add_compile_definitions(GGML_USE_MPI)
+        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
+        set(c_flags ${c_flags} -Wno-cast-qual)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+        # Even if you're only using the C header, C++ programs may bring in MPI
+        # C++ functions, so more linkage is needed
+        if (MPI_CXX_FOUND)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
+        endif()
+    else()
+        message(WARNING "MPI not found")
+    endif()
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)

@@ -476,6 +499,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_MPI}
             ${GGML_SOURCES_EXTRA}
             )

Makefile

+9

@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE

+ifdef LLAMA_MPI
+    CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+    CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+    OBJS += ggml-mpi.o
+
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+    $(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_OPENBLAS
     CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
     LDFLAGS += -lopenblas

README.md

+39

@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
 ```

+### MPI Build
+
+MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
+
+Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON
+  ```
+
+Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:2
+malvolio.local:1
+```
+
+The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
+
+Finally, you're ready to run a computation using `mpirun`:
+
+```bash
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+```
+
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
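One detail worth spelling out about the `mpirun` example added above: every process started from the hostfile runs the same `./main` binary, and the processes coordinate by rank. The snippet below is a generic, hedged illustration of that model (it is not taken from llama.cpp): rank 0 is the natural place for user-facing I/O, while the remaining ranks only contribute compute.

```cpp
// Generic MPI illustration (not llama.cpp code): mpirun starts N copies of the
// same program; each copy learns its rank and the world size, and typically
// only rank 0 talks to the user.
#include <mpi.h>
#include <cstdio>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0) {
        std::printf("running with %d MPI processes\n", size); // matches -n 3 above
    }

    // ... each rank's share of the work would go here ...

    MPI_Finalize();
    return 0;
}
```

With the example hostfile and `-n 3`, ranks 0 and 1 typically land on 192.168.0.1 and rank 2 on malvolio.local, filling the slots in hostfile order.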

examples/embd-input/embd-input-lib.cpp

+1 -1

@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

examples/embedding/embedding.cpp

+3 -1

@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }
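The same two-part change repeats in every example below: `llama_init_backend()` becomes `llama_backend_init()`, and a matching `llama_backend_free()` is added before `return`. As a reference, here is a minimal sketch of the resulting lifecycle, using only the calls visible in these hunks; model loading and inference are deliberately left as comments rather than guessed at.

```cpp
// Sketch of the init/teardown pairing these diffs introduce; the elided middle
// is a placeholder, not an attempt to reproduce the examples' real code.
#include "llama.h"

int main(int argc, char ** argv) {
    (void) argc; (void) argv;

    llama_backend_init(false /* numa */);  // renamed from llama_init_backend()

    // ... load a model, create a context, run inference, then:
    // llama_free(ctx);
    // llama_free_model(model);

    llama_backend_free();                  // new: called once per process at exit
    return 0;
}
```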

examples/main/main.cpp

+3 -1

@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -671,5 +671,7 @@
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/perplexity/perplexity.cpp

+3 -1

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -172,5 +172,7 @@
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/quantize/quantize.cpp

+3 -1

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

-    llama_init_backend(false);
+    llama_backend_init(false);

     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];

@@ -257,5 +257,7 @@
         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
     }

+    llama_backend_free();
+
     return 0;
 }

examples/server/server.cpp

+3 -1

@@ -1079,7 +1079,7 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     LOG_INFO("build info", {{"build", BUILD_NUMBER},
                             {"commit", BUILD_COMMIT}});

@@ -1309,5 +1309,7 @@
         return 1;
     }

+    llama_backend_free();
+
     return 0;
 }

examples/simple/simple.cpp

+3 -1

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -173,6 +173,8 @@
     llama_free( ctx );
     llama_free_model( model );

+    llama_backend_free();
+
     return 0;
 }

ggml-metal.m

+1

@@ -450,6 +450,7 @@ void ggml_metal_graph_compute(
                 //}

                 switch (dst->op) {
+                    case GGML_OP_NONE:
                     case GGML_OP_RESHAPE:
                     case GGML_OP_VIEW:
                     case GGML_OP_TRANSPOSE:
