Commit 8c2c497

Merge 'origin/master' into hipblas

2 parents e610466 + 2347463, commit 8c2c497

22 files changed: +481, -48 lines

.github/workflows/build.yml (+34)

@@ -104,6 +104,40 @@ jobs:
           cd build
           ctest --verbose --timeout 900

+  ubuntu-latest-cmake-mpi:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_MPI=ON ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
   macOS-latest-make:
     runs-on: macos-latest

.gitignore (+1)

@@ -20,6 +20,7 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
+build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt (+24)

@@ -76,6 +76,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -309,6 +310,28 @@ if (LLAMA_METAL)
         )
 endif()

+if (LLAMA_MPI)
+    cmake_minimum_required(VERSION 3.10)
+    find_package(MPI)
+    if (MPI_C_FOUND)
+        message(STATUS "MPI found")
+        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        add_compile_definitions(GGML_USE_MPI)
+        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
+        set(c_flags ${c_flags} -Wno-cast-qual)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+        # Even if you're only using the C header, C++ programs may bring in MPI
+        # C++ functions, so more linkage is needed
+        if (MPI_CXX_FOUND)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
+        endif()
+    else()
+        message(WARNING "MPI not found")
+    endif()
+endif()
+
@@ -509,6 +532,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_MPI}
             ${GGML_SOURCES_EXTRA}
             )

Makefile (+9)

@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 endif # LLAMA_NO_ACCELERATE

+ifdef LLAMA_MPI
+	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	OBJS += ggml-mpi.o
+
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	LDFLAGS += -lopenblas
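
Neither the CMake nor the Makefile hunk shows the contents of `ggml-mpi.c`/`ggml-mpi.h` themselves; they only compile that unit, define `GGML_USE_MPI`, and link the MPI libraries. As a purely hypothetical illustration (not the actual `ggml-mpi.c` from this commit), code guarded by that define can use the standard MPI C API, which is exactly what the extra compile definition and link flags make available:

```cpp
// Hypothetical sketch only -- NOT the ggml-mpi.c added by this merge.
// It shows what the GGML_USE_MPI define and MPI link flags enable:
// each process launched by mpirun gets a rank and can coordinate with the others.
#include <cstdio>

#ifdef GGML_USE_MPI
#include <mpi.h>
#endif

int main(int argc, char ** argv) {
#ifdef GGML_USE_MPI
    MPI_Init(&argc, &argv);                 // one process per slot in the hostfile

    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   // which process am I?
    MPI_Comm_size(MPI_COMM_WORLD, &size);   // how many processes in total?

    printf("process %d of %d\n", rank, size);
    // ... a real backend would partition the model layers across ranks here ...

    MPI_Finalize();
#else
    printf("built without MPI support\n");
#endif
    return 0;
}
```

Compiled with something like `mpicxx -DGGML_USE_MPI` and launched via `mpirun -n 2`, each rank prints its id; the real MPI backend presumably relies on the same primitives to split work across the processes listed in the hostfile described in the README hunk below.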

README.md (+39)

@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
 ```

+### MPI Build
+
+MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
+
+Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON
+  ```
+
+Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:2
+malvolio.local:1
+```
+
+The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
+
+Finally, you're ready to run a computation using `mpirun`:
+
+```bash
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+```
+
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

examples/common.cpp (+1, -2)

@@ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_adapter = argv[i];
-            params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());

examples/embd-input/embd-input-lib.cpp (+1, -1)

@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

examples/embedding/embedding.cpp (+3, -1)

@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }
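
The same pair of changes, the rename `llama_init_backend` → `llama_backend_init` and a new `llama_backend_free()` call before `return`, repeats in every example below. A minimal sketch of the resulting lifecycle, assuming the model-loading and context calls declared in `llama.h` at the time of this commit (they are not part of this diff):

```cpp
// Sketch of the init/teardown order used by the updated examples.
// Assumed from llama.h of this period: llama_context_default_params,
// llama_load_model_from_file, llama_new_context_with_model.
#include "llama.h"

int main() {
    llama_backend_init(false);   // was llama_init_backend(false); the argument is the NUMA flag

    llama_context_params lparams = llama_context_default_params();

    llama_model * model = llama_load_model_from_file("./models/7B/ggml-model-q4_0.bin", lparams);
    if (model == nullptr) {
        llama_backend_free();    // release backend-wide state even on failure
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);

    // ... tokenize, evaluate, sample ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();        // new in this merge: runs after all contexts and models are freed
    return 0;
}
```

The dedicated teardown call appears to be what gives backends with process-wide state, such as an MPI build, a place to shut down cleanly once every context and model has been released.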

examples/main/README.md (+1, -1)

@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/main/main.cpp (+3, -1)

@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/perplexity/perplexity.cpp (+3, -1)

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/quantize/quantize.cpp (+3, -1)

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

-    llama_init_backend(false);
+    llama_backend_init(false);

     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];

@@ -257,5 +257,7 @@ int main(int argc, char ** argv) {
         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
     }

+    llama_backend_free();
+
     return 0;
 }

examples/server/README.md (+1, -1)

@@ -16,7 +16,7 @@ Command line options:
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.

examples/server/server.cpp (+4, -3)

@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
     fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);

@@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 break;
             }
             params.lora_adapter = argv[i];
-            params.use_mmap = false;
         }
         else if (arg == "--lora-base")
         {

@@ -1079,7 +1078,7 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     LOG_INFO("build info", {{"build", BUILD_NUMBER},
                             {"commit", BUILD_COMMIT}});

@@ -1309,5 +1308,7 @@
         return 1;
     }

+    llama_backend_free();
+
     return 0;
 }

examples/simple/simple.cpp (+3, -1)

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -173,6 +173,8 @@
     llama_free( ctx );
     llama_free_model( model );

+    llama_backend_free();
+
     return 0;
 }

ggml-metal.m (+1)

@@ -450,6 +450,7 @@ void ggml_metal_graph_compute(
                 //}

                 switch (dst->op) {
+                    case GGML_OP_NONE:
                     case GGML_OP_RESHAPE:
                     case GGML_OP_VIEW:
                     case GGML_OP_TRANSPOSE:
