Commit 8c2c497

Merge 'origin/master' into hipblas

2 parents e610466 + 2347463, commit 8c2c497

22 files changed: +481, -48 lines

.github/workflows/build.yml (+34)

@@ -104,6 +104,40 @@ jobs:
           cd build
           ctest --verbose --timeout 900

+  ubuntu-latest-cmake-mpi:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ${{ matrix.mpi_library }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_MPI=ON ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
   macOS-latest-make:
     runs-on: macos-latest

.gitignore (+1)

@@ -20,6 +20,7 @@ build-static/
 build-cublas/
 build-opencl/
 build-metal/
+build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt (+24)

@@ -76,6 +76,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -309,6 +310,28 @@ if (LLAMA_METAL)
         )
 endif()

+if (LLAMA_MPI)
+    cmake_minimum_required(VERSION 3.10)
+    find_package(MPI)
+    if (MPI_C_FOUND)
+        message(STATUS "MPI found")
+        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        add_compile_definitions(GGML_USE_MPI)
+        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
+        set(c_flags ${c_flags} -Wno-cast-qual)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+        # Even if you're only using the C header, C++ programs may bring in MPI
+        # C++ functions, so more linkage is needed
+        if (MPI_CXX_FOUND)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
+        endif()
+    else()
+        message(WARNING "MPI not found")
+    endif()
+endif()
+
@@ -509,6 +532,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_MPI}
             ${GGML_SOURCES_EXTRA}
             )

Makefile (+9)

@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 endif # LLAMA_NO_ACCELERATE

+ifdef LLAMA_MPI
+	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	OBJS += ggml-mpi.o
+
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI
+
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	LDFLAGS += -lopenblas
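
Neither the CMake nor the Makefile hunk shows the contents of `ggml-mpi.c`/`ggml-mpi.h` themselves; they only compile that unit, define `GGML_USE_MPI`, and link the MPI libraries. As a purely hypothetical illustration (not the actual `ggml-mpi.c` from this commit), code guarded by that define can use the standard MPI C API, which is exactly what the extra compile definition and link flags make available:

```cpp
// Hypothetical sketch only -- NOT the ggml-mpi.c added by this merge.
// It shows what the GGML_USE_MPI define and MPI link flags enable:
// each process launched by mpirun gets a rank and can coordinate with the others.
#include <cstdio>

#ifdef GGML_USE_MPI
#include <mpi.h>
#endif

int main(int argc, char ** argv) {
#ifdef GGML_USE_MPI
    MPI_Init(&argc, &argv);                 // one process per slot in the hostfile

    int rank = 0;
    int size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   // which process am I?
    MPI_Comm_size(MPI_COMM_WORLD, &size);   // how many processes in total?

    printf("process %d of %d\n", rank, size);
    // ... a real backend would partition the model layers across ranks here ...

    MPI_Finalize();
#else
    printf("built without MPI support\n");
#endif
    return 0;
}
```

Compiled with something like `mpicxx -DGGML_USE_MPI` and launched via `mpirun -n 2`, each rank prints its id; the real MPI backend presumably relies on the same primitives to split work across the processes listed in the hostfile described in the README hunk below.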

README.md (+39)

@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
 ```

+### MPI Build
+
+MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
+
+Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON
+  ```
+
+Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:2
+malvolio.local:1
+```
+
+The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
+
+Finally, you're ready to run a computation using `mpirun`:
+
+```bash
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+```
+
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

examples/common.cpp (+1, -2)

@@ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_adapter = argv[i];
-            params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());

examples/embd-input/embd-input-lib.cpp (+1, -1)

@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

examples/embedding/embedding.cpp (+3, -1)

@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }
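
The same pair of changes, the rename `llama_init_backend` → `llama_backend_init` and a new `llama_backend_free()` call before `return`, repeats in every example below. A minimal sketch of the resulting lifecycle, assuming the model-loading and context calls declared in `llama.h` at the time of this commit (they are not part of this diff):

```cpp
// Sketch of the init/teardown order used by the updated examples.
// Assumed from llama.h of this period: llama_context_default_params,
// llama_load_model_from_file, llama_new_context_with_model.
#include "llama.h"

int main() {
    llama_backend_init(false);   // was llama_init_backend(false); the argument is the NUMA flag

    llama_context_params lparams = llama_context_default_params();

    llama_model * model = llama_load_model_from_file("./models/7B/ggml-model-q4_0.bin", lparams);
    if (model == nullptr) {
        llama_backend_free();    // release backend-wide state even on failure
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);

    // ... tokenize, evaluate, sample ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();        // new in this merge: runs after all contexts and models are freed
    return 0;
}
```

The dedicated teardown call appears to be what gives backends with process-wide state, such as an MPI build, a place to shut down cleanly once every context and model has been released.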

examples/main/README.md (+1, -1)

@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/main/main.cpp (+3, -1)

@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/perplexity/perplexity.cpp (+3, -1)

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

+    llama_backend_free();
+
     return 0;
 }

examples/quantize/quantize.cpp (+3, -1)

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

-    llama_init_backend(false);
+    llama_backend_init(false);

     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];

@@ -257,5 +257,7 @@ int main(int argc, char ** argv) {
         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
     }

+    llama_backend_free();
+
     return 0;
 }

examples/server/README.md (+1, -1)

@@ -16,7 +16,7 @@ Command line options:
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.

examples/server/server.cpp (+4, -3)

@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
     fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);

@@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 break;
             }
             params.lora_adapter = argv[i];
-            params.use_mmap = false;
         }
         else if (arg == "--lora-base")
         {

@@ -1079,7 +1078,7 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     LOG_INFO("build info", {{"build", BUILD_NUMBER},
                             {"commit", BUILD_COMMIT}});

@@ -1309,5 +1308,7 @@
         return 1;
     }

+    llama_backend_free();
+
     return 0;
 }

examples/simple/simple.cpp (+3, -1)

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend(params.numa);
+    llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

@@ -173,6 +173,8 @@
     llama_free( ctx );
     llama_free_model( model );

+    llama_backend_free();
+
     return 0;
 }

ggml-metal.m (+1)

@@ -450,6 +450,7 @@ void ggml_metal_graph_compute(
                 //}

                 switch (dst->op) {
+                    case GGML_OP_NONE:
                     case GGML_OP_RESHAPE:
                     case GGML_OP_VIEW:
                     case GGML_OP_TRANSPOSE:
