
Commit 254a7a7

CUDA full GPU acceleration, KV cache in VRAM (ggml-org#1827)
* Fixed CUDA RoPE
* ggml_cuda_mul_mat_vec_p021
* ggml_cuda_scale
* ggml_cuda_diag_mask_inf
* ggml_is_permuted
* ggml_cuda_cpy
* flatten rows for ggml_cuda_op
* Added a --low-vram option
* Fixed Windows performance
* Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM

1 parent: 9254920, commit: 254a7a7

11 files changed: +853, -149 lines

examples/common.cpp  (+8)

@@ -331,6 +331,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
@@ -479,6 +485,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -528,6 +535,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram     = params.low_vram;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
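The parsing hunk above gates the new flag behind GGML_USE_CUBLAS, so `--low-vram` / `-lv` degrades to a warning in builds without the CUDA backend. Below is a minimal standalone sketch of that same guard-and-parse pattern; the `options` struct and `parse_args` helper are illustrative stand-ins, not llama.cpp code.

    #include <cstdio>
    #include <cstring>

    // Hypothetical option holder; mirrors the role of gpt_params::low_vram.
    struct options {
        bool low_vram = false;
    };

    static void parse_args(int argc, char ** argv, options & opts) {
        for (int i = 1; i < argc; i++) {
            if (std::strcmp(argv[i], "--low-vram") == 0 || std::strcmp(argv[i], "-lv") == 0) {
    #ifdef GGML_USE_CUBLAS
                // Only meaningful when the CUDA backend is compiled in.
                opts.low_vram = true;
    #else
                std::fprintf(stderr, "warning: built without cuBLAS, --low-vram has no effect\n");
    #endif
            }
        }
    }

    int main(int argc, char ** argv) {
        options opts;
        parse_args(argc, argv, opts);
        std::printf("low_vram = %d\n", opts.low_vram ? 1 : 0);
        return 0;
    }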

examples/common.h  (+9, -8)

@@ -21,15 +21,16 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed         = -1;  // RNG seed
-    int32_t n_threads    = get_num_physical_cores();
-    int32_t n_predict    = -1;  // new tokens to predict
-    int32_t n_ctx        = 512; // context size
-    int32_t n_batch      = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep       = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers = 0;   // number of layers to store in VRAM
-    int32_t main_gpu     = 0;   // the GPU that is used for scratch and small tensors
+    int32_t seed                           = -1;   // RNG seed
+    int32_t n_threads                      = get_num_physical_cores();
+    int32_t n_predict                      = -1;   // new tokens to predict
+    int32_t n_ctx                          = 512;  // context size
+    int32_t n_batch                        = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                         = 0;    // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers                   = 0;    // number of layers to store in VRAM
+    int32_t main_gpu                       = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool    low_vram                       = 0;    // if true, reduce VRAM usage at the cost of performance
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
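The new `low_vram` field only takes effect once it is copied into `llama_context_params`, as the `llama_init_from_gpt_params` hunk in examples/common.cpp does above. A rough sketch of that flow, assuming the llama.h API of this revision (`llama_context_default_params`, `llama_init_from_file`) and using only fields visible in this diff plus `n_ctx` and `model`; loading and error handling are omitted.

    #include "common.h"
    #include "llama.h"

    // Hypothetical helper mirroring llama_init_from_gpt_params().
    static struct llama_context * init_ctx_from_params(const gpt_params & params) {
        llama_context_params lparams = llama_context_default_params();

        lparams.n_ctx        = params.n_ctx;        // context size
        lparams.n_gpu_layers = params.n_gpu_layers; // layers offloaded to VRAM
        lparams.main_gpu     = params.main_gpu;     // GPU for scratch and small tensors
        lparams.low_vram     = params.low_vram;     // new: skip the VRAM scratch buffer
        lparams.f16_kv       = params.memory_f16;   // KV cache precision
        lparams.use_mmap     = params.use_mmap;

        return llama_init_from_file(params.model.c_str(), lparams);
    }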

examples/main/README.md  (+1)

@@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
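As a usage illustration of the new `-lv, --low-vram` option documented above (the model path and prompt are placeholders, not part of this change), layer offloading combined with reduced VRAM usage might be invoked as:

    ./main -m models/7B/ggml-model-q4_0.bin -ngl 32 --low-vram -p "Once upon a time"

With `--low-vram` the scratch buffer for temporary results stays out of VRAM, so prompt processing is slower in exchange for a smaller VRAM footprint.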

examples/server/README.md  (+1)

@@ -289,6 +289,7 @@ Test();
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 - `--port`: Set the port to listen. Default: `8080`.
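For example (the model path is a placeholder; `--host` and `--port` are shown with their documented defaults), a reduced-VRAM server launch might look like:

    ./server -m models/7B/ggml-model-q4_0.bin -ngl 32 --low-vram --host 127.0.0.1 --port 8080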

examples/server/server.cpp  (+9)

@@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
@@ -537,6 +538,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
             }
 #else
             fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        }
+        else if (arg == "--low-vram" || arg == "-lv")
+        {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--main-gpu" || arg == "-mg")
