Commit 905d87b

ggml : GPU-accelerated token generation (#1412)
* CUDA kernel for q4_0 dequant. + mat. vec. mult.
* Added q4_1 via template
* Added missing __syncthreads();
* --gpu_layers -> --gpu-layers
* Shorter dequantize_mul_mat_vec line
* q5_0 dequantize_mul_mat kernel
* More readable dequantize_mul_mat_vec logic
* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16
* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent f954edd commit 905d87b

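The heart of the change is the family of fused dequantize_mul_mat_vec kernels named in the commit message: instead of dequantizing a quantized weight matrix up front and then running a separate matrix-vector product, each kernel unpacks quantized blocks on the fly while accumulating the dot product for one output row. The sketch below only illustrates that pattern; the block layout (one float scale plus 32 packed 4-bit quants, dequantized as (q - 8) * d, in the spirit of q4_0), the identifiers block_q4_sketch and dequantize_mul_mat_vec_sketch, and the one-CUDA-block-per-row launch are simplifying assumptions rather than the actual ggml-cuda code.

// Sketch of a fused dequantize + mat-vec kernel (not the ggml-cuda implementation).
// Launch: dequantize_mul_mat_vec_sketch<<<nrows, BLOCK_THREADS>>>(W, x, y, ncols);
// ncols is assumed to be a multiple of QK.
#include <stdint.h>
#include <cuda_runtime.h>

#define QK            32   // quants per block (q4_0-style block size)
#define BLOCK_THREADS 32   // threads per CUDA block; launch with exactly this many

typedef struct {
    float   d;             // per-block scale
    uint8_t qs[QK / 2];    // 32 quants, two 4-bit values per byte
} block_q4_sketch;

// y[row] = sum over columns of dequant(W[row][col]) * x[col]
// One CUDA block per output row; threads stride over that row's quant blocks.
__global__ void dequantize_mul_mat_vec_sketch(const block_q4_sketch * W,
                                              const float * x, float * y, int ncols) {
    const int row     = blockIdx.x;
    const int tid     = threadIdx.x;
    const int nblocks = ncols / QK;

    float sum = 0.0f;
    for (int ib = tid; ib < nblocks; ib += blockDim.x) {
        const block_q4_sketch b = W[row * nblocks + ib];
        for (int j = 0; j < QK / 2; ++j) {
            const int q0 = (b.qs[j] & 0x0F) - 8; // low nibble  -> even element
            const int q1 = (b.qs[j] >>   4) - 8; // high nibble -> odd element
            sum += b.d * (q0 * x[ib*QK + 2*j] + q1 * x[ib*QK + 2*j + 1]);
        }
    }

    // block-wide tree reduction in shared memory; the __syncthreads() barriers
    // keep all threads in step between reduction rounds
    __shared__ float tmp[BLOCK_THREADS];
    tmp[tid] = sum;
    __syncthreads();
    for (int s = BLOCK_THREADS / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        y[row] = tmp[0];
    }
}

Fusing dequantization into the product avoids materializing a full-precision copy of the weights, so the offloaded layers can stay in their compact quantized form in VRAM.
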
File tree

8 files changed: +336 −42 lines changed

examples/common.cpp

+17 −8
@@ -277,6 +277,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_gpu_layers = std::stoi(argv[i]);
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
@@ -421,6 +427,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
+    fprintf(stderr, " number of layers to store in VRAM\n");
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@@ -463,14 +471,15 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
-    lparams.n_ctx      = params.n_ctx;
-    lparams.n_parts    = params.n_parts;
-    lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
-    lparams.use_mmap   = params.use_mmap;
-    lparams.use_mlock  = params.use_mlock;
-    lparams.logits_all = params.perplexity;
-    lparams.embedding  = params.embedding;
+    lparams.n_ctx        = params.n_ctx;
+    lparams.n_parts      = params.n_parts;
+    lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.seed         = params.seed;
+    lparams.f16_kv       = params.memory_f16;
+    lparams.use_mmap     = params.use_mmap;
+    lparams.use_mlock    = params.use_mlock;
+    lparams.logits_all   = params.perplexity;
+    lparams.embedding    = params.embedding;
 
     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
 
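The last hunk above is the API-visible piece of this change in common.cpp: llama_context_params gains an n_gpu_layers field that is copied from gpt_params before the context is created. For a program driving llama.cpp through the C API directly, opting in is a one-line addition; in the sketch below the model path and the layer count are placeholders, and real error handling and generation logic are trimmed.

// Minimal sketch: request GPU offload of N layers when creating a llama context.
// "models/7B/ggml-model-q4_0.bin" and the value 32 are placeholders.
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.n_gpu_layers = 32; // layers to store in VRAM; the default of 0 offloads nothing

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... tokenize the prompt, run llama_eval(), sample tokens ...

    llama_free(ctx);
    return 0;
}
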
examples/common.h

+6 −5
@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed = -1; // RNG seed
+    int32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx = 512; // context size
+    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers = 0; // number of layers to store in VRAM
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

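With the default of n_gpu_layers = 0 nothing is offloaded, so GPU acceleration is opted into per run via the new switch, along the lines of the following invocation (the binary name, model path, prompt, and layer count here are placeholders):

./main -m models/7B/ggml-model-q4_0.bin -ngl 32 -p "..."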