Skip to content

Commit 32c5411

Browse files
authored
Revert "Support using mmap when applying LoRA (#2095)" (#2206)
The reverted change causes a performance regression when mlock is used. This reverts commit 2347463.
1 parent ff5d58f commit 32c5411

File tree

5 files changed

+9
-7
lines changed

5 files changed

+9
-7
lines changed

examples/common.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
285285
break;
286286
}
287287
params.lora_adapter = argv[i];
288+
params.use_mmap = false;
288289
} else if (arg == "--lora-base") {
289290
if (++i >= argc) {
290291
invalid_param = true;
@@ -520,7 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
520521
fprintf(stderr, " --mtest compute maximum memory usage\n");
521522
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
522523
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
523-
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
524+
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
524525
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
525526
fprintf(stderr, " -m FNAME, --model FNAME\n");
526527
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());

examples/main/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
293293
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
294294
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
295295
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
296-
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
296+
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
297297
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/server/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Command line options:
1616
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
1717
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
1818
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
19-
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
19+
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
2020
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
2121
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
2222
- `--host`: Set the hostname or IP address to listen on. Default `127.0.0.1`.

examples/server/server.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
632632
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
633633
fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
634634
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
635-
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
635+
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
636636
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
637637
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
638638
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
@@ -820,6 +820,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
820820
break;
821821
}
822822
params.lora_adapter = argv[i];
823+
params.use_mmap = false;
823824
}
824825
else if (arg == "--lora-base")
825826
{

llama-util.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -175,13 +175,13 @@ struct llama_mmap {
175175
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
176176
size = file->size;
177177
int fd = fileno(file->fp);
178-
int flags = MAP_PRIVATE;
178+
int flags = MAP_SHARED;
179179
// prefetch/readahead impairs performance on NUMA systems
180180
if (numa) { prefetch = 0; }
181181
#ifdef __linux__
182182
if (prefetch) { flags |= MAP_POPULATE; }
183183
#endif
184-
addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
184+
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
185185
if (addr == MAP_FAILED) {
186186
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
187187
}
@@ -223,7 +223,7 @@ struct llama_mmap {
223223
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
224224
}
225225

226-
addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
226+
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
227227
error = GetLastError();
228228
CloseHandle(hMapping);
229229

0 commit comments

Comments
 (0)