
Commit f963b63

comex authored and blackhole89 committed
Rewrite loading code to try to satisfy everyone:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.)
- Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported; see the mmap/prefetch sketch below).
- Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts` (see the format/part-detection sketch below).
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on Unix; see the locking sketch below).
- Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...)
  - To help implement this, move mlock support from ggml to the loading code.
- madvise/PrefetchVirtualMemory support (based on #740)
- Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way').

Implementation notes: I tried to factor the code into more discrete pieces than before.

Regarding code style: I tried to follow the code style, but I'm naughty and used a few advanced C++ features repeatedly:

- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure. (The exceptions are converted to error codes at the API boundary.)

Co-authored-by: Pavol Rusnak <[email protected]> (for the bit I copied from #740)
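A minimal sketch of what magic/version detection and automatic part counting can look like, assuming the multi-character-literal magics ('ggml', 'ggmf', 'ggjt') that motivate the new `-Wno-multichar` flag below; the names `read_file_version` and `count_parts` are illustrative, not the actual llama.cpp internals, and the real loader may discover parts differently.

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

enum class llama_file_version { GGML_UNVERSIONED, GGMF_V1, GGJT_V1 };

// Read the 4-byte magic (and, for versioned formats, the 4-byte version)
// from the start of a model file and classify it.
static llama_file_version read_file_version(std::FILE * fp) {
    uint32_t magic = 0, version = 0;
    if (std::fread(&magic, sizeof(magic), 1, fp) != 1) {
        throw std::runtime_error("failed to read file magic");
    }
    if (magic == 'ggml') {                  // oldest format: no version field
        return llama_file_version::GGML_UNVERSIONED;
    }
    if (std::fread(&version, sizeof(version), 1, fp) != 1) {
        throw std::runtime_error("failed to read file version");
    }
    if (magic == 'ggmf' && version == 1) return llama_file_version::GGMF_V1;
    if (magic == 'ggjt' && version == 1) return llama_file_version::GGJT_V1;
    throw std::runtime_error("unknown (magic, version) combination");
}

// One plausible way to drop --n_parts: probe for "<base>.1", "<base>.2", ...
// next to the base file and count how many exist.
static int count_parts(const std::string & base) {
    int n = 1;
    for (;; n++) {
        std::FILE * fp = std::fopen((base + "." + std::to_string(n)).c_str(), "rb");
        if (!fp) break;
        std::fclose(fp);
    }
    return n;
}
```

The exceptions here stand in for the commit's approach of throwing inside the loader and converting to error codes at the API boundary.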
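A POSIX-only sketch of the mmap-plus-prefetch idea: map the file read-only, reuse the descriptor already opened with `fopen` for the metadata reads, and treat the prefetch hint as advisory. The Win32 path would use CreateFileMapping/MapViewOfFile and PrefetchVirtualMemory instead. `mapped_file` is a hypothetical name, not the type the commit adds in llama_util.h.

```cpp
#include <cstddef>
#include <cstdio>
#include <stdexcept>

#if defined(__unix__) || defined(__APPLE__)
#include <sys/mman.h>

struct mapped_file {
    void * addr = nullptr;
    size_t size = 0;

    mapped_file(std::FILE * fp, size_t file_size, bool prefetch) : size(file_size) {
        // Reuse the fd behind the FILE* instead of opening the file a second time.
        addr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fileno(fp), 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error("mmap failed");
        }
        if (prefetch) {
            // Advisory only: if the hint fails, the mapping still works.
            madvise(addr, size, MADV_WILLNEED);
        }
    }

    ~mapped_file() {
        // Destructor-based cleanup, in the spirit of the commit's RAII style.
        if (addr) { munmap(addr, size); }
    }
};
#endif
```

Callers that hit a pre-ggjt file, pass `--no-mmap`, or run on a platform without mmap would fall back to plain reads into a ggml buffer, as described above.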
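And a rough sketch of a platform-neutral lock in the direction the commit takes by moving mlock out of ggml: mlock/munlock on Unix-likes, VirtualLock/VirtualUnlock on Windows, behind the single `--mlock` flag. `memory_lock` is an illustrative name; the real code likely also needs to grow the process working set on Windows (the old TODO removed from ggml.c mentions SetProcessWorkingSetSize) and keeps the RLIMIT_MLOCK hint that used to live there.

```cpp
#include <cstddef>
#include <cstdio>

#if defined(_WIN32)
#include <windows.h>
#elif defined(__unix__) || defined(__APPLE__)
#include <sys/mman.h>
#endif

struct memory_lock {
    void * addr = nullptr;
    size_t size = 0;

    // Try to pin [ptr, ptr+len) in RAM; on failure just warn and carry on.
    void lock(void * ptr, size_t len) {
        bool ok = false;
#if defined(_WIN32)
        ok = VirtualLock(ptr, len) != 0;
        if (!ok) std::fprintf(stderr, "warning: VirtualLock failed\n");
#elif defined(__unix__) || defined(__APPLE__)
        ok = mlock(ptr, len) == 0;
        if (!ok) std::fprintf(stderr, "warning: mlock failed; try raising RLIMIT_MLOCK (ulimit -l)\n");
#else
        (void) ptr; (void) len;
        std::fprintf(stderr, "warning: memory locking is not supported on this platform\n");
#endif
        if (ok) { addr = ptr; size = len; }
    }

    ~memory_lock() {
        if (!addr) return;
#if defined(_WIN32)
        VirtualUnlock(addr, size);
#elif defined(__unix__) || defined(__APPLE__)
        munlock(addr, size);
#endif
    }
};
```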
1 parent aaf3b23 commit f963b63

File tree

14 files changed: +1209 -829 lines changed


CMakeLists.txt

+8-1
@@ -140,6 +140,7 @@ if (LLAMA_ALL_WARNINGS)
             -Wpedantic
             -Wcast-qual
             -Wno-unused-function
+            -Wno-multichar
         )
     else()
         # todo : msvc
@@ -152,6 +153,10 @@ if (LLAMA_ALL_WARNINGS)
 
 endif()
 
+if (MSVC)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
 if (LLAMA_LTO)
     include(CheckIPOSupported)
     check_ipo_supported(RESULT result OUTPUT output)
@@ -241,7 +246,9 @@ endif()
 
 add_library(llama
             llama.cpp
-            llama.h)
+            llama.h
+            llama_internal.h
+            llama_util.h)
 
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump

Makefile

+2-2
@@ -37,7 +37,7 @@ LDFLAGS =
 
 # warnings
 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
 
 # OS specific
 # TODO: support Windows
@@ -142,7 +142,7 @@ default: main quantize perplexity embedding
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
 
-llama.o: llama.cpp llama.h
+llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
 common.o: examples/common.cpp examples/common.h

examples/common.cpp

+6-3
@@ -1,7 +1,5 @@
 #include "common.h"
 
-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -161,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
            params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
         } else if (arg == "--verbose-prompt") {
@@ -240,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");

examples/common.h

+1
@@ -47,6 +47,7 @@ struct gpt_params {
     bool instruct       = false; // instruction mode (used for Alpaca models)
     bool ignore_eos     = false; // do not stop generating after eos
     bool perplexity     = false; // compute perplexity over the prompt
+    bool use_mmap       = true;  // use mmap for faster loads
     bool use_mlock      = false; // use mlock to keep model in memory
     bool mem_test       = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation

examples/embedding/embedding.cpp

+1
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
     lparams.seed       = params.seed;
     lparams.f16_kv     = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.embedding  = params.embedding;
 
examples/main/main.cpp

+1
@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
     lparams.n_parts   = params.n_parts;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
 
     ctx = llama_init_from_file(params.model.c_str(), lparams);

examples/perplexity/perplexity.cpp

+1
@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
     lparams.seed       = params.seed;
     lparams.f16_kv     = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.embedding  = params.embedding;
 
examples/quantize-stats/quantize-stats.cpp

+4-5
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+#include "llama_internal.h"
 
 #include <algorithm>
 #include <cassert>
@@ -266,15 +267,13 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // Sort tensors for consistent output
-    const auto tensors = llama_internal_get_tensor_map(ctx);
-    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
+    const auto &tensors = llama_internal_get_tensor_map(ctx);
 
     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors_sorted) {
+    for (const auto& kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -315,7 +314,7 @@ int main(int argc, char ** argv) {
 
     error_stats global_stats {};
 
-    for (const auto& kv_tensor : tensors_sorted) {
+    for (const auto& kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }

ggml.c

-78
@@ -97,17 +97,6 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
-#define GGML_MLOCK_SUPPORT 0
-
-#ifdef __has_include
-#if __has_include(<sys/mman.h>)
-#undef GGML_MLOCK_SUPPORT
-#define GGML_MLOCK_SUPPORT 1
-#include <sys/mman.h>
-#endif
-#endif
-
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
 static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
 
-//
-// ggml object
-//
-
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
-};
-
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
@@ -2716,7 +2690,6 @@ struct ggml_context {
     size_t mem_size;
     void * mem_buffer;
     bool   mem_buffer_owned;
-    bool   mem_buffer_mlocked;
     bool   no_alloc;
 
     int    n_objects;
@@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_size           =*/ params.mem_size,
         /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.mem_buffer_mlocked =*/ false,
         /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
         /*.objects_begin      =*/ NULL,
@@ -3036,14 +3008,6 @@ void ggml_free(struct ggml_context * ctx) {
             GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
                     __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
-#if GGML_MLOCK_SUPPORT
-            if (ctx->mem_buffer_mlocked) {
-                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
-                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
-                }
-            }
-#endif
-
             if (ctx->mem_buffer_owned) {
                 free(ctx->mem_buffer);
             }
@@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-bool ggml_mlock_supported(void) {
-    return GGML_MLOCK_SUPPORT;
-}
-
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p) {
-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        return true;
-    }
-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
-        (opt_extra_len &&
-         mlock(opt_extra_addr, opt_extra_len))) {
-        if ((*err_p = malloc(1024))) {
-            snprintf(*err_p, 1024,
-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
-                     ctx->mem_size + opt_extra_len,
-                     strerror(errno));
-        }
-        return false;
-    }
-    ctx->mem_buffer_mlocked = true;
-    return true;
-#else // GGML_MLOCK_SUPPORT
-    *err_p = strdup("can't mlock because it's not supported on this system");
-    return false;
-#endif // GGML_MLOCK_SUPPORT
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(

ggml.h

+13-7
@@ -253,6 +253,19 @@ enum ggml_op {
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
@@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
