
Commit 7a3d6b6

ggerganov and cebtenzzre authored and committed
ggml : quantization refactoring (ggml-org#3833)
* ggml : factor all quantization code in ggml-quants

ggml-ci

* ggml-quants : fix Zig and Swift builds + quantize tool

ggml-ci

* quantize : --pure option for disabling k-quant mixtures

---------

Co-authored-by: cebtenzzre <[email protected]>
1 parent 77864df commit 7a3d6b6

File tree

11 files changed, +4073 −4086 lines

Diff for: CMakeLists.txt

+4 −8
@@ -94,7 +94,6 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})

@@ -278,13 +277,8 @@ if (LLAMA_BLAS)
     endif()
 endif()
 
-if (LLAMA_K_QUANTS)
-    set(GGML_HEADERS_EXTRA k_quants.h)
-    set(GGML_SOURCES_EXTRA k_quants.c)
-    add_compile_definitions(GGML_USE_K_QUANTS)
-    if (LLAMA_QKK_64)
-        add_compile_definitions(GGML_QKK_64)
-    endif()
+if (LLAMA_QKK_64)
+    add_compile_definitions(GGML_QKK_64)
 endif()
 
 if (LLAMA_CUBLAS)

@@ -673,6 +667,8 @@ add_library(ggml OBJECT
             ggml-alloc.h
             ggml-backend.c
             ggml-backend.h
+            ggml-quants.c
+            ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
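With LLAMA_K_QUANTS removed, a plausible configure step after this change looks like the sketch below. The build directory name is arbitrary and only the surviving LLAMA_QKK_64 toggle from the diff is shown; this is typical usage, not part of the commit.

# k-quant code in ggml-quants.c is now always compiled; only the
# super-block-size toggle remains a CMake option.
cmake -B build -DLLAMA_QKK_64=ON
cmake --build build --config Release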

Diff for: Makefile

+6 −12
@@ -342,13 +342,9 @@ else
     MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifndef LLAMA_NO_K_QUANTS
-    MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-    OBJS += k_quants.o
 ifdef LLAMA_QKK_64
     MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif
 
 ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.

@@ -365,7 +361,7 @@ ifdef LLAMA_MPI
     MK_CPPFLAGS += -DGGML_USE_MPI
     MK_CFLAGS   += -Wno-cast-qual
     MK_CXXFLAGS += -Wno-cast-qual
-    OBJS += ggml-mpi.o
+    OBJS += ggml-mpi.o
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENBLAS

@@ -382,7 +378,7 @@ endif # LLAMA_BLIS
 ifdef LLAMA_CUBLAS
     MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
     MK_LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
-    OBJS += ggml-cuda.o
+    OBJS += ggml-cuda.o
     NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
     NVCC = $(LLAMA_CUDA_NVCC)

@@ -497,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-    $(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
-
 # combine build flags with cmdline overrides
 override CFLAGS   := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)

@@ -542,7 +533,10 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
     $(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+    $(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
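For the Makefile build, the LLAMA_NO_K_QUANTS switch disappears entirely and ggml-quants.o is always built and linked. Something along these lines should be the equivalent invocation after this change; the LLAMA_QKK_64 variable is the one kept in the diff, and everything else is a plain rebuild:

# no more LLAMA_NO_K_QUANTS; ggml-quants.o is unconditional
make clean
make LLAMA_QKK_64=1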

Diff for: Package.swift

+1 −2
@@ -42,13 +42,12 @@ let package = Package(
                 "llama.cpp",
                 "ggml-alloc.c",
                 "ggml-backend.c",
-                "k_quants.c",
+                "ggml-quants.c",
             ] + additionalSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
                 // We should consider add this in the future when we drop support for iOS 14
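For Swift consumers nothing changes beyond the renamed source file and the dropped GGML_USE_K_QUANTS define; a plain SwiftPM build as sketched below should behave as before (shown only as a sanity check, not taken from the commit):

# GGML_USE_K_QUANTS is no longer defined; ggml-quants.c compiles unconditionally
swift build -c release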

Diff for: build.zig

+8 −13
@@ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void {
     var make = try Maker.init(b);
     make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
 
-    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-        try make.addFlag("-DGGML_USE_K_QUANTS");
-        const k_quants = make.obj("k_quants", "k_quants.c");
-        try make.objs.append(k_quants);
-    }
-
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");

@@ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
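The Zig build likewise drops its "k-quants" option, since the new ggml-quants object is compiled and linked into every executable unconditionally. An invocation along these lines, using the lto option that remains in build.zig, should still work; the flag value is just an example:

# -Dk-quants no longer exists; ggml-quants is always part of the link
zig build -Dlto=true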

Diff for: examples/quantize/quantize.cpp

+5 −4
@@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
-#ifdef GGML_USE_K_QUANTS
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },

@@ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
-#endif
     { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

@@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }
 
 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {

@@ -103,6 +102,8 @@ int main(int argc, char ** argv) {
             params.quantize_output_tensor = false;
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
+        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
+            params.pure = true;
         } else {
             usage(argv[0]);
         }
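Based on the usage string above, a hypothetical invocation of the new flag could look like this; the model paths are the placeholders from the usage comment, Q5_K_M is one of the types listed in QUANT_OPTIONS, and 8 threads is an arbitrary choice:

# --pure disables the k-quant mixture logic and quantizes all tensors to the requested type
./quantize --pure models/llama/ggml-model.gguf models/llama/ggml-model-quant.gguf Q5_K_M 8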
