
Commit 8a86b95

quantize : --pure option for disabling k-quant mixtures
1 parent ee37e35 commit 8a86b95

3 files changed (+10 −3 lines)


Diff for: examples/quantize/quantize.cpp

+5 −2

@@ -68,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }
 
 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
@@ -101,6 +102,8 @@ int main(int argc, char ** argv) {
             params.quantize_output_tensor = false;
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
+        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
+            params.pure = true;
         } else {
             usage(argv[0]);
         }
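
For illustration, a hypothetical invocation of the new flag (the model paths are placeholders, and Q4_K stands in for any type listed under "Allowed quantization types"):

./quantize --pure models/llama/ggml-model-f16.gguf models/llama/ggml-model-q4_k.gguf Q4_K 8

With --pure, every quantized tensor gets the base Q4_K type; without it, get_k_quant_type may assign selected tensors a different k-quant type, producing a mixture.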

Diff for: llama.cpp

+4 −1

@@ -8380,7 +8380,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         if (quantize) {
             new_type = quantized_type;
-            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            if (!params->pure) {
+                new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8835,6 +8837,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.allow_requantize          =*/ false,
         /*.quantize_output_tensor    =*/ true,
         /*.only_copy                 =*/ false,
+        /*.pure                      =*/ false,
     };
 
     return result;
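
For library consumers, a minimal sketch of setting the new flag through the C API. This is a sketch under assumptions: the file paths and the LLAMA_FTYPE_MOSTLY_Q4_K_M target are placeholders, and llama_backend_init taking a NUMA bool reflects the API as of this commit.

// Sketch: quantize a model with k-quant mixtures disabled via the new `pure` flag.
// Input/output paths and the target ftype below are placeholder assumptions.
#include "llama.h"

int main() {
    llama_backend_init(false); // `false` = no NUMA optimizations

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base quantization target
    params.pure  = true;                      // skip per-tensor get_k_quant_type overrides

    // llama_model_quantize returns 0 on success
    const int rc = llama_model_quantize("ggml-model-f16.gguf", "ggml-model-q4_k.gguf", &params);

    llama_backend_free();
    return rc;
}

Since llama_model_quantize_default_params() initializes pure to false, existing callers keep the mixture behavior unless they opt in.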

Diff for: llama.h

+1
@@ -191,6 +191,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
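
On the public surface: pure is appended after only_copy, mirroring the initializer order added to llama_model_quantize_default_params() in the llama.cpp hunk, so code built against the updated header can set the field directly, as in the sketch above.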
