
Commit 1e4f89a

KerfuffleV2 authored and Nexesenex committed

llama : allow quantizing k-quants to fall back when tensor size incompatible (ggml-org#3747)

* Allow quantizing k-quants to fall back when tensor size incompatible
* quantizing: Add warning when tensors were incompatible with k-quants
  Clean up k-quants state passing a bit

1 parent 349c1e8 commit 1e4f89a

File tree

1 file changed: +65 -43 lines changed


Diff for: llama.cpp (+65 -43)
@@ -8307,6 +8307,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized = 0;
+    int n_fallback = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
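The hunk above is the core of the cleanup: quantize_state_internal bundles the per-run counters that were previously threaded through get_k_quant_type as separate pointer and value arguments. As an illustration only (hypothetical names, not part of the commit), a minimal compilable sketch of the same pattern:

#include <cstdio>

// Hypothetical stand-ins for the real model/params types.
struct model_t  { int n_layer = 32; };
struct params_t { bool pure = false; };

// Same idea as quantize_state_internal: read-only references plus
// per-run counters, constructed once and passed by reference.
struct quant_state {
    const model_t  & model;
    const params_t * params;

    int i_attention_wv = 0;   // running index of attn_v tensors seen
    int n_fallback     = 0;   // tensors that needed a fallback type

    quant_state(const model_t & model, const params_t * params)
        : model(model), params(params) {}
};

// Before: choose_type(new_type, tensor, model, ftype, &i_wv, n_wv, ...).
// After: one state argument carries all of that.
static void process_tensor(quant_state & qs, bool incompatible) {
    ++qs.i_attention_wv;
    if (incompatible) {
        ++qs.n_fallback;
    }
}

int main() {
    model_t  model;
    params_t params;
    quant_state qs(model, &params);

    process_tensor(qs, /*incompatible=*/false);
    process_tensor(qs, /*incompatible=*/true);

    printf("processed %d attn_v tensors, %d needed fallback\n",
           qs.i_attention_wv, qs.n_fallback);
    return 0;
}

Passing one mutable state reference keeps every call site stable when new counters are added, which is exactly what this commit needs for n_k_quantized and n_fallback.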
@@ -8367,20 +8385,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
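The use_more_bits lambda shown in this hunk is unchanged by the commit, but it drives several of the per-layer decisions in the next hunk: roughly the first and last eighth of the layers, plus every third layer in between, get promoted to a wider type. A standalone sketch (assuming a hypothetical 32-layer model, for illustration only) that prints which layer indices qualify:

#include <cstdio>

// Same predicate as the use_more_bits lambda in get_k_quant_type.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer <  num_layers/8 ||
           i_layer >= 7*num_layers/8 ||
           (i_layer - num_layers/8) % 3 == 2;
}

int main() {
    const int n_layers = 32; // assumed layer count, purely illustrative
    printf("layers promoted to more bits:");
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            printf(" %d", i);
        }
    }
    printf("\n");
    return 0;
}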
@@ -8389,46 +8408,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                    : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                    : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8455,20 +8474,23 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
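This hunk carries the behavioral change named in the commit title: instead of aborting (or special-casing only the output and token-embedding tensors), any tensor whose column count is not a multiple of QK_K now falls back to a legacy quantization of comparable width (Q2_K to Q4_0, Q3_K to Q4_1, Q4_K to Q5_0, Q5_K to Q5_1, Q6_K to Q8_0), and the choice is logged and counted. A self-contained sketch of the same mapping, using hypothetical enum names and assuming the usual QK_K super-block size of 256:

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for ggml_type; only the members used here.
enum qtype { Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0 };

static const char * qtype_name(qtype t) {
    switch (t) {
        case Q2_K: return "Q2_K"; case Q3_K: return "Q3_K";
        case Q4_K: return "Q4_K"; case Q5_K: return "Q5_K";
        case Q6_K: return "Q6_K"; case Q4_0: return "Q4_0";
        case Q4_1: return "Q4_1"; case Q5_0: return "Q5_0";
        case Q5_1: return "Q5_1"; case Q8_0: return "Q8_0";
    }
    return "?";
}

// k-quants need the row size (ne[0]) to be a multiple of the super-block
// size QK_K. If it is not, fall back to a legacy quant of similar bit
// width, mirroring the switch added to get_k_quant_type.
static qtype pick_type(qtype wanted, int ne0, int qk_k = 256) {
    if (ne0 % qk_k == 0) {
        return wanted; // compatible: keep the k-quant type
    }
    switch (wanted) {
        case Q2_K: return Q4_0;
        case Q3_K: return Q4_1;
        case Q4_K: return Q5_0;
        case Q5_K: return Q5_1;
        case Q6_K: return Q8_0;
        default: throw std::runtime_error("unsupported tensor size");
    }
}

int main() {
    // 4096 columns divide evenly by 256; 4544 (a Falcon-like width) do not.
    printf("4096 cols, want Q4_K -> %s\n", qtype_name(pick_type(Q4_K, 4096)));
    printf("4544 cols, want Q4_K -> %s\n", qtype_name(pick_type(Q4_K, 4544)));
    return 0;
}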
@@ -8526,6 +8548,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8539,29 +8563,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
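The pre-pass above now accumulates its tensor counts directly into qs rather than into local variables that later had to be handed to get_k_quant_type. A small sketch of the same name-matching pass over a made-up tensor list (illustrative names only; the real loop walks ml.get_tensor_meta(i)):

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Hypothetical tensor names in the GGUF "blk.N.*" style.
    const std::vector<std::string> names = {
        "blk.0.attn_v.weight", "blk.0.ffn_down.weight",
        "blk.1.attn_qkv.weight", "blk.1.ffn_down.weight",
        "output.weight",
    };

    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;

    for (const auto & name : names) {
        if (name.find("attn_v.weight")   != std::string::npos ||
            name.find("attn_qkv.weight") != std::string::npos) {
            ++n_attention_wv;
        } else if (name.find("ffn_down.weight") != std::string::npos) {
            ++n_feed_forward_w2;
        }
    }

    // The real code warns when these disagree with hparams.n_layer.
    printf("n_attention_wv = %d, n_feed_forward_w2 = %d\n",
           n_attention_wv, n_feed_forward_w2);
    return 0;
}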
@@ -8628,9 +8646,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8756,6 +8772,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+            __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
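The final hunk adds a summary once quantization finishes: the warning reports how many tensors fell back out of everything that went through the k-quant path, i.e. n_k_quantized + n_fallback. A trivial sketch with assumed counts, only to make the denominator explicit:

#include <cstdio>

int main() {
    // Assumed counts for illustration: 289 tensors took a k-quant type,
    // 2 were incompatible and fell back to a legacy quant.
    int n_k_quantized = 289;
    int n_fallback    = 2;

    if (n_fallback > 0) {
        printf("WARNING: %d of %d tensor(s) incompatible with k-quants "
               "and required fallback quantization\n",
               n_fallback, n_k_quantized + n_fallback);
    }
    return 0;
}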
