Commit 4059df1

quantizing: Add warning when tensors were incompatible with k-quants
Clean up k-quants state passing a bit
1 parent 7f20d78 commit 4059df1
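
Note on the cleanup half of the change: get_k_quant_type previously took its per-run counters as two value parameters plus two out-pointers; this commit bundles them, together with the new n_k_quantized / n_fallback totals, into a quantize_state_internal object that is passed by reference. A minimal standalone sketch of that pattern, using hypothetical names rather than the actual llama.cpp types, might look like this:

    // Sketch only: toy stand-ins for quantize_state_internal / get_k_quant_type,
    // showing counters carried in one struct passed by reference instead of
    // separate value parameters and out-pointers.
    #include <cstdio>

    struct quant_state {
        int n_attention_wv = 0;   // how many attn_v tensors the model has
        int i_attention_wv = 0;   // how many have been processed so far
    };

    // before: choose_type(base, &i_attention_wv, n_attention_wv, ...)
    // after:  choose_type(qs, base)
    static int choose_type(quant_state & qs, int base_type) {
        // toy rule: give the first two attn_v tensors one "step" more precision
        const int chosen = qs.i_attention_wv < 2 ? base_type + 1 : base_type;
        ++qs.i_attention_wv;
        return chosen;
    }

    int main() {
        quant_state qs;
        qs.n_attention_wv = 4;    // pretend the model has 4 such tensors
        for (int l = 0; l < qs.n_attention_wv; ++l) {
            std::printf("tensor %d -> type %d\n", l, choose_type(qs, 10));
        }
        return 0;
    }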

1 file changed: +56 -34 lines changed


Diff for: llama.cpp

+56-34
@@ -7985,6 +7985,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+        {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8045,20 +8063,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto       tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8067,46 +8086,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8135,6 +8154,8 @@ static ggml_type get_k_quant_type(
         if (nx % QK_K != 0) {
             LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
@@ -8147,6 +8168,7 @@ static ggml_type get_k_quant_type(
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
@@ -8204,6 +8226,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8217,29 +8241,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8306,9 +8324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8434,6 +8450,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
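
For reference, the behavior the new warning summarizes: a tensor can only take a k-quant format when its row length (tensor->ne[0]) is a multiple of the super-block size QK_K (256 by default, 64 in GGML_QKK_64 builds); otherwise get_k_quant_type picks a fallback format and bumps qs.n_fallback, and the block added at the end of llama_model_quantize_internal reports the totals. A standalone sketch of that bookkeeping, under those assumptions and with hypothetical helper names, could look like:

    // Sketch only: the compatibility rule and the aggregated warning,
    // not the actual llama.cpp implementation.
    #include <cstdio>

    static const int QK_K = 256;   // k-quant super-block size (64 in GGML_QKK_64 builds)

    // rows can be split into k-quant super-blocks only if the length divides evenly
    static bool k_quant_compatible(int n_cols) {
        return n_cols % QK_K == 0;
    }

    int main() {
        int n_k_quantized = 0;
        int n_fallback    = 0;

        const int cols[] = { 4096, 11008, 1000 };   // hypothetical tensor row lengths
        for (int nx : cols) {
            if (k_quant_compatible(nx)) {
                ++n_k_quantized;
            } else {
                ++n_fallback;    // such a tensor would get a non-k fallback format
            }
        }

        if (n_fallback > 0) {
            std::printf("WARNING: %d of %d tensor(s) incompatible with k-quants "
                        "and required fallback quantization\n",
                        n_fallback, n_k_quantized + n_fallback);
        }
        return 0;
    }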
