Commit bd6d9e2

llama : allow quantizing k-quants to fall back when tensor size incompatible (#3747)
* Allow quantizing k-quants to fall back when tensor size incompatible
* quantizing: Add warning when tensors were incompatible with k-quants
* Clean up k-quants state passing a bit
1 parent ee1a0ec commit bd6d9e2
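
The patch targets tensors whose row size (ne[0]) is not a multiple of the k-quant super-block size QK_K (256 in the default build; the QK_K == 64 branches below handle the small-block build). Instead of throwing for such tensors, quantization now substitutes a legacy format with a comparable bit budget. The helper below is a minimal standalone restatement of that mapping; the function name is illustrative, and the real logic lives inside get_k_quant_type in llama.cpp.

```cpp
#include <stdexcept>
#include "ggml.h"

// Sketch: pick a non-k-quant fallback with a similar bit budget for a tensor
// that cannot be k-quantized (row size not divisible by QK_K).
static ggml_type k_quant_fallback(ggml_type new_type) {
    switch (new_type) {
        case GGML_TYPE_Q2_K: return GGML_TYPE_Q4_0;
        case GGML_TYPE_Q3_K: return GGML_TYPE_Q4_1;
        case GGML_TYPE_Q4_K: return GGML_TYPE_Q5_0;
        case GGML_TYPE_Q5_K: return GGML_TYPE_Q5_1;
        case GGML_TYPE_Q6_K: return GGML_TYPE_Q8_0;
        default: throw std::runtime_error("unsupported tensor size encountered");
    }
}
```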

1 file changed: +65 -43 lines

Diff for: llama.cpp

@@ -8049,6 +8049,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+        {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
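
The new quantize_state_internal struct bundles the counters that were previously passed to get_k_quant_type as loose int values and pointers. A self-contained toy sketch of the same state-passing pattern (names and body are illustrative, not taken from the patch):

```cpp
#include <string>

// Toy version of the cleanup: one mutable struct reference replaces several
// separate counter arguments and output pointers.
struct quant_state {
    int n_attention_wv    = 0; // totals, counted before quantizing
    int n_feed_forward_w2 = 0;
    int i_attention_wv    = 0; // running indices, advanced per tensor
    int i_feed_forward_w2 = 0;
    int n_k_quantized     = 0; // tensors that kept a k-quant type
    int n_fallback        = 0; // tensors that needed a fallback type
};

// before: choose_type(t, &i_attn, n_attn, &i_ffn, n_ffn)
// after:  choose_type(qs, t) -- the struct carries all bookkeeping
static void choose_type(quant_state & qs, const std::string & tensor_name) {
    if (tensor_name.find("attn_v.weight") != std::string::npos) {
        ++qs.i_attention_wv;
    } else if (tensor_name.find("ffn_down.weight") != std::string::npos) {
        ++qs.i_feed_forward_w2;
    }
}
```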
@@ -8109,20 +8127,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
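
The use_more_bits() heuristic in this hunk is unchanged by the patch, but it consumes the qs.i_*/qs.n_* counters introduced above. As a quick standalone check of what it selects, here is a sketch assuming an arbitrary 32-layer model: it flags the first 4 and last 4 layers plus every third layer in between (offsets 6, 9, ..., 27) for a higher-bit quant type.

```cpp
#include <cstdio>

int main() {
    // Same selection rule as in the hunk above, checked in isolation.
    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    };
    const int num_layers = 32; // assumption: arbitrary example layer count
    for (int i = 0; i < num_layers; ++i) {
        if (use_more_bits(i, num_layers)) {
            printf("layer %d gets more bits\n", i);
        }
    }
    return 0;
}
```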
@@ -8131,46 +8150,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8197,20 +8216,23 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
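
The compatibility test is just the divisibility check in this hunk: with the default QK_K of 256, only tensors whose first dimension is a multiple of 256 can keep a k-quant type. A standalone illustration with arbitrary example sizes (not taken from any particular model):

```cpp
#include <cstdio>

int main() {
    const int QK_K = 256;                                   // assumption: default k-quant super-block size
    const int example_nx[] = { 4096, 11008, 4672, 32001 };  // arbitrary row sizes for illustration
    for (int nx : example_nx) {
        printf("nx = %5d -> %s\n", nx,
               nx % QK_K == 0 ? "k-quant compatible" : "needs fallback quantization");
    }
    return 0;
}
```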
@@ -8268,6 +8290,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8281,29 +8305,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
        }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8370,9 +8388,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8498,6 +8514,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+            __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
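
With this final hunk, a run in which some tensors took the fallback path now ends with a single summary warning built from the format string above; for a hypothetical run with 289 k-quantized tensors and 2 fallbacks it would report "2 of 291 tensor(s) incompatible with k-quants", whereas the old code threw "Unsupported tensor size encountered" for any incompatible tensor other than the output or token-embedding weights.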
