@@ -8049,6 +8049,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+        {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8109,20 +8127,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8131,46 +8150,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8197,20 +8216,23 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
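Aside: the use_more_bits heuristic retained above gives extra precision to the first eighth of the layers, the last eighth, and every third layer in between. The following stand-alone C++ sketch is an illustration only (the layer count of 32 and the printout are assumptions, not taken from the diff) and shows which layers the heuristic selects:

// Minimal sketch of the layer-selection heuristic, compiled on its own.
#include <cstdio>

// Extra bits go to: the first 1/8 of layers, the last 1/8, and every third layer in between.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8) % 3 == 2;
}

int main() {
    const int n_layers = 32;   // assumed layer count, roughly a 7B-class model
    int boosted = 0;
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            printf("layer %2d gets the higher-bit type\n", i);
            ++boosted;
        }
    }
    printf("%d of %d layers boosted\n", boosted, n_layers);
    return 0;
}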
@@ -8268,6 +8290,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8281,29 +8305,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8370,9 +8388,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8498,6 +8514,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
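For readers following the refactor as a whole: the change threads a single quantize_state_internal through llama_model_quantize_internal and get_k_quant_type, so the per-tensor counters and the fallback tally live in one place instead of being passed as separate pointers. The sketch below is a minimal, self-contained approximation of that pattern: quantize_state_internal here is a trimmed-down stand-in, and pick_type is a hypothetical toy selector, not the real get_k_quant_type.

// Sketch of the state-threading pattern, assuming simplified stand-in types.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Trimmed-down stand-in for the state struct added in this change: category totals
// are counted in a first pass, running indices advance while types are chosen,
// and fallbacks are tallied for the final summary.
struct quantize_state_internal {
    int n_attention_wv = 0;
    int i_attention_wv = 0;
    int n_k_quantized  = 0;
    int n_fallback     = 0;
};

// Hypothetical toy selector: give the first half of the attn_v tensors more bits,
// and record a fallback when a tensor's row size is not a multiple of 256.
static const char * pick_type(quantize_state_internal & qs, const std::string & name, int ne0) {
    const char * type = "Q4_K";
    if (name.find("attn_v.weight") != std::string::npos) {
        if (qs.i_attention_wv < qs.n_attention_wv/2) type = "Q5_K";
        ++qs.i_attention_wv;
    }
    if (ne0 % 256 != 0) { type = "Q5_0"; ++qs.n_fallback; } else { ++qs.n_k_quantized; }
    return type;
}

int main() {
    // (name, first dimension) pairs standing in for tensor metadata
    const std::vector<std::pair<std::string, int>> tensors = {
        {"blk.0.attn_v.weight", 4096}, {"blk.1.attn_v.weight", 4096},
        {"blk.2.attn_v.weight", 4096}, {"output.weight", 4000},
    };

    quantize_state_internal qs;
    for (const auto & t : tensors) {   // first pass: count categories
        if (t.first.find("attn_v.weight") != std::string::npos) ++qs.n_attention_wv;
    }
    for (const auto & t : tensors) {   // second pass: choose types with running state
        printf("%-22s -> %s\n", t.first.c_str(), pick_type(qs, t.first, t.second));
    }
    printf("%d of %d tensor(s) required fallback quantization\n",
           qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
    return 0;
}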