@@ -7985,6 +7985,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
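For orientation, here is a minimal sketch of the pattern this hunk introduces, using simplified stand-in types (`Model`, `Params`, `quant_state`, and `pick_type` are hypothetical, not the llama.cpp declarations): the per-run counters are bundled into one state object that is built once and passed by reference, instead of threading several raw pointers and counts through the helper.

```cpp
// Minimal illustration of the "bundled quantization state" pattern.
// Model / Params stand in for llama_model / llama_model_quantize_params.
#include <cstdio>

struct Model  { int n_layer = 32; };        // hypothetical stand-in
struct Params { bool only_copy = false; };  // hypothetical stand-in

struct quant_state {
    const Model  & model;
    const Params * params;
    int i_attention_wv = 0;   // running index, advanced once per attn_v tensor
    int n_fallback     = 0;   // tensors that could not use k-quants

    quant_state(const Model & model, const Params * params)
        : model(model), params(params) {}
};

// The helper mutates the counters through the single state reference.
static void pick_type(quant_state & qs) {
    ++qs.i_attention_wv;
}

int main() {
    Model  model;
    Params params;
    quant_state qs(model, &params);   // constructed once per quantization run
    for (int i = 0; i < model.n_layer; ++i) {
        pick_type(qs);
    }
    std::printf("visited %d attn_v tensors, %d fallbacks\n", qs.i_attention_wv, qs.n_fallback);
    return 0;
}
```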
@@ -8045,20 +8063,21 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
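The `use_more_bits` heuristic in the hunk above gives extra precision to the first and last `num_layers/8` layers plus every third layer in between. A small standalone check, assuming a hypothetical 32-layer model (not a value from this commit), that prints the selected layer indices:

```cpp
// Standalone check of the use_more_bits layer-selection heuristic.
#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int num_layers = 32;           // hypothetical layer count
    for (int i = 0; i < num_layers; ++i) {
        if (use_more_bits(i, num_layers)) {
            std::printf("%d ", i);       // layers that get more bits
        }
    }
    std::printf("\n");
    return 0;
}
```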
@@ -8067,46 +8086,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8135,6 +8154,8 @@ static ggml_type get_k_quant_type(
         if (nx % QK_K != 0) {
             LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
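The new `else` branch above counts a tensor as k-quantized only when its column count is divisible by QK_K; otherwise the fallback path below runs and increments `qs.n_fallback`. A rough sketch of that accounting, assuming QK_K is 256 (the usual default; some builds use 64, as the `QK_K == 64` checks above suggest) and made-up tensor widths:

```cpp
// Illustration of the QK_K divisibility test that decides between
// k-quantization and the fallback path.
#include <cstdio>

int main() {
    const int QK_K = 256;                       // assumed default block size
    const int widths[] = { 4096, 11008, 4000 }; // hypothetical tensor widths
    int n_k_quantized = 0, n_fallback = 0;

    for (int nx : widths) {
        if (nx % QK_K != 0) {
            ++n_fallback;      // would trigger the fallback warning
        } else {
            ++n_k_quantized;   // counted like qs.n_k_quantized
        }
    }
    std::printf("k-quantized: %d, fallback: %d\n", n_k_quantized, n_fallback);
    return 0;
}
```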
@@ -8147,6 +8168,7 @@ static ggml_type get_k_quant_type(
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
@@ -8204,6 +8226,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8217,29 +8241,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8306,9 +8324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8434,6 +8450,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+            __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(