@@ -8307,6 +8307,24 @@ struct no_init {
    no_init() { /* do nothing */ }
};

+ struct quantize_state_internal {
+     const llama_model & model;
+     const llama_model_quantize_params * params;
+ #ifdef GGML_USE_K_QUANTS
+     int n_attention_wv    = 0;
+     int n_feed_forward_w2 = 0;
+     int i_attention_wv    = 0;
+     int i_feed_forward_w2 = 0;
+
+     int n_k_quantized     = 0;
+     int n_fallback        = 0;
+ #endif
+     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+         : model(model)
+         , params(params)
+         {}
+ };
+

static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
@@ -8367,20 +8385,21 @@ static void llama_convert_tensor_internal(

#ifdef GGML_USE_K_QUANTS
static ggml_type get_k_quant_type(
-     ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-     int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+     quantize_state_internal & qs,
+     ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
) {
    const std::string name = ggml_get_name(tensor);
    // TODO: avoid hardcoded tensor names - use the TN_* constants
-     const auto tn = LLM_TN(model.arch);
+     const llm_arch arch = qs.model.arch;
+     const auto tn = LLM_TN(arch);

    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    };

    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
        int nx = tensor->ne[0];
-         if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
        }
        else if (new_type != GGML_TYPE_Q8_0) {
@@ -8389,46 +8408,46 @@ static ggml_type get_k_quant_type(
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-             new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                 use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                 (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-         if (model.type == MODEL_70B) {
+                 (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+         if (qs.model.type == MODEL_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
-         ++*i_attention_wv;
+         ++qs.i_attention_wv;
    } else if (name.find("ffn_down.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-             new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                      : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+             new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-             new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-             if (model.arch == LLM_ARCH_FALCON) {
-                 new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                            use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+             if (arch == LLM_ARCH_FALCON) {
+                 new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
-                 if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
        }
-         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
            new_type = GGML_TYPE_Q5_K;
        }
-         ++*i_feed_forward_w2;
+         ++qs.i_feed_forward_w2;
    } else if (name.find("attn_output.weight") != std::string::npos) {
-         if (model.arch != LLM_ARCH_FALCON) {
+         if (arch != LLM_ARCH_FALCON) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8455,20 +8474,23 @@ static ggml_type get_k_quant_type(
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
-             LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+             LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
+         } else {
+             ++qs.n_k_quantized;
        }
    }
    if (convert_incompatible_tensor) {
-         if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-             new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-             LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-         } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-             new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-             LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-         } else {
-             throw std::runtime_error("Unsupported tensor size encountered\n");
+         switch (new_type) {
+             case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+             case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+             case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+             case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
+         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+         ++qs.n_fallback;
    }

    return new_type;
@@ -8526,6 +8548,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    llm_load_arch(ml, model);
    llm_load_hparams(ml, model);

+     struct quantize_state_internal qs(model, params);
+
    if (params->only_copy) {
        ftype = model.ftype;
    }
@@ -8539,29 +8563,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    gguf_set_val_u32(ctx_out, "general.file_type", ftype);

#ifdef GGML_USE_K_QUANTS
-     int n_attention_wv    = 0;
-     int n_feed_forward_w2 = 0;
-
    for (int i = 0; i < ml.n_tensors; ++i) {
        struct ggml_tensor * meta = ml.get_tensor_meta(i);

        const std::string name = ggml_get_name(meta);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-             ++n_attention_wv;
+             ++qs.n_attention_wv;
        }
        else if (name.find("ffn_down.weight") != std::string::npos) {
-             ++n_feed_forward_w2;
+             ++qs.n_feed_forward_w2;
        }
    }
-     if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+     if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                 __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                 __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
    }
-
-     int i_attention_wv = 0;
-     int i_feed_forward_w2 = 0;
#endif

    size_t total_size_org = 0;
@@ -8628,9 +8646,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        if (quantize) {
            new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
-             new_type = get_k_quant_type(
-                 new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-             );
+             new_type = get_k_quant_type(qs, new_type, tensor, ftype);
#endif
            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
@@ -8756,6 +8772,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            LLAMA_LOG_INFO("\n");
        }
    }
+ #ifdef GGML_USE_K_QUANTS
+     if (qs.n_fallback > 0) {
+         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                 __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+     }
+ #endif
}

static int llama_apply_lora_from_file_internal(