
Commit d13fd96

ikawrakow (Kawrakow) authored and committed
llama : use Q4_K for attn_v for Q2_K_S when n_gqa >= 4 (ggml-org#4996)
Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent d07de0a commit d13fd96

File tree

1 file changed: +6 -1 lines


llama.cpp (+6 -1)
@@ -8477,7 +8477,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
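For illustration only, below is a minimal standalone C++ sketch of the selection logic this commit introduces. The helper pick_attn_v_type and the trimmed-down enums are hypothetical stand-ins, not the actual get_k_quant_type code path (which handles many more tensor names and ftypes); n_gqa is the grouped-query-attention ratio n_head / n_head_kv, as returned by hparams.n_gqa() in llama.cpp.

    // Sketch only: simplified stand-ins for the real llama.cpp enums.
    #include <cstdint>
    #include <cstdio>

    enum ggml_type   { GGML_TYPE_Q3_K, GGML_TYPE_Q4_K };
    enum llama_ftype { LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K_S };

    // Hypothetical helper mirroring the new attn_v.weight branches:
    // - Q2_K:   use Q4_K instead of Q3_K when the GQA ratio is >= 4
    // - Q2_K_S: bump to Q4_K only when the GQA ratio is >= 4
    static ggml_type pick_attn_v_type(llama_ftype ftype, uint32_t n_gqa) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            return n_gqa >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && n_gqa >= 4) {
            return GGML_TYPE_Q4_K;
        }
        return GGML_TYPE_Q3_K; // fallback for this sketch only
    }

    int main() {
        const uint32_t n_head = 32, n_head_kv = 8;   // e.g. a typical GQA model
        const uint32_t n_gqa  = n_head / n_head_kv;  // 4 -> attn_v gets Q4_K
        const ggml_type t = pick_attn_v_type(LLAMA_FTYPE_MOSTLY_Q2_K_S, n_gqa);
        std::printf("attn_v.weight -> %s\n", t == GGML_TYPE_Q4_K ? "Q4_K" : "Q3_K");
        return 0;
    }

A plausible reading of the design choice: with grouped-query attention the V projection is shared across several query heads and is correspondingly small relative to the whole model, so spending extra bits on attn_v adds little to file size while reducing quantization error where it matters.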
