
Commit c48679a

Support attention_bias on LLaMA architecture
Adds QKVO bias; should fix InternLM (ggml-org#3133) and works for LLaMAfied Qwen models (ggml-org#3743 (comment)).
1 parent 8d6d9f0 commit c48679a
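Before the diff, a short standalone sketch of what the change amounts to: each of the Q/K/V/O attention projections gains an optional additive bias, y = W*x + b when a bias vector is present and y = W*x otherwise, which is what the conditional ggml_add calls in the graph-build hunk below do. This sketch is not llama.cpp code and every name in it is hypothetical; it only illustrates the arithmetic.

// Standalone illustration (hypothetical names, not llama.cpp code): each
// attention projection becomes y = W*x + b when a bias vector is present,
// and stays y = W*x when it is not, mirroring the if (model.layers[il].bq)
// style checks added in the graph build.
#include <cstdio>
#include <vector>

static std::vector<float> project(const std::vector<float> & W,    // rows x cols, row-major
                                  const std::vector<float> * bias, // may be nullptr, like a missing bq/bk/bv/bo
                                  const std::vector<float> & x,
                                  int rows, int cols) {
    std::vector<float> y(rows, 0.0f);
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            y[r] += W[r*cols + c]*x[c];
        }
        if (bias) {
            y[r] += (*bias)[r]; // the extra add this commit introduces for Q/K/V/O
        }
    }
    return y;
}

int main() {
    const int n = 2;
    const std::vector<float> W  = {1.0f, 0.0f, 0.0f, 1.0f}; // 2x2 identity weight
    const std::vector<float> bq = {0.5f, -0.5f};            // per-channel bias, e.g. an ATTN_Q "bias" tensor
    const std::vector<float> x  = {1.0f, 2.0f};

    const std::vector<float> q0 = project(W, nullptr, x, n, n); // no bias: 1.0 2.0
    const std::vector<float> q1 = project(W, &bq,     x, n, n); // biased:  1.5 1.5

    printf("no bias: %.1f %.1f\n", q0[0], q0[1]);
    printf("bias:    %.1f %.1f\n", q1[0], q1[1]);
    return 0;
}

In the actual graph the same step is a ggml_mul_mat followed by a conditional ggml_add, as the llm_build_context hunk below shows.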

File tree

1 file changed: llama.cpp (+25 −4 lines changed)
@@ -1248,6 +1248,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
@@ -2781,6 +2784,11 @@ static void llm_load_tensors(
                         layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
@@ -2791,8 +2799,9 @@ static void llm_load_tensors(
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
                                 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bq) +
+                                ggml_nbytes(layer.bk) + ggml_nbytes(layer.bv) + ggml_nbytes(layer.bo) +
+                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
@@ -3891,13 +3900,25 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
-
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -3915,7 +3936,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
