@@ -1248,6 +1248,9 @@ struct llama_layer {
1248
1248
struct ggml_tensor * wqkv;
1249
1249
1250
1250
// attention bias
1251
+ struct ggml_tensor * bq;
1252
+ struct ggml_tensor * bk;
1253
+ struct ggml_tensor * bv;
1251
1254
struct ggml_tensor * bo;
1252
1255
struct ggml_tensor * bqkv;
1253
1256
@@ -2781,6 +2784,11 @@ static void llm_load_tensors(
2781
2784
layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2782
2785
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2783
2786
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2787
+
2788
+ layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
2789
+ layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
2790
+ layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
2791
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
2784
2792
2785
2793
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2786
2794
@@ -2791,8 +2799,9 @@ static void llm_load_tensors(
2791
2799
if (backend == GGML_BACKEND_GPU) {
2792
2800
vram_weights +=
2793
2801
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2794
- ggml_nbytes (layer.wv ) + ggml_nbytes (layer.wo ) + ggml_nbytes (layer.ffn_norm ) +
2795
- ggml_nbytes (layer.ffn_gate ) + ggml_nbytes (layer.ffn_down ) + ggml_nbytes (layer.ffn_up );
2802
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bq) +
2803
+ ggml_nbytes(layer.bk) + ggml_nbytes(layer.bv) + ggml_nbytes(layer.bo) +
2804
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
2796
2805
}
2797
2806
}
2798
2807
} break;
@@ -3891,13 +3900,25 @@ struct llm_build_context {
3891
3900
// compute Q and K and RoPE them
3892
3901
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3893
3902
cb(Qcur, "Qcur", il);
3903
+ if (model.layers[il].bq) {
3904
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
3905
+ cb(Qcur, "Qcur", il);
3906
+ }
3894
3907
3895
3908
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3896
3909
cb(Kcur, "Kcur", il);
3910
+ if (model.layers[il].bk) {
3911
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
3912
+ cb(Kcur, "Kcur", il);
3913
+ }
3897
3914
3898
3915
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3899
3916
cb(Vcur, "Vcur", il);
3900
-
3917
+ if (model.layers[il].bv) {
3918
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
3919
+ cb(Vcur, "Vcur", il);
3920
+ }
3921
+
3901
3922
Qcur = ggml_rope_custom(
3902
3923
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
3903
3924
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -3915,7 +3936,7 @@ struct llm_build_context {
3915
3936
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3916
3937
3917
3938
cur = llm_build_kqv(ctx0, hparams, kv_self,
3918
- model.layers [il].wo , NULL ,
3939
+ model.layers[il].wo, model.layers[il].bo,
3919
3940
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
3920
3941
cb(cur, "kqv_out", il);
3921
3942
}
0 commit comments