@@ -2752,19 +2752,19 @@ static void llm_load_tensors(
         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
 
         layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);
 
         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend_split);
+        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend);
 
         layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
         layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
 
         layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);
 
         layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend_split);
+        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend);
 
         if (backend == GGML_BACKEND_GPU) {
             vram_weights +=
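
The only functional change in the hunk above is the last argument of the bias create_tensor calls: the 1-D bias tensors move from backend_split to backend. A plausible reading (not stated in the diff itself) is that only 2-D weight matrices are candidates for row splitting across devices, while 1-D biases are consumed whole by ggml_add and so belong on the layer's plain backend. The sketch below is a hypothetical, standalone illustration of that rule; the enum and the pick_backend helper are invented names and only loosely mirror ggml's GGML_BACKEND_* constants.

#include <stdio.h>

/* Hypothetical rule: "2-D weights may be row-split, 1-D biases stay on the layer backend". */
enum sketch_backend { SKETCH_CPU, SKETCH_GPU, SKETCH_GPU_SPLIT };

static enum sketch_backend pick_backend(int n_dims, enum sketch_backend layer_backend) {
    if (n_dims == 2 && layer_backend == SKETCH_GPU) {
        return SKETCH_GPU_SPLIT;   // weight matrix: eligible for row splitting across GPUs
    }
    return layer_backend;          // bias vector (or CPU-resident layer): plain backend
}

int main(void) {
    printf("wqkv (2-D) -> %d\n", pick_backend(2, SKETCH_GPU)); // split backend
    printf("bqkv (1-D) -> %d\n", pick_backend(1, SKETCH_GPU)); // plain GPU backend
    return 0;
}
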
@@ -4695,6 +4695,8 @@ static struct ggml_cgraph * llm_build_starcoder(
 
     const float norm_eps = hparams.f_norm_eps;
 
+    const int n_gpu_layers = model.n_gpu_layers;
+
     const int32_t n_tokens = batch.n_tokens;
     const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
@@ -4739,6 +4741,27 @@ static struct ggml_cgraph * llm_build_starcoder(
         }
     }
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
     {
         // Compute position embeddings.
         struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
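
The hunk above gates GPU offloading on n_gpu_layers: per-layer tensors are offloaded once n_gpu_layers reaches n_layer, and the non-repeating, V-related, and K/Q-related tensors each need one extra layer beyond that. The standalone sketch below only illustrates that threshold arithmetic; the layer count and variable names are illustrative, not taken from the patch.

#include <stdio.h>
#include <stdbool.h>

int main(void) {
    const int n_layer = 40; // illustrative transformer layer count
    for (int n_gpu_layers = 38; n_gpu_layers <= 43; ++n_gpu_layers) {
        const int  i_gpu_start = n_layer - n_gpu_layers;     // first layer whose tensors go to the GPU
        const bool offload_nr  = n_gpu_layers > n_layer;      // non-repeating tensors (e.g. output norm)
        const bool offload_v   = n_gpu_layers > n_layer + 1;  // V-related attention tensors
        const bool offload_kq  = n_gpu_layers > n_layer + 2;  // K/Q-related attention tensors
        printf("n_gpu_layers=%2d  i_gpu_start=%3d  nr=%d  v=%d  kq=%d\n",
               n_gpu_layers, i_gpu_start, offload_nr, offload_v, offload_kq);
    }
    return 0;
}
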
@@ -4764,6 +4787,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_set_name(KQ_mask, "KQ_mask");
+    offload_func_kq(KQ_mask);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
@@ -4787,44 +4811,65 @@ static struct ggml_cgraph * llm_build_starcoder(
     ggml_set_name(inpL, "inpL");
 
     for (int il = 0; il < n_layer; ++il) {
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
         {
             // Norm
             cur = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(cur);
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+            offload_func(cur);
         }
 
         {
             // Self Attention
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
 
-            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+            offload_func_kq(cur);
 
-            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+            struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+            ggml_set_name(tmpq, "tmpq");
+            ggml_set_name(tmpk, "tmpk");
+            ggml_set_name(tmpv, "tmpv");
+
+            offload_func_kq(tmpq);
+            offload_func_kq(tmpk);
+            offload_func_v (tmpv);
+
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
             struct ggml_tensor * Kcur = tmpk;
 
             {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-                        0, 2, 1, 3);
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
@@ -4833,23 +4878,28 @@ static struct ggml_cgraph * llm_build_starcoder(
                     ggml_element_size(kv_self.k)*n_embd_gqa,
                     ggml_element_size(kv_self.k)*n_embd_head,
                     ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
             ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
             // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
@@ -4862,22 +4912,25 @@ static struct ggml_cgraph * llm_build_starcoder(
             ggml_set_name(V, "V");
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
         }
 
         // Projection
         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+        offload_func(cur);
 
         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
 
         struct ggml_tensor * inpFF = cur;
 
@@ -4886,27 +4939,36 @@ static struct ggml_cgraph * llm_build_starcoder(
             // Norm
             {
                 cur = ggml_norm(ctx0, inpFF, norm_eps);
+                offload_func_nr(cur);
+
                 cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+                offload_func_nr(cur);
            }
 
            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+           offload_func(cur);
 
            // GELU activation
            cur = ggml_gelu(ctx0, cur);
+           offload_func(cur);
 
            // Projection
            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+           offload_func(cur);
         }
 
         inpL = ggml_add(ctx0, cur, inpFF);
+
     }
 
     // Output Norm
     {
         cur = ggml_norm(ctx0, inpL, norm_eps);
+        offload_func_nr(cur);
+
         cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+        ggml_set_name(cur, "result_norm");
     }
-    ggml_set_name(cur, "result_norm");
 
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
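
For reference, the tmpq/tmpk/tmpv views in the attention hunk slice one fused QKV row of n_embd + 2*n_embd_gqa floats per token: Q starts at byte offset 0, K right after Q, and V right after K, all with row stride cur->nb[1]. The standalone sketch below only reproduces that offset arithmetic; the model sizes are illustrative and not taken from the patch.

#include <stdio.h>

int main(void) {
    const int n_embd     = 6144;  // illustrative model width
    const int n_embd_gqa = 6144;  // equals n_embd when there is no grouped-query attention

    const size_t row_size = sizeof(float)*(n_embd + 2*n_embd_gqa); // matches cur->nb[1] in the graph
    const size_t q_off    = 0;                                     // offset used for the tmpq view
    const size_t k_off    = sizeof(float)* n_embd;                 // offset used for the tmpk view
    const size_t v_off    = sizeof(float)*(n_embd + n_embd_gqa);   // offset used for the tmpv view

    printf("row bytes: %zu  q_off: %zu  k_off: %zu  v_off: %zu\n", row_size, q_off, k_off, v_off);
    return 0;
}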