
Commit 4cd5f2d

enable starcoder gpu offloading
* ggml-org/llama.cpp#3827
1 parent 638c0ff commit 4cd5f2d
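
The change mirrors ggml-org/llama.cpp#3827: every tensor produced inside a StarCoder layer is routed through an offload function that is either llama_nop (leave the result on the CPU) or ggml_cuda_assign_buffers_no_alloc (mark the result as GPU-resident), chosen per layer from n_gpu_layers. The following is a minimal, self-contained C++ sketch of that per-layer selection, for orientation only: the tensor struct and the CUDA assign function are hypothetical stand-ins so the snippet compiles on its own, it is not code from this diff.

#include <cstdio>

// Hypothetical stand-ins for the ggml/llama.cpp types, only so the sketch is self-contained.
struct ggml_tensor { const char * name; bool on_gpu; };

// In llama.cpp an offload function marks a tensor's output buffer as GPU-resident;
// llama_nop is the CPU fallback that does nothing.
typedef void (*offload_func_t)(ggml_tensor * t);

static void llama_nop(ggml_tensor *) {}                  // CPU path: leave the tensor where it is
static void fake_cuda_assign_buffers(ggml_tensor * t) {  // stand-in for ggml_cuda_assign_buffers_no_alloc
    t->on_gpu = true;
}

int main() {
    const int n_layer      = 40;   // example layer count (roughly StarCoder-15B scale)
    const int n_gpu_layers = 20;   // user-requested number of layers to offload

    // Same threshold the diff computes: layers at or above i_gpu_start go to the GPU.
    const int i_gpu_start = n_layer - n_gpu_layers;

    for (int il = 0; il < n_layer; ++il) {
        offload_func_t offload_func = llama_nop;
        if (il >= i_gpu_start) {
            offload_func = fake_cuda_assign_buffers;
        }

        ggml_tensor cur = { "attn_norm", false };
        offload_func(&cur);   // in the real graph this is applied after each op in the layer

        std::printf("layer %2d -> %s\n", il, cur.on_gpu ? "GPU" : "CPU");
    }
    return 0;
}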

File tree

1 file changed: +81 -19 lines changed


Diff for: Sources/llmfarm_core_cpp/llama/llama.cpp

@@ -2752,19 +2752,19 @@ static void llm_load_tensors(
                     layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
 
                     layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
 
                     layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
 
                     layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                     layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
 
                     layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
 
                     layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
 
                     if (backend == GGML_BACKEND_GPU) {
                         vram_weights +=
@@ -4695,6 +4695,8 @@ static struct ggml_cgraph * llm_build_starcoder(
 
     const float norm_eps = hparams.f_norm_eps;
 
+    const int n_gpu_layers = model.n_gpu_layers;
+
     const int32_t n_tokens = batch.n_tokens;
     const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
@@ -4739,6 +4741,27 @@ static struct ggml_cgraph * llm_build_starcoder(
         }
     }
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
     {
         // Compute position embeddings.
         struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -4764,6 +4787,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_set_name(KQ_mask, "KQ_mask");
+    offload_func_kq(KQ_mask);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
@@ -4787,44 +4811,65 @@ static struct ggml_cgraph * llm_build_starcoder(
     ggml_set_name(inpL, "inpL");
 
     for (int il = 0; il < n_layer; ++il) {
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
         {
             // Norm
             cur = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(cur);
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+            offload_func(cur);
         }
 
         {
             // Self Attention
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
 
-            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+            offload_func_kq(cur);
 
-            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+            struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+            ggml_set_name(tmpq, "tmpq");
+            ggml_set_name(tmpk, "tmpk");
+            ggml_set_name(tmpv, "tmpv");
+
+            offload_func_kq(tmpq);
+            offload_func_kq(tmpk);
+            offload_func_v(tmpv);
+
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
             struct ggml_tensor * Kcur = tmpk;
 
             {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-                        0, 2, 1, 3);
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
@@ -4833,23 +4878,28 @@ static struct ggml_cgraph * llm_build_starcoder(
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
             ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
             // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
@@ -4862,22 +4912,25 @@ static struct ggml_cgraph * llm_build_starcoder(
             ggml_set_name(V, "V");
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
         }
 
         // Projection
         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+        offload_func(cur);
 
         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
 
         struct ggml_tensor * inpFF = cur;
 
@@ -4886,27 +4939,36 @@ static struct ggml_cgraph * llm_build_starcoder(
             // Norm
             {
                 cur = ggml_norm(ctx0, inpFF, norm_eps);
+                offload_func_nr(cur);
+
                 cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+                offload_func_nr(cur);
             }
 
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+            offload_func(cur);
 
             // GELU activation
            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
 
             // Projection
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+            offload_func(cur);
         }
 
         inpL = ggml_add(ctx0, cur, inpFF);
+
     }
 
     // Output Norm
     {
         cur = ggml_norm(ctx0, inpL, norm_eps);
+        offload_func_nr(cur);
+
         cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+        ggml_set_name(cur, "result_norm");
     }
-    ggml_set_name(cur, "result_norm");
 
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
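
A note on the three n_gpu_layers > n_layer checks added at the top of llm_build_starcoder: requesting more layers than the model has is how llama.cpp encodes extra offload stages, so one level past n_layer additionally offloads the non-repeating tensors (offload_func_nr), the next level the V path (offload_func_v), and the next the K/Q path (offload_func_kq). The short standalone C++ sketch below just evaluates those thresholds for a few example values; offload_plan and make_plan are illustrative names, not llama.cpp API, and the layer count is an assumption.

#include <cstdio>

// Illustrative only: offload_plan and make_plan are hypothetical helpers.
// The thresholds copy the checks added at the top of llm_build_starcoder in this commit.
struct offload_plan {
    bool some_layers;     // at least one repeating layer offloaded
    bool non_repeating;   // offload_func_nr active (output norm)
    bool v_tensors;       // offload_func_v active
    bool kq_tensors;      // offload_func_kq active
};

static offload_plan make_plan(int n_gpu_layers, int n_layer) {
    offload_plan p{};
    p.some_layers   = n_gpu_layers > 0;
    p.non_repeating = n_gpu_layers > n_layer;       // if (n_gpu_layers > n_layer)
    p.v_tensors     = n_gpu_layers > n_layer + 1;   // if (n_gpu_layers > n_layer + 1)
    p.kq_tensors    = n_gpu_layers > n_layer + 2;   // if (n_gpu_layers > n_layer + 2)
    return p;
}

int main() {
    const int n_layer = 40;   // assumed layer count, roughly StarCoder-15B scale
    const int tests[] = {0, 20, 40, 41, 42, 43};
    for (int n_gpu_layers : tests) {
        const offload_plan p = make_plan(n_gpu_layers, n_layer);
        std::printf("n_gpu_layers=%2d: layers=%d nr=%d v=%d kq=%d\n",
                    n_gpu_layers, p.some_layers, p.non_repeating, p.v_tensors, p.kq_tensors);
    }
    return 0;
}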
