@@ -1228,8 +1228,8 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab   vocab;

-    struct ggml_tensor * tok_embeddings;
-    struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_embd;
+    struct ggml_tensor * pos_embd;
     struct ggml_tensor * tok_norm;
     struct ggml_tensor * tok_norm_b;

@@ -2484,7 +2484,7 @@ static void llm_load_tensors(
             case LLM_ARCH_LLAMA:
             case LLM_ARCH_REFACT:
                 {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                     // output
                     {
@@ -2552,7 +2552,7 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_BAICHUAN:
                 {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                     {
                         ggml_backend_type backend_norm;
                         ggml_backend_type backend_output;
@@ -2620,7 +2620,7 @@ static void llm_load_tensors(
                 {
                     // TODO: CPU-only for now

-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                     // output
                     {
@@ -2696,8 +2696,8 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_STARCODER:
                 {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-                    model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);

                     // output
                     {
@@ -2775,7 +2775,7 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_PERSIMMON:
                 {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                     {
                         ggml_backend_type backend_norm;
@@ -2838,9 +2838,9 @@ static void llm_load_tensors(
                 {
                     // TODO: CPU-only for now

-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-                    model.tok_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
-                    model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, GGML_BACKEND_CPU);
+                    model.tok_embd   = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
+                    model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, GGML_BACKEND_CPU);

                     // output
                     {
@@ -2918,7 +2918,7 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_MPT:
                 {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                     // output
                     {
@@ -3099,6 +3099,31 @@ enum llm_rope_type {
     LLM_ROPE_GLM,
 };

+static struct ggml_tensor * llm_build_inp_embd(
+        struct ggml_context * ctx,
+          const llama_batch & batch,
+         struct ggml_tensor * tok_embd,
+                    int64_t   n_embd,
+                    int32_t   n_tokens,
+         const llm_build_cb & cb) {
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        cb(inp_tokens, "inp_tokens", -1);
+
+        inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+    }
+
+    return inpL;
+}
+
 // Persimmon: n_rot = n_embd_head/2
 // Other:     n_rot = n_embd_head
 static void llm_build_k_shift(
@@ -3463,18 +3488,7 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
     cb(inpL, "inp_embd", -1);

     // inp_pos - contains the positions
@@ -3619,18 +3633,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
     cb(inpL, "inp_embd", -1);

     // inp_pos - contains the positions
@@ -3789,18 +3792,7 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
     cb(inpL, "inp_embd", -1);

     // inp_pos - contains the positions
@@ -3953,23 +3945,11 @@ static struct ggml_cgraph * llm_build_starcoder(
     ggml_cgraph * gf = ggml_new_graph(ctx0);

     struct ggml_tensor * cur;
-    struct ggml_tensor * embd;
     struct ggml_tensor * pos;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
-    cb(embd, "inp_embd", -1);
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+    cb(inpL, "inp_embd", -1);

     // inp_pos - contains the positions
     struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -3983,10 +3963,10 @@ static struct ggml_cgraph * llm_build_starcoder(
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     cb(KQ_mask, "KQ_mask", -1);

-    pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos);
+    pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
     cb(pos, "pos_embd", -1);

-    inpL = ggml_add(ctx0, embd, pos);
+    inpL = ggml_add(ctx0, inpL, pos);
     cb(inpL, "inpL", -1);

     for (int il = 0; il < n_layer; ++il) {
@@ -4108,14 +4088,7 @@ static struct ggml_cgraph * llm_build_persimmon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
     cb(inpL, "imp_embd", -1);

     struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -4358,18 +4331,7 @@ static struct ggml_cgraph * llm_build_refact(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
     cb(inpL, "inp_embd", -1);

     // KQ_scale
@@ -4499,22 +4461,10 @@ static struct ggml_cgraph * llm_build_bloom(
     ggml_cgraph * gf = ggml_new_graph(ctx0);

     struct ggml_tensor * cur;
-    struct ggml_tensor * embd;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
-    cb(embd, "inp_embd", -1);
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+    cb(inpL, "inp_embd", -1);

     // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -4524,7 +4474,7 @@ static struct ggml_cgraph * llm_build_bloom(
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     cb(KQ_mask, "KQ_mask", -1);

-    inpL = llm_build_norm(ctx0, embd,
+    inpL = llm_build_norm(ctx0, inpL,
             model.tok_norm,
             model.tok_norm_b,
             LLM_NORM, norm_eps, cb, -1);
@@ -4648,18 +4598,7 @@ static struct ggml_cgraph * llm_build_mpt(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
     cb(inpL, "inp_embd", -1);

     // KQ_scale
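For reference: llm_build_inp_embd factors out the input-embedding setup that every llm_build_* graph above previously duplicated inline. When batch.token is set, the helper gathers the matching rows of the token-embedding matrix; otherwise it allocates an empty F32 tensor of shape [n_embd, n_tokens], which the caller is expected to fill with precomputed embeddings (presumably from batch.embd; the MPI build asserts since that path is not implemented there). A minimal usage sketch, not part of the diff, assuming the locals every builder already has in scope (ctx0, batch, model, n_embd, n_tokens, cb):

    // Token path:     inpL = rows of model.tok_embd selected by batch.token
    // Embedding path: inpL = fresh F32 tensor of shape [n_embd, n_tokens]
    struct ggml_tensor * inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "inp_embd", -1); // name the node so the build callback can track it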