
Commit a3eefe9

llama : model loading
1 parent d38e41e commit a3eefe9

File tree

1 file changed: +45 -6 lines changed


llama.cpp

+45 -6
@@ -338,10 +338,14 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_ROT_EMBD,
+    LLM_TENSOR_FFN_GATE_INP,
+    LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
-    LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_DOWN_EXP,
+    LLM_TENSOR_FFN_GATE_EXP,
+    LLM_TENSOR_FFN_UP_EXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
 };
@@ -360,10 +364,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
             { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,  "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,  "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,    "blk.%d.ffn_up.%d" },
         },
     },
     {
@@ -585,6 +593,10 @@ struct LLM_TN {
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
+
+    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+    }
 };

 //
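
For reference, a minimal usage sketch of the new four-argument LLM_TN overload. Only the call form tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x) appears in this commit; constructing tn from an architecture id, as shown below, is an assumption based on how the name helper is used elsewhere in llm_load_tensors.

    // hypothetical standalone usage (C++), expanding "blk.%d.ffn_gate.%d" from the table above
    const LLM_TN tn(LLM_ARCH_LLAMA);                                      // assumed constructor
    const std::string name = tn(LLM_TENSOR_FFN_GATE_EXP, "weight", 2, 5); // -> "blk.2.ffn_gate.5.weight"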
@@ -1268,6 +1280,12 @@ struct llama_layer {
     struct ggml_tensor * ffn_down; // w2
     struct ggml_tensor * ffn_up;   // w3

+    // ff MoE
+    struct ggml_tensor * ffn_gate_inp;
+    struct ggml_tensor * ffn_gate_exp[8];
+    struct ggml_tensor * ffn_down_exp[8];
+    struct ggml_tensor * ffn_up_exp[8];
+
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
@@ -3025,9 +3043,20 @@ static void llm_load_tensors(

         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

-        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
-        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
-        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+        layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+
+        if (layer.ffn_gate_inp == nullptr) {
+            layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+            layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+            layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+        } else {
+            // MoE branch
+            for (int x = 0; x < 8; ++x) {
+                layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff}, backend_split);
+                layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd}, backend_split);
+                layer.ffn_up_exp[x]   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff}, backend_split);
+            }
+        }

         if (backend == GGML_BACKEND_GPU) {
             vram_weights +=
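
To make the loading logic above concrete, here is a small self-contained sketch (plain C++, independent of llama.cpp) that prints the tensor names the MoE branch requests for one layer, using the format strings from LLM_TENSOR_NAMES. The expert count of 8 mirrors the hard-coded loop bound in this commit, and the router tensor appears to be requested as optional (the trailing `false`), so its absence leaves ffn_gate_inp as nullptr and selects the dense FFN path instead.

    #include <cstdio>

    int main() {
        const int i = 0;          // example layer index
        const int n_expert = 8;   // hard-coded expert count, matching the loop above

        // router tensor, probed first; if missing, the loader uses the dense FFN path
        printf("blk.%d.ffn_gate_inp.weight\n", i);

        // per-expert FFN tensors requested by the MoE branch
        for (int x = 0; x < n_expert; ++x) {
            printf("blk.%d.ffn_gate.%d.weight\n", i, x);
            printf("blk.%d.ffn_down.%d.weight\n", i, x);
            printf("blk.%d.ffn_up.%d.weight\n", i, x);
        }
        return 0;
    }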
@@ -3037,8 +3066,18 @@
                 (layer.bk ? ggml_nbytes(layer.bk) : 0) +
                 (layer.bv ? ggml_nbytes(layer.bv) : 0) +
                 (layer.bo ? ggml_nbytes(layer.bo) : 0) +
-                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
-                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                ggml_nbytes(layer.ffn_norm);
+
+            if (layer.ffn_gate_inp == nullptr) {
+                vram_weights +=
+                    ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+            } else {
+                vram_weights += ggml_nbytes(layer.ffn_gate_inp);
+                for (int x = 0; x < 8; ++x) {
+                    vram_weights +=
+                        ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
+                }
+            }
         }
     }
 } break;
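
A rough back-of-the-envelope sketch of what the per-expert tensors add to the VRAM accounting above. The dimensions and weight size below are assumptions for illustration only and are not part of this commit; the per-expert shapes (three n_embd x n_ff matrices for gate, down, up, plus the {n_embd} router tensor) follow the create_tensor calls in the MoE branch, while the real accounting uses ggml_nbytes(), which also includes quantization block overhead that this sketch ignores.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed example dimensions, not taken from the commit
        const int64_t n_embd   = 4096;
        const int64_t n_ff     = 14336;
        const int64_t n_expert = 8;            // matches the hard-coded 8 in the loader
        const double  bytes_per_weight = 0.5;  // e.g. a ~4-bit quantization, overhead ignored

        // each expert contributes three n_embd x n_ff matrices (gate, down, up)
        const int64_t weights_per_expert = 3 * n_embd * n_ff;
        const double  moe_bytes = n_expert * weights_per_expert * bytes_per_weight
                                + n_embd * bytes_per_weight;   // plus the {n_embd} router tensor

        printf("approx. expert FFN bytes per layer: %.1f MiB\n", moe_bytes / (1024.0 * 1024.0));
        return 0;
    }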

0 commit comments
