 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
-#define LLAMA_MAX_NODES 8192
+#define LLAMA_MAX_NODES   8192
+#define LLAMA_MAX_EXPERTS 8
 
 //
 // logging
@@ -231,6 +232,8 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
+    LLM_KV_EXPERT_COUNT,
+    LLM_KV_EXPERT_USED_COUNT,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -281,6 +284,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
     { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1176,6 +1181,8 @@ struct llama_hparams {
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_ff;
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1190,15 +1197,18 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
-        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_head != other.n_head) return true;
-        if (this->n_head_kv != other.n_head_kv) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_ff != other.n_ff) return true;
+        if (this->vocab_only    != other.vocab_only)    return true;
+        if (this->n_vocab       != other.n_vocab)       return true;
+        if (this->n_ctx_train   != other.n_ctx_train)   return true;
+        if (this->n_embd        != other.n_embd)        return true;
+        if (this->n_head        != other.n_head)        return true;
+        if (this->n_head_kv     != other.n_head_kv)     return true;
+        if (this->n_layer       != other.n_layer)       return true;
+        if (this->n_rot         != other.n_rot)         return true;
+        if (this->n_ff          != other.n_ff)          return true;
+        if (this->n_expert      != other.n_expert)      return true;
+        if (this->n_expert_used != other.n_expert_used) return true;
+
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1282,9 +1292,9 @@ struct llama_layer {
 
     // ff MoE
     struct ggml_tensor * ffn_gate_inp;
-    struct ggml_tensor * ffn_gate_exp[8];
-    struct ggml_tensor * ffn_down_exp[8];
-    struct ggml_tensor * ffn_up_exp[8];
+    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_exp  [LLAMA_MAX_EXPERTS];
 
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
@@ -2458,6 +2468,16 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
     ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+    if (hparams.n_expert > 0) {
+        GGML_ASSERT(hparams.n_expert_used > 0);
+    } else {
+        GGML_ASSERT(hparams.n_expert_used == 0);
+    }
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
@@ -2889,6 +2909,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n", __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: n_expert         = %u\n", __func__, hparams.n_expert);
+    LLAMA_LOG_INFO("%s: n_expert_used    = %u\n", __func__, hparams.n_expert_used);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3046,10 +3068,16 @@ static void llm_load_tensors(
         layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
 
         if (layer.ffn_gate_inp == nullptr) {
+            GGML_ASSERT(hparams.n_expert == 0);
+            GGML_ASSERT(hparams.n_expert_used == 0);
+
             layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
             layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
             layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
         } else {
+            GGML_ASSERT(hparams.n_expert > 0);
+            GGML_ASSERT(hparams.n_expert_used > 0);
+
             // MoE branch
             for (int x = 0; x < 8; ++x) {
                 layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
@@ -3073,7 +3101,7 @@ static void llm_load_tensors(
                 ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
             } else {
                 vram_weights += ggml_nbytes(layer.ffn_gate_inp);
-                for (int x = 0; x < 8; ++x) {
+                for (uint32_t x = 0; x < hparams.n_expert; ++x) {
                     vram_weights +=
                         ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]);
                 }
@@ -4058,6 +4086,8 @@ struct llm_build_context {
     const int64_t n_head_kv;
     const int64_t n_embd_head;
     const int64_t n_embd_gqa;
+    const int64_t n_expert;
+    const int64_t n_expert_used;
 
     const float freq_base;
     const float freq_scale;
@@ -4099,6 +4129,8 @@ struct llm_build_context {
         n_head_kv     (hparams.n_head_kv),
         n_embd_head   (hparams.n_embd_head()),
         n_embd_gqa    (hparams.n_embd_gqa()),
+        n_expert      (hparams.n_expert),
+        n_expert_used (hparams.n_expert_used),
         freq_base     (cparams.rope_freq_base),
         freq_scale    (cparams.rope_freq_scale),
         ext_factor    (cparams.yarn_ext_factor),
@@ -4242,25 +4274,21 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, il);
         cb(cur, "ffn_norm", il);
 
-        // TODO: param
-        const int n_experts = 8;
-        const int n_experts_per_tok = 2;
-
         ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
         cb(logits, "ffn_moe_logits", il);
 
         ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
         cb(probs, "ffn_moe_probs", il);
 
         // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
+        ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
         cb(selected_experts->src[0], "ffn_moe_argsort", il);
 
         ggml_tensor * weights = ggml_get_rows(ctx0,
-                ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts);
+                ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
         cb(weights, "ffn_moe_weights", il);
 
-        weights = ggml_reshape_2d(ctx0, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok]
+        weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
 
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
         cb(weights_sum, "ffn_moe_weights_sum", il);
@@ -4271,18 +4299,13 @@ struct llm_build_context {
         // compute expert outputs
         ggml_tensor * moe_out = nullptr;
 
-        for (int i = 0; i < n_experts_per_tok; ++i) {
+        for (int i = 0; i < n_expert_used; ++i) {
             ggml_tensor * cur_expert;
 
-            // TODO: fix
-            ggml_tensor ** ffn_up_exp   = (ggml_tensor **) model.layers[il].ffn_up_exp;
-            ggml_tensor ** ffn_gate_exp = (ggml_tensor **) model.layers[il].ffn_gate_exp;
-            ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp;
-
-            ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur);
+            ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
             cb(cur_up, "ffn_moe_up", il);
 
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur);
+            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
             cb(cur_gate, "ffn_moe_gate", il);
 
             cur_gate = ggml_silu(ctx0, cur_gate);
@@ -4291,7 +4314,7 @@ struct llm_build_context {
             cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
             cb(cur_expert, "ffn_moe_gate_par", il);
 
-            cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+            cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
             cb(cur_expert, "ffn_moe_down", il);
 
             cur_expert = ggml_mul(ctx0, cur_expert,
@@ -8192,11 +8215,9 @@ static void llama_convert_tensor_internal(
     workers.clear();
 }
 
-static ggml_type get_k_quant_type(
-    quantize_state_internal & qs,
-    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
-) {
+static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
+
     // TODO: avoid hardcoded tensor names - use the TN_* constants
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);
@@ -8230,7 +8251,18 @@ static ggml_type get_k_quant_type(
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
         ++qs.i_attention_wv;
+    } else if (name.find("attn_k.weight") != std::string::npos) {
+        if (qs.model.hparams.n_expert == 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
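
Note (not part of the commit): the MoE routing that the build_llama hunks above assemble with ggml ops reduces to a softmax over the router logits, top-k selection of n_expert_used out of n_expert experts, and renormalization of the selected weights before the expert FFN outputs are mixed. The standalone C++ sketch below walks through that arithmetic for a single token; the logit values are invented for illustration, and only n_expert = 8 and n_expert_used = 2 mirror the Mixtral-style configuration targeted by the diff.

// moe_routing_sketch.cpp - illustration only, independent of ggml/llama.cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert      = 8; // matches LLAMA_MAX_EXPERTS in the patch
    const int n_expert_used = 2; // experts mixed per token

    // hypothetical router logits for one token (ffn_gate_inp * cur in the graph)
    std::vector<float> logits = {0.1f, 2.3f, -1.0f, 0.7f, 1.9f, -0.4f, 0.0f, 0.5f};

    // softmax -> probs (ggml_soft_max)
    const float max_l = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum_exp = 0.0f;
    for (int e = 0; e < n_expert; ++e) {
        probs[e] = std::exp(logits[e] - max_l);
        sum_exp += probs[e];
    }
    for (float & p : probs) {
        p /= sum_exp;
    }

    // top-k expert indices by probability (ggml_top_k / ffn_moe_argsort)
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // gather the selected weights and renormalize (ggml_get_rows, then divide by weights_sum)
    float weights_sum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        weights_sum += probs[idx[i]];
    }
    for (int i = 0; i < n_expert_used; ++i) {
        const float w = probs[idx[i]] / weights_sum;
        // in the graph, expert idx[i]'s SiLU-gated FFN output is scaled by w and summed into moe_out
        std::printf("expert %d: weight %.3f\n", idx[i], w);
    }
    return 0;
}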