@@ -23,6 +23,7 @@ class TensorNameMap:
             "model.embedding",           # mamba-qbert
             "backbone.embedding",        # mamba
             "backbone.embeddings",       # mamba-hf
+            "transformer.in_out_embed",  # Grok
         ),
 
         # Token type embeddings
@@ -66,6 +67,7 @@ class TensorNameMap:
             "lm_head.ln",            # phi2
             "model.norm_f",          # mamba-qbert
             "backbone.norm_f",       # mamba
+            "transformer.rms_norm",  # Grok
         ),
 
         # Rope frequencies
@@ -93,6 +95,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention_norm",         # internlm2
             "model.layers.{bid}.norm",                   # mamba-qbert
             "backbone.layers.{bid}.norm",                # mamba
+            "transformer.decoder_layer.{bid}.rms_norm",  # Grok
         ),
 
         # Attention norm 2
@@ -116,32 +119,35 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",        # llama-hf
-            "layers.{bid}.attention.wq",                  # llama-pth
-            "encoder.layer.{bid}.attention.self.query",   # bert
-            "transformer.h.{bid}.attn.q_proj",            # gpt-j
-            "model.layers.layers.{bid}.self_attn.q_proj", # plamo
-            "model.layers.{bid}.attention.wq"             # internlm2
+            "model.layers.{bid}.self_attn.q_proj",                        # llama-hf
+            "layers.{bid}.attention.wq",                                  # llama-pth
+            "encoder.layer.{bid}.attention.self.query",                   # bert
+            "transformer.h.{bid}.attn.q_proj",                            # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj",                 # plamo
+            "model.layers.{bid}.attention.wq",                            # internlm2
+            "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
         ),
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",        # llama-hf
-            "layers.{bid}.attention.wk",                  # llama-pth
-            "encoder.layer.{bid}.attention.self.key",     # bert
-            "transformer.h.{bid}.attn.k_proj",            # gpt-j
-            "model.layers.layers.{bid}.self_attn.k_proj", # plamo
-            "model.layers.{bid}.attention.wk"             # internlm2
+            "model.layers.{bid}.self_attn.k_proj",                      # llama-hf
+            "layers.{bid}.attention.wk",                                # llama-pth
+            "encoder.layer.{bid}.attention.self.key",                   # bert
+            "transformer.h.{bid}.attn.k_proj",                          # gpt-j
+            "model.layers.layers.{bid}.self_attn.k_proj",               # plamo
+            "model.layers.{bid}.attention.wk",                          # internlm2
+            "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
         ),
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",        # llama-hf
-            "layers.{bid}.attention.wv",                  # llama-pth
-            "encoder.layer.{bid}.attention.self.value",   # bert
-            "transformer.h.{bid}.attn.v_proj",            # gpt-j
-            "model.layers.layers.{bid}.self_attn.v_proj", # plamo
-            "model.layers.{bid}.attention.wv"             # internlm2
+            "model.layers.{bid}.self_attn.v_proj",                        # llama-hf
+            "layers.{bid}.attention.wv",                                  # llama-pth
+            "encoder.layer.{bid}.attention.self.value",                   # bert
+            "transformer.h.{bid}.attn.v_proj",                            # gpt-j
+            "model.layers.layers.{bid}.self_attn.v_proj",                 # plamo
+            "model.layers.{bid}.attention.wv",                            # internlm2
+            "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
         ),
 
         # Attention output
@@ -162,12 +168,14 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.o_proj",                 # plamo
             "model.layers.{bid}.attention.wo",                            # internlm2
             "encoder.layers.{bid}.attn.out_proj",                         # nomic-bert
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear" # Grok
         ),
 
         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm",       # bert
             "encoder.layers.{bid}.norm1",                 # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
         ),
 
         # Rotary embeddings
@@ -190,11 +198,13 @@ class TensorNameMap:
             "model.layers.{bid}.ln2",                     # yi
             "h.{bid}.ln_2",                               # gpt2
             "model.layers.{bid}.ffn_norm",                # internlm2
+            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
+            "transformer.decoder_layer.{bid}.router"      # Grok
         ),
 
         # Feed-forward up
@@ -223,6 +233,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v",   # Grok
         ),
 
         # AWQ-activation gate
@@ -243,6 +254,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear"      # Grok
         ),
 
         # Feed-forward down
@@ -270,6 +282,8 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1",   # Grok
+
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
@@ -287,8 +301,9 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.LAYER_OUT_NORM: (
-            "encoder.layer.{bid}.output.LayerNorm", # bert
-            "encoder.layers.{bid}.norm2",           # nomic-bert
+            "encoder.layer.{bid}.output.LayerNorm",       # bert
+            "encoder.layers.{bid}.norm2",                 # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
         ),
 
         MODEL_TENSOR.SSM_IN: (
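Note on how these entries are consumed: each string is a name template keyed by a canonical GGUF tensor kind, with `{bid}` standing for the block (layer) index and `{xid}` for the expert index, so the added Grok lines let the converter recognise checkpoint tensors such as `transformer.decoder_layer.3.moe.7.linear_v`. The snippet below is only a minimal, self-contained sketch of that template-expansion lookup; `GROK_TEMPLATES` and `resolve_tensor_name` are names invented for illustration, not the actual `TensorNameMap` API.

```python
# Minimal sketch of the template-expansion idea behind these mappings.
# GROK_TEMPLATES and resolve_tensor_name are illustrative names only,
# not the actual gguf-py API.

# canonical tensor kind -> Grok-specific name template
# ({bid} = block/layer index, {xid} = expert index)
GROK_TEMPLATES = {
    "attn_norm":    "transformer.decoder_layer.{bid}.rms_norm",
    "attn_q":       "transformer.decoder_layer.{bid}.multi_head_attention.query",
    "ffn_gate_inp": "transformer.decoder_layer.{bid}.router",
    "ffn_up_exp":   "transformer.decoder_layer.{bid}.moe.{xid}.linear_v",
}


def resolve_tensor_name(raw_name: str, n_blocks: int, n_experts: int) -> str | None:
    """Map a raw checkpoint tensor name to a canonical kind plus indices."""
    for kind, template in GROK_TEMPLATES.items():
        for bid in range(n_blocks):
            if "{xid}" in template:
                for xid in range(n_experts):
                    if template.format(bid=bid, xid=xid) == raw_name:
                        return f"{kind}.{bid}.{xid}"
            elif template.format(bid=bid) == raw_name:
                return f"{kind}.{bid}"
    return None


# e.g. an expert tensor from block 3, expert 7 of a hypothetical Grok checkpoint
print(resolve_tensor_name("transformer.decoder_layer.3.moe.7.linear_v",
                          n_blocks=64, n_experts=8))  # -> ffn_up_exp.3.7
```

A real implementation would typically precompute the expanded names into a dictionary once instead of re-formatting the templates on every lookup, but the mapping idea is the same.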