@@ -10,7 +10,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",  # gptneox
-            "transformer.wte",  # gpt2 gpt-j mpt refact qwen
+            "transformer.wte",  # gpt2 gpt-j mpt refact qwen dbrx
             "transformer.word_embeddings",  # falcon
             "word_embeddings",  # bloom
             "model.embed_tokens",  # llama-hf
@@ -48,7 +48,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",  # gptneox
-            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba
+            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
             "output",  # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",  # phi2
@@ -60,7 +60,7 @@ class TensorNameMap:
             "transformer.ln_f",  # gpt2 gpt-j falcon
             "model.norm",  # llama-hf baichuan internlm2
             "norm",  # llama-pth
-            "transformer.norm_f",  # mpt
+            "transformer.norm_f",  # mpt dbrx
             "ln_f",  # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
             "model.final_layernorm",  # persimmon
@@ -96,6 +96,7 @@ class TensorNameMap:
             "model.layers.{bid}.norm",  # mamba-qbert
             "backbone.layers.{bid}.norm",  # mamba
             "transformer.decoder_layer.{bid}.rms_norm",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",  # dbrx
         ),

         # Attention norm 2
@@ -108,6 +109,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value",  # gptneox
             "transformer.h.{bid}.attn.c_attn",  # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv",  # mpt
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",  # dbrx
             "transformer.h.{bid}.self_attention.query_key_value",  # falcon
             "h.{bid}.self_attention.query_key_value",  # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
@@ -152,30 +154,32 @@ class TensorNameMap:

         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",  # gptneox
-            "transformer.h.{bid}.attn.c_proj",  # gpt2 refact qwen
-            "transformer.blocks.{bid}.attn.out_proj",  # mpt
-            "transformer.h.{bid}.self_attention.dense",  # falcon
-            "h.{bid}.self_attention.dense",  # bloom
-            "model.layers.{bid}.self_attn.o_proj",  # llama-hf
-            "layers.{bid}.attention.wo",  # llama-pth
-            "encoder.layer.{bid}.attention.output.dense",  # bert
-            "transformer.h.{bid}.attn.out_proj",  # gpt-j
-            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
-            "model.layers.{bid}.self_attn.dense",  # persimmon
-            "h.{bid}.attn.c_proj",  # gpt2
-            "transformer.h.{bid}.mixer.out_proj",  # phi2
-            "model.layers.layers.{bid}.self_attn.o_proj",  # plamo
-            "model.layers.{bid}.attention.wo",  # internlm2
-            "encoder.layers.{bid}.attn.out_proj",  # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear"  # Grok
+            "gpt_neox.layers.{bid}.attention.dense",  # gptneox
+            "transformer.h.{bid}.attn.c_proj",  # gpt2 refact qwen
+            "transformer.blocks.{bid}.attn.out_proj",  # mpt
+            "transformer.h.{bid}.self_attention.dense",  # falcon
+            "h.{bid}.self_attention.dense",  # bloom
+            "model.layers.{bid}.self_attn.o_proj",  # llama-hf
+            "layers.{bid}.attention.wo",  # llama-pth
+            "encoder.layer.{bid}.attention.output.dense",  # bert
+            "transformer.h.{bid}.attn.out_proj",  # gpt-j
+            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
+            "model.layers.{bid}.self_attn.dense",  # persimmon
+            "h.{bid}.attn.c_proj",  # gpt2
+            "transformer.h.{bid}.mixer.out_proj",  # phi2
+            "model.layers.layers.{bid}.self_attn.o_proj",  # plamo
+            "model.layers.{bid}.attention.wo",  # internlm2
+            "encoder.layers.{bid}.attn.out_proj",  # nomic-bert
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",  # dbrx
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm1",  # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),

         # Rotary embeddings
@@ -202,9 +206,10 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",  # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
-            "transformer.decoder_layer.{bid}.router"  # Grok
+            "layers.{bid}.feed_forward.gate",  # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
+            "transformer.decoder_layer.{bid}.router",  # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
         ),

         # Feed-forward up
@@ -233,6 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",  # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
         ),

         # AWQ-activation gate
@@ -251,8 +257,9 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",  # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear"  # Grok (merged)
+            "layers.{bid}.feed_forward.experts.w1",  # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
         ),

         # Feed-forward down
@@ -280,6 +287,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2",  # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",  # dbrx
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
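
For context on how the new dbrx entries above are consumed: TensorNameMap expands the `{bid}` placeholder once per block index, so a tensor name from a dbrx checkpoint can be looked up directly and resolved to its canonical tensor type. The sketch below mirrors that expansion for the dbrx names only; it is a minimal illustration, not the gguf-py API, and the string labels and the `build_lookup` helper are informal stand-ins for the real `MODEL_TENSOR` enum and class machinery.

```python
# Minimal sketch (assumed helper, not gguf-py code): expand the {bid}
# placeholder in the dbrx mappings from the diff above, per block index,
# so checkpoint tensor names can be resolved to a tensor type and block id.

# dbrx per-block names taken verbatim from the diff; keys are informal labels.
block_mappings = {
    "ATTN_NORM":     "transformer.blocks.{bid}.norm_attn_norm.norm_1",
    "ATTN_QKV":      "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",
    "ATTN_OUT":      "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",
    "ATTN_OUT_NORM": "transformer.blocks.{bid}.norm_attn_norm.norm_2",
    "FFN_GATE_INP":  "transformer.blocks.{bid}.ffn.router.layer",
    "FFN_GATE_EXP":  "transformer.blocks.{bid}.ffn.experts.mlp.w1",
    "FFN_UP_EXP":    "transformer.blocks.{bid}.ffn.experts.mlp.v1",
    "FFN_DOWN_EXP":  "transformer.blocks.{bid}.ffn.experts.mlp.w2",
}

def build_lookup(n_blocks: int) -> dict[str, tuple[str, int]]:
    """Expand {bid} for every block index, mirroring what TensorNameMap does."""
    lookup: dict[str, tuple[str, int]] = {}
    for bid in range(n_blocks):
        for tensor_type, template in block_mappings.items():
            lookup[template.format(bid=bid)] = (tensor_type, bid)
    return lookup

lookup = build_lookup(n_blocks=2)
print(lookup["transformer.blocks.1.ffn.router.layer"])  # ('FFN_GATE_INP', 1)
```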