@@ -88,6 +88,7 @@ class MODEL_ARCH(IntEnum):
     PERSIMMON : int = auto()
     REFACT    : int = auto()
     BERT      : int = auto()
+    PLAMO     : int = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -125,6 +126,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.PERSIMMON: "persimmon",
     MODEL_ARCH.REFACT:    "refact",
     MODEL_ARCH.BERT:      "bert",
+    MODEL_ARCH.PLAMO:     "plamo",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -282,6 +284,21 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.PLAMO: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
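For orientation, the new MODEL_ARCH.PLAMO entry, its "plamo" name string, and its tensor list above are what tell the GGUF side which tensors a PLaMo model is expected to carry. A minimal sketch of what that registration implies, assuming the gguf-py package re-exports the MODEL_TENSORS and TENSOR_NAMES tables (the block index 0 is just an example):

```python
# Sketch only: print the GGUF-side tensor names implied by the new PLAMO entry.
# Assumes gguf-py re-exports MODEL_ARCH, MODEL_TENSORS and TENSOR_NAMES.
import gguf

for tensor in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.PLAMO]:
    # Per-block names contain "{bid}"; substitute block 0 as an example,
    # e.g. MODEL_TENSOR.ATTN_Q -> "blk.0.attn_q".
    print(gguf.TENSOR_NAMES[tensor].format(bid=0))
```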
@@ -366,6 +383,7 @@ class TensorNameMap:
             "layers.{bid}.attention_norm",                          # llama-pth
             "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
+            "model.layers.layers.{bid}.norm",                       # plamo
         ),
 
         # Attention norm 2
@@ -384,45 +402,50 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",       # llama-hf
-            "layers.{bid}.attention.wq",                 # llama-pth
-            "encoder.layer.{bid}.attention.self.query",  # bert
-            "transformer.h.{bid}.attn.q_proj",           # gpt-j
+            "model.layers.{bid}.self_attn.q_proj",         # llama-hf
+            "layers.{bid}.attention.wq",                   # llama-pth
+            "encoder.layer.{bid}.attention.self.query",    # bert
+            "transformer.h.{bid}.attn.q_proj",             # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
         ),
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",     # llama-hf
-            "layers.{bid}.attention.wk",               # llama-pth
-            "encoder.layer.{bid}.attention.self.key",  # bert
-            "transformer.h.{bid}.attn.k_proj",         # gpt-j
+            "model.layers.{bid}.self_attn.k_proj",         # llama-hf
+            "layers.{bid}.attention.wk",                   # llama-pth
+            "encoder.layer.{bid}.attention.self.key",      # bert
+            "transformer.h.{bid}.attn.k_proj",             # gpt-j
+            "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
         ),
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",       # llama-hf
-            "layers.{bid}.attention.wv",                 # llama-pth
-            "encoder.layer.{bid}.attention.self.value",  # bert
-            "transformer.h.{bid}.attn.v_proj",           # gpt-j
+            "model.layers.{bid}.self_attn.v_proj",         # llama-hf
+            "layers.{bid}.attention.wv",                   # llama-pth
+            "encoder.layer.{bid}.attention.self.value",    # bert
+            "transformer.h.{bid}.attn.v_proj",             # gpt-j
+            "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
        ),
 
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",                    # gptneox
-            "transformer.h.{bid}.attn.c_proj",                          # gpt2 refact
-            "transformer.blocks.{bid}.attn.out_proj",                   # mpt
-            "transformer.h.{bid}.self_attention.dense",                 # falcon
-            "model.layers.{bid}.self_attn.o_proj",                      # llama-hf
-            "layers.{bid}.attention.wo",                                # llama-pth
-            "encoder.layer.{bid}.attention.output.dense",               # bert
-            "transformer.h.{bid}.attn.out_proj",                        # gpt-j
-            "language_model.encoder.layers.{bid}.self_attention.dense"  # persimmon
+            "gpt_neox.layers.{bid}.attention.dense",                     # gptneox
+            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact
+            "transformer.blocks.{bid}.attn.out_proj",                    # mpt
+            "transformer.h.{bid}.self_attention.dense",                  # falcon
+            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf
+            "layers.{bid}.attention.wo",                                 # llama-pth
+            "encoder.layer.{bid}.attention.output.dense",                # bert
+            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
+            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
+            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
         ),
 
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
-            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",   # llama-hf
-            "layers.{bid}.attention.inner_attention.rope.freqs",  # llama-pth
+            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",         # llama-hf
+            "layers.{bid}.attention.inner_attention.rope.freqs",        # llama-pth
+            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",  # plamo
         ),
 
         # Feed-forward norm
@@ -447,12 +470,14 @@ class TensorNameMap:
             "encoder.layer.{bid}.intermediate.dense",                 # bert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
+            "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
         ),
 
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
-            "layers.{bid}.feed_forward.w1",      # llama-pth
+            "model.layers.{bid}.mlp.gate_proj",         # llama-hf refact
+            "layers.{bid}.feed_forward.w1",             # llama-pth
+            "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
         ),
 
         # Feed-forward down
@@ -466,6 +491,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.dense",                       # bert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
+            "model.layers.layers.{bid}.mlp.down_proj",                # plamo
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
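Taken together, the "plamo" entries added to TensorNameMap above handle PLaMo's unusual doubled prefix (model.layers.layers.{bid}. ...) when converting a checkpoint. A small usage sketch, assuming the gguf-py TensorNameMap helpers get_tensor_name_map() and get_name(); the block count of 40 is arbitrary and only for illustration:

```python
# Sketch only: map a PLaMo checkpoint tensor name to its GGUF name via the
# mappings added in this diff. Helper names and the block count (40) are
# assumptions for illustration, not part of the change itself.
import gguf

tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.PLAMO, 40)

hf_name = "model.layers.layers.0.self_attn.q_proj.weight"
gguf_name = tensor_map.get_name(hf_name, try_suffixes=(".weight", ".bias"))
print(gguf_name)  # expected: "blk.0.attn_q.weight"
```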