@@ -2466,6 +2466,157 @@ def set_vocab(self, *args, **kwargs):
        self.gguf_writer.add_add_eos_token(True)


+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            logger.error(f'Error: Missing {tokenizer_path}')
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
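+        # Copy the real entries from tokenizer.model; any ids it does not cover keep
+        # the "[PAD{i}]" placeholder and the very low default score set above.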
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokenizer_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
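+                    # Each entry maps a token id (as a string) to its metadata, e.g.
+                    # (illustrative values, not the actual Arctic ones):
+                    #   "31998": {"content": "<|some_token|>", "special": true}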
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if token_id >= vocab_size:
+                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_score = 0.0
+
+                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
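+        # SpecialVocab picks up the special token ids (BOS, EOS, ...) from the
+        # model's config files, so the redefined BOS/EOS tokens above are
+        # written to the GGUF metadata as well.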
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
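+        # the rotary dimension count is the full per-head size: hidden_size / n_heads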
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
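+        # Q and K weights get the same permutation as for Llama, matching the
+        # rotary-embedding layout that llama.cpp expects.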
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
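+            # one buffer dict per layer, keyed by the original HF tensor name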
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
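+                # each expert contributes three tensors (w1, w2, w3), so a layer is
+                # complete once n_experts * 3 of them have been collected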
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
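+                    # stacking n_experts 2d weight matrices along dim 0 gives a
+                    # tensor of shape (n_experts, out_features, in_features)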
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
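+            # leftovers mean some layer never received its full set of expert
+            # tensors; fail loudly rather than write a broken model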
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
###### CONVERSION LOGIC ######
