@@ -78,7 +78,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
        if not self.is_safetensors:
            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
        self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        self.tensor_names = None
        if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -2388,6 +2388,166 @@ def set_vocab(self, *args, **kwargs):
        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(True)

+@Model.register("ChatGLMModel")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
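+            # Token IDs below 64789 map into the underlying SentencePiece model and keep their
+            # real scores; IDs at or above that threshold are treated as ChatGLM's added special
+            # tokens and are marked UNKNOWN further below.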
+            if len(piece) != 0 and token_id < 64789:
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= 64789:
+                toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
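+        # ChatGLM applies rotary embeddings to only part of each attention head; the hard-coded
+        # 64 presumably corresponds to half of the 128-dim heads used by the 6B checkpoints.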
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+
+    def write_tensors(self):
+        block_count = self.hparams["num_layers"]
+        tensors = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        has_lm_head = True
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        for name, data_torch in tensors.items():
+            if name.endswith(".rotary_pos_emb.inv_freq"):
+                continue
+
+            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
+                has_lm_head = False
+
+            name = re.sub(r'transformer\.', '', name)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+                # Map bloom-style qkv_linear to gpt-style qkv_linear
+                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
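+                # The checkpoint stores Q, K and V interleaved per head; view the weight as
+                # (n_head, 3, head_dim, n_embed) and restack it as contiguous Q, K, V blocks.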
+                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+                data = np.concatenate(
+                    (
+                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.weight")
+            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+                data = np.concatenate(
+                    (
+                        qkv_bias[:, 0, :].reshape((n_embed,)),
+                        qkv_bias[:, 1, :].reshape((n_embed,)),
+                        qkv_bias[:, 2, :].reshape((n_embed,)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.bias")
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "word_embeddings.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+

###### CONVERSION LOGIC ######