@@ -47,6 +47,9 @@ def _get_model_architecture(self):
             return gguf.MODEL_ARCH.MPT
         if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
+        if arch == "FalconForCausalLM":
+            return gguf.MODEL_ARCH.FALCON
+
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
     def set_vocab(self):
@@ -180,6 +183,8 @@ def from_model_architecture(model_architecture):
             return MPTModel
         if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
             return BaichuanModel
+        if model_architecture == "FalconForCausalLM":
+            return FalconModel
         return Model
 
 class StableLMModel(Model):
@@ -537,3 +542,96 @@ def write_tensors(self):
             print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
             self.gguf_writer.add_tensor(new_name, data)
 
+
+class FalconModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        self.gguf_writer.add_name("Falcon")
+        self.gguf_writer.add_context_length(2048)  # not in config.json
+        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        head_dim = self.hparams["hidden_size"] // n_head
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data in self.get_tensors():
+            old_dtype = data.dtype
+
+            # convert any unsupported data types to float32
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
+                data = data.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data = torch.cat((q, k, v)).reshape_as(data)
+
+            data = data.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print("Can not map tensor '" + name + "'")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+            self.gguf_writer.add_tensor(new_name, data)
+
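Note (illustrative sketch, not part of the patch): the query_key_value rearrangement in FalconModel.write_tensors above can be sanity-checked on a toy tensor. Only the four transform lines are copied from the diff; the sizes (n_head=4, n_head_kv=2, head_dim=2) and the block-labelling scheme are made up for readability.

import torch

n_head, n_head_kv, head_dim = 4, 2, 2
hidden = n_head * head_dim  # 8

# Fused HF layout: for each kv group -> its query heads, then one K, then one V.
# Label each head_dim-row block so the rearrangement is visible:
#   10*g + h for query head h of group g, 100+g for keys, 200+g for values.
blocks = []
for g in range(n_head_kv):
    for h in range(n_head // n_head_kv):
        blocks.append(torch.full((head_dim, hidden), float(10 * g + h)))
    blocks.append(torch.full((head_dim, hidden), float(100 + g)))  # K of group g
    blocks.append(torch.full((head_dim, hidden), float(200 + g)))  # V of group g
data = torch.cat(blocks)  # shape: ((n_head + 2*n_head_kv) * head_dim, hidden)

# Same transform as in FalconModel.write_tensors:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
out = torch.cat((q, k, v)).reshape_as(data)

# First column of each head_dim-row block, before and after the transform:
print(data[::head_dim, 0].tolist())  # [0.0, 1.0, 100.0, 200.0, 10.0, 11.0, 101.0, 201.0]
print(out[::head_dim, 0].tolist())   # [0.0, 1.0, 10.0, 11.0, 100.0, 101.0, 200.0, 201.0]

After the transform all query blocks come first, followed by the key blocks and then the value blocks, which is the contiguous layout the comment in the diff describes.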