@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
136
136
calc_ff = (((8 * n_embd ) // 3 + n_mult - 1 ) // n_mult )* n_mult
137
137
if calc_ff == n_ff :
138
138
return n_mult
139
- return 1
139
+ raise Exception ( f"failed to find n_mult for (n_ff= { n_ff } , n_embd= { n_embd } )." )
140
140
141
141
@dataclass
142
142
class Params :
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
154
154
# try transformer naming first
155
155
if "model.layers.0.self_attn.q_proj.weight" in model :
156
156
n_layer = next (i for i in itertools .count () if f"model.layers.{ i } .self_attn.q_proj.weight" not in model )
157
+ elif "model.layers.0.self_attn.W_pack.weight" in model : # next: try baichuan naming
158
+ n_layer = next (i for i in itertools .count () if f"model.layers.{ i } .self_attn.W_pack.weight" not in model )
157
159
else :
158
160
n_layer = next (i for i in itertools .count () if f"layers.{ i } .attention.wq.weight" not in model )
159
161
162
+ if n_layer < 1 :
163
+ raise Exception ("failed to guess 'n_layer'. This model is unknown or unsupported.\n "
164
+ "Suggestion: provide 'config.json' of the model in the same directory containing model files." )
165
+
160
166
n_head = n_embd // 128 # guessed
161
167
162
168
return Params (
@@ -321,6 +327,10 @@ def astype(self, data_type: DataType) -> 'Tensor': ...
321
327
@abstractmethod
322
328
def permute (self , n_head : int ) -> 'Tensor' : ...
323
329
@abstractmethod
330
+ def permute_part (self , n_part : int , n_head : int ) -> 'UnquantizedTensor' : ...
331
+ @abstractmethod
332
+ def part (self , n_part : int ) -> 'UnquantizedTensor' : ...
333
+ @abstractmethod
324
334
def to_ggml (self ) -> 'GGMLCompatibleTensor' : ...
325
335
326
336
@@ -345,6 +355,14 @@ def astype(self, data_type: DataType) -> Tensor:
345
355
def to_ggml(self) -> 'UnquantizedTensor':
    """Return this tensor as its own GGML-compatible representation."""
    # No conversion step is performed here: the very same object is handed back.
    return self
347
357
358
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
    """Slice out one third of the rows and permute it.

    The first axis of ``self.ndarray`` is treated as three consecutive
    blocks of ``shape[0] // 3`` rows each.  Block ``n_part`` (0-based) is
    sliced out, passed through the module-level ``permute`` helper together
    with ``n_head``, and wrapped in a new ``UnquantizedTensor``.
    """
    rows_per_part = self.ndarray.shape[0] // 3
    first_row = rows_per_part * n_part
    block = self.ndarray[first_row:first_row + rows_per_part, ...]
    return UnquantizedTensor(permute(block, n_head))
361
+
362
def part(self, n_part: int) -> 'UnquantizedTensor':
    """Return one third of the rows as a new ``UnquantizedTensor``.

    The first axis of ``self.ndarray`` is treated as three consecutive
    blocks of ``shape[0] // 3`` rows each; block ``n_part`` (0-based) is
    sliced out unchanged.
    """
    rows_per_part = self.ndarray.shape[0] // 3
    first_row = rows_per_part * n_part
    block = self.ndarray[first_row:first_row + rows_per_part, ...]
    return UnquantizedTensor(block)
365
+
348
366
def permute(self, n_head: int) -> 'UnquantizedTensor':
    """Wrap the result of the module-level ``permute`` helper in a new tensor.

    ``self.ndarray`` and ``n_head`` are forwarded to the helper as-is.
    """
    permuted = permute(self.ndarray, n_head)
    return UnquantizedTensor(permuted)
350
368
@@ -642,6 +660,19 @@ def load() -> Tensor:
642
660
return lazy_tensor .load ().permute (n_head )
643
661
return LazyTensor (load , lazy_tensor .shape , lazy_tensor .data_type , f'permute({ n_head } ) ' + lazy_tensor .description )
644
662
663
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
    """Build a lazy tensor for one permuted third of ``lazy_tensor``.

    Nothing is loaded up front: the returned ``LazyTensor`` wraps a loader
    that, when invoked, loads the source tensor and calls
    ``permute_part(n_part, n_head)`` on it.

    The advertised shape is a copy of the source shape with its first
    dimension floor-divided by 3, so the source's own shape list is not
    mutated.  The data type is carried over unchanged and the description
    is prefixed with ``permute(<n_head>) ``.
    """
    def load() -> Tensor:
        return lazy_tensor.load().permute_part(n_part, n_head)

    part_shape = lazy_tensor.shape.copy()
    part_shape[0] //= 3
    # Description restored without the extraction-inserted spaces, so it
    # reads "permute(N) ..." rather than "permute(N ) ...".
    description = f'permute({n_head}) ' + lazy_tensor.description
    return LazyTensor(load, part_shape, lazy_tensor.data_type, description)
669
+
670
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    """Build a lazy tensor for one third of ``lazy_tensor``.

    The returned ``LazyTensor`` wraps a loader that, when invoked, loads
    the source tensor and calls ``part(n_part)`` on it.  The advertised
    shape is a copy of the source shape with its first dimension
    floor-divided by 3; the data type is carried over unchanged and the
    description is prefixed with ``part ``.
    """
    def load() -> Tensor:
        return lazy_tensor.load().part(n_part)

    part_shape = lazy_tensor.shape.copy()
    part_shape[0] //= 3
    description = 'part ' + lazy_tensor.description
    return LazyTensor(load, part_shape, lazy_tensor.data_type, description)
645
676
646
677
def convert_transformers_to_orig (model : LazyModel , params : Params ) -> LazyModel :
647
678
out : LazyModel = {}
@@ -650,11 +681,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
650
681
out ["output.weight" ] = model ["lm_head.weight" ]
651
682
652
683
for i in itertools .count ():
653
- if f"model.layers.{ i } .self_attn.q_proj.weight" not in model :
684
+ if f"model.layers.{ i } .self_attn.q_proj.weight" in model :
685
+ out [f"layers.{ i } .attention.wq.weight" ] = permute_lazy (model [f"model.layers.{ i } .self_attn.q_proj.weight" ], params .n_head )
686
+ out [f"layers.{ i } .attention.wk.weight" ] = permute_lazy (model [f"model.layers.{ i } .self_attn.k_proj.weight" ], params .n_head )
687
+ out [f"layers.{ i } .attention.wv.weight" ] = model [f"model.layers.{ i } .self_attn.v_proj.weight" ]
688
+ elif f"model.layers.{ i } .self_attn.W_pack.weight" in model :
689
+ out [f"layers.{ i } .attention.wq.weight" ] = permute_part_lazy (model [f"model.layers.{ i } .self_attn.W_pack.weight" ], 0 , params .n_head )
690
+ out [f"layers.{ i } .attention.wk.weight" ] = permute_part_lazy (model [f"model.layers.{ i } .self_attn.W_pack.weight" ], 1 , params .n_head )
691
+ out [f"layers.{ i } .attention.wv.weight" ] = part_lazy (model [f"model.layers.{ i } .self_attn.W_pack.weight" ], 2 )
692
+ else :
654
693
break
655
- out [f"layers.{ i } .attention.wq.weight" ] = permute_lazy (model [f"model.layers.{ i } .self_attn.q_proj.weight" ], params .n_head )
656
- out [f"layers.{ i } .attention.wk.weight" ] = permute_lazy (model [f"model.layers.{ i } .self_attn.k_proj.weight" ], params .n_head )
657
- out [f"layers.{ i } .attention.wv.weight" ] = model [f"model.layers.{ i } .self_attn.v_proj.weight" ]
694
+
658
695
out [f"layers.{ i } .attention.wo.weight" ] = model [f"model.layers.{ i } .self_attn.o_proj.weight" ]
659
696
660
697
out [f"layers.{ i } .feed_forward.w1.weight" ] = model [f"model.layers.{ i } .mlp.gate_proj.weight" ]
0 commit comments