@@ -2370,104 +2370,69 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])

-    # Same as super class, but permuting q_proj, k_proj
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        n_experts = self.hparams.get("num_local_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.numpy()
-
-            if name.endswith("q_proj.weight"):
-                data = permute(data, n_head, n_head)
-            if name.endswith("k_proj.weight"):
-                data = permute(data, n_head, n_kv_head)
-
-            data = data.squeeze()
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))

-            # process the experts separately
-            if name.find("block_sparse_moe.experts") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for wid in range(1, 4):
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
+    _experts: list[dict[str, Tensor]] | None = None

-                            datas = []
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")

-                            data = np.stack(datas, axis=0)
-                            data_dtype = data.dtype
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

-                            if self.ftype == 0 and data_dtype == np.float16:
-                                data = data.astype(np.float32)
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]

-                            if self.ftype == 1 and data_dtype == np.float32:
-                                data = data.astype(np.float16)
+            assert bid is not None

-                            merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]

-                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
+            self._experts[bid][name] = data_torch

-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []

-                            self.gguf_writer.add_tensor(new_name, data)
-                continue
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []

-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]

-            n_dims = len(data.shape)
-            data_dtype = data.dtype
+                    data_torch = torch.stack(datas, dim=0)

-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"

-            # 1d tensors need to be converted to float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
+                    new_name = self.map_tensor_name(merged_name)

-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []

-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+        return [(self.map_tensor_name(name), data_torch)]

-            self.gguf_writer.add_tensor(new_name, data)
+    def write_tensors(self):
+        super().write_tensors()

-        if len(experts) > 0:
-            raise ValueError(f"Unprocessed experts: {experts.keys()}")
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")


###### CONVERSION LOGIC ######
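For context on what the refactor above does: the new `permute` staticmethod only reorders rows of the `q_proj`/`k_proj` weights (to match the rotary layout expected on the llama.cpp side), and `modify_tensors` now buffers per-expert 2D weights per layer until all `n_experts * 3` of them have arrived, then stacks each of `w1`/`w2`/`w3` into a single 3D tensor. A minimal standalone sketch of those two properties follows; it is not part of this diff, the shapes are arbitrary, and only `torch` is assumed to be installed.

```python
from __future__ import annotations

import torch
from torch import Tensor


def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
    # copy of the helper added in this diff
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))


n_head, head_dim, hidden = 4, 8, 32
w = torch.arange(n_head * head_dim * hidden, dtype=torch.float32).reshape(n_head * head_dim, hidden)

p = permute(w, n_head, n_head)
# the permutation only reorders rows: same shape, same multiset of values per column
assert p.shape == w.shape
assert not torch.equal(p, w)
assert torch.equal(p.sort(dim=0).values, w.sort(dim=0).values)

# stacking per-expert 2D weights the way modify_tensors() does yields one
# [n_experts, out_features, in_features] tensor per w1/w2/w3 of each layer
n_experts = 8
datas = [torch.randn(16, 32) for _ in range(n_experts)]
merged = torch.stack(datas, dim=0)
assert merged.shape == (n_experts, 16, 32)
```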