@@ -240,23 +240,6 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i
        return False

    def write_tensors(self):
-        # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
-        def np_fp32_to_bf16(n: np.ndarray):
-            # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-            # flush subnormals to zero
-            n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
-            # round to nearest even
-            n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n.astype(np.int16)
-
-        # Doing this row-wise is much, much faster than element-wise, hence the signature
-        v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
-        if self.lazy:
-            # TODO: find a way to implicitly wrap np.vectorize functions
-            # NOTE: the type is changed to reflect otypes passed to np.vectorize above
-            v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
-
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

        for name, data_torch in self.get_tensors():
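For reference, the removed helper performed the standard float32-to-bfloat16 bit manipulation (per its comment, the same logic as ggml_compute_fp32_to_bf16), which the next hunk replaces with a call to gguf.quantize_bf16. A minimal standalone sketch of that bit trick, assuming uint16 output for the raw bf16 bit patterns (the converter itself kept int16); the function name here is illustrative only:

import numpy as np

def fp32_to_bf16_bits(a: np.ndarray) -> np.ndarray:
    # reinterpret the float32 bits as unsigned 32-bit integers
    n = np.asarray(a, dtype=np.float32).view(np.uint32)
    # force NaN to a quiet NaN so truncating the low 16 bits cannot produce an infinity
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to (signed) zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even, then keep the upper 16 bits
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)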
@@ -309,27 +292,31 @@ def np_fp32_to_bf16(n: np.ndarray):
                ))

                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data = gguf.quantize_bf16(data)
+                        assert data.dtype == np.int16
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                        data = gguf.quantize_q8_0(data)
+                        assert data.dtype == np.uint8
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+                    else:  # default to float16 for quantized tensors
                        if data_dtype != np.float16:
                            data = data.astype(np.float16)
                        data_qtype = gguf.GGMLQuantizationType.F16

-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        if data_dtype != np.float32:
-                            data = data.astype(np.float32)
-                        data = v_fp32_to_bf16(data.view(np.int32))
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    else:  # by default, convert to float32
+                if data_qtype is None:  # by default, convert to float32
                    if data_dtype != np.float32:
                        data = data.astype(np.float32)
                    data_qtype = gguf.GGMLQuantizationType.F32

-                assert data_qtype is not None
-
+                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
                # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+                )}}}"""

                # n_dims is implicit in the shape
                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
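The reworked shape_str recovers the logical length of the last dimension from the quantized byte layout, so the log still reports element counts rather than packed bytes. A quick sanity check of that arithmetic, assuming the usual GGML Q8_0 layout of 32 weights plus a float16 scale (34 bytes per 32-element block); the row length is a made-up example:

import numpy as np

block_size, type_size = 32, 34                      # assumed Q8_0 block layout
n_per_row = 4096                                    # hypothetical logical row length
packed = np.zeros((2, n_per_row // block_size * type_size), dtype=np.uint8)  # 4352 bytes per row
# same expression as in shape_str above
logical_last = packed.shape[-1] * packed.dtype.itemsize // type_size * block_size
assert logical_last == n_per_row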
@@ -859,6 +846,7 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
@@ -981,6 +969,7 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1215,6 +1204,7 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)

    _q_norms: list[dict[str, Tensor]] | None = None
    _k_norms: list[dict[str, Tensor]] | None = None
@@ -1591,6 +1581,7 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)


@Model.register("Qwen2ForCausalLM")
@@ -1828,6 +1819,7 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
@@ -2007,6 +1999,7 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_heads = self.hparams["num_attention_heads"]
@@ -2415,25 +2408,15 @@ class LazyTorchTensor(gguf.LazyBase):
    def numpy(self) -> gguf.LazyNumpyTensor:
        dtype = self._dtype_map[self.dtype]
        return gguf.LazyNumpyTensor(
-            meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
            lazy=self._lazy,
            args=(self,),
            func=(lambda s: s[0].numpy())
        )

    @classmethod
-    def eager_to_meta(cls, t: Tensor) -> Tensor:
-        if t.is_meta:
-            return t
-        return t.detach().to("meta")
-
-    @classmethod
-    def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
-        m = m.detach()
-        if not m.is_meta:
-            m = m.to("meta")
-        m.dtype = dtype
-        return m
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
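The replacement leans on PyTorch's "meta" device: a meta tensor carries only shape and dtype metadata and allocates no storage, which is all the lazy wrapper needs to plan the conversion. A small illustration (the shape is made up, not taken from the converter):

import torch

# a meta tensor records shape/dtype but owns no data buffer
t = torch.empty(size=(4096, 4096), dtype=torch.float16, device="meta")
print(t.shape, t.dtype, t.is_meta)  # torch.Size([4096, 4096]) torch.float16 True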
@@ -2464,8 +2447,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -2523,6 +2506,7 @@ def main() -> None:
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }