 from __future__ import annotations

 import argparse
+import contextlib
 import json
 import os
 import struct

 import gguf


-def count_model_parts(dir_model: Path) -> int:
+def count_model_parts(dir_model: Path, prefix: str) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
+        if filename.startswith(prefix):
             num_parts += 1

     if num_parts > 0:
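For context: Hugging Face shards safetensors checkpoints as `model-00001-of-0000N.safetensors` and legacy PyTorch checkpoints as `pytorch_model-00001-of-0000N.bin`, which is what the new `prefix` argument distinguishes. A minimal sketch of the intended call pattern (the shard listing in the comment is hypothetical; it mirrors the detection logic added further down in this diff):

```python
# Hypothetical directory contents:
#   model-00001-of-00002.safetensors
#   model-00002-of-00002.safetensors
num_parts = count_model_parts(dir_model, "model-00")             # -> 2 safetensors shards
if num_parts == 0:
    num_parts = count_model_parts(dir_model, "pytorch_model-")   # fall back to .bin shards
```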
@@ -77,30 +78,36 @@ def parse_args() -> argparse.Namespace:
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
     hparams = json.load(f)

-if hparams["architectures"][0] != "RWForCausalLM":
+if hparams["architectures"][0] != "FalconForCausalLM":
     print("Model architecture not supported: " + hparams["architectures"][0])

     sys.exit(1)

 # get number of model parts
-num_parts = count_model_parts(dir_model)
+num_parts = count_model_parts(dir_model, "model-00")
+if num_parts:
+    is_safetensors = True
+    from safetensors import safe_open
+else:
+    is_safetensors = False
+    num_parts = count_model_parts(dir_model, "pytorch_model-")

 ARCH = gguf.MODEL_ARCH.FALCON
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

-block_count = hparams["n_layer"]
+block_count = hparams["num_hidden_layers"]

 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
 gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams:
-    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+if "num_kv_heads" in hparams:
+    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
 else:
     gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
@@ -146,8 +153,8 @@ def parse_args() -> argparse.Namespace:
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

 # params for qkv transform
-n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+n_head = hparams["num_attention_heads"]
+n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1

 head_dim = hparams["hidden_size"] // n_head

@@ -156,6 +163,10 @@ def parse_args() -> argparse.Namespace:

 if num_parts == 0:
     part_names = iter(("pytorch_model.bin",))
+elif is_safetensors:
+    part_names = (
+        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
+    )
 else:
     part_names = (
         f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
@@ -165,60 +176,64 @@ def parse_args() -> argparse.Namespace:
     if args.vocab_only:
         break
     print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        # QKV tensor transform
-        # The original query_key_value tensor contains n_head_kv "kv groups",
-        # each consisting of n_head/n_head_kv query weights followed by one key
-        # and one value weight (shared by all query heads in the kv group).
-        # This layout makes it a big pain to work with in GGML.
-        # So we rearrange them here, so that we have n_head query weights
-        # followed by n_head_kv key weights followed by n_head_kv value weights,
-        # in contiguous fashion.
-        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
-        if "query_key_value" in name:
-            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-            q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
-            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            data = torch.cat((q, k, v)).reshape_as(data)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-        if new_name is None:
-            print("Cannot map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
+    if is_safetensors:
+        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
+    else:
+        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
+
+    with ctx as model_part:
+        for name in model_part.keys():
+            data = model_part.get_tensor(name) if is_safetensors else model_part[name]
+
+            old_dtype = data.dtype
+
+            # convert any unsupported data types to float32
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
+                data = data.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data = torch.cat((q, k, v)).reshape_as(data)
+
+            data = data.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print("Cannot map tensor '" + name + "'")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+            gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")