@@ -87,13 +87,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
87
87
//
88
88
[[noreturn]]
89
89
static void usage (const char * executable) {
90
- printf (" usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n " , executable);
90
+ printf (" usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n " , executable);
91
91
printf (" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n " );
92
92
printf (" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n " );
93
93
printf (" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n " );
94
94
printf (" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n " );
95
95
printf (" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n " );
96
96
printf (" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n " );
97
+ printf (" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n " );
98
+ printf (" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n " );
99
+ printf (" --override-kv KEY=TYPE:VALUE\n " );
100
+ printf (" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n " );
97
101
printf (" Note: --include-weights and --exclude-weights cannot be used together\n " );
98
102
printf (" \n Allowed quantization types:\n " );
99
103
for (auto & it : QUANT_OPTIONS) {
@@ -107,14 +111,14 @@ static void usage(const char * executable) {
107
111
exit (1 );
108
112
}
109
113
110
- static void load_imatrix (const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float >>& imatrix_data) {
114
+ static void load_imatrix (const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float >> & imatrix_data) {
111
115
std::ifstream in (imatrix_file.c_str (), std::ios::binary);
112
116
if (!in) {
113
- printf (" %s: failed to open %s\n " ,__func__,imatrix_file.c_str ());
117
+ printf (" %s: failed to open %s\n " ,__func__, imatrix_file.c_str ());
114
118
return ;
115
119
}
116
120
int n_entries;
117
- in.read ((char *)&n_entries, sizeof (n_entries));
121
+ in.read ((char *)&n_entries, sizeof (n_entries));
118
122
if (in.fail () || n_entries < 1 ) {
119
123
printf (" %s: no data in file %s\n " , __func__, imatrix_file.c_str ());
120
124
return ;
@@ -124,39 +128,39 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
124
128
std::vector<char > name_as_vec (len+1 );
125
129
in.read ((char *)name_as_vec.data (), len);
126
130
if (in.fail ()) {
127
- printf (" %s: failed reading name for entry %d from %s\n " ,__func__,i+1 ,imatrix_file.c_str ());
131
+ printf (" %s: failed reading name for entry %d from %s\n " , __func__, i+1 , imatrix_file.c_str ());
128
132
return ;
129
133
}
130
134
name_as_vec[len] = 0 ;
131
135
std::string name{name_as_vec.data ()};
132
- auto & e = imatrix_data[std::move (name)];
136
+ auto & e = imatrix_data[std::move (name)];
133
137
int ncall;
134
- in.read ((char *)&ncall, sizeof (ncall));
138
+ in.read ((char *)&ncall, sizeof (ncall));
135
139
int nval;
136
140
in.read ((char *)&nval, sizeof (nval));
137
141
if (in.fail () || nval < 1 ) {
138
- printf (" %s: failed reading number of values for entry %d\n " ,__func__,i);
142
+ printf (" %s: failed reading number of values for entry %d\n " , __func__, i);
139
143
imatrix_data = {};
140
144
return ;
141
145
}
142
146
e.resize (nval);
143
- in.read ((char *)e.data (), nval*sizeof (float ));
147
+ in.read ((char *)e.data (), nval*sizeof (float ));
144
148
if (in.fail ()) {
145
- printf (" %s: failed reading data for entry %d\n " ,__func__,i);
149
+ printf (" %s: failed reading data for entry %d\n " , __func__, i);
146
150
imatrix_data = {};
147
151
return ;
148
152
}
149
153
if (ncall > 0 ) {
150
154
for (auto & v : e) v /= ncall;
151
155
}
152
156
}
153
- printf (" %s: loaded %d importance matrix entries from %s\n " ,__func__,int (imatrix_data.size ()),imatrix_file.c_str ());
157
+ printf (" %s: loaded %d importance matrix entries from %s\n " , __func__, int (imatrix_data.size ()), imatrix_file.c_str ());
154
158
}
155
159
156
- static void prepare_imatrix (const std::string& imatrix_file,
157
- const std::vector<std::string>& included_weights,
158
- const std::vector<std::string>& excluded_weights,
159
- std::unordered_map<std::string, std::vector<float >>& imatrix_data) {
160
+ static void prepare_imatrix (const std::string & imatrix_file,
161
+ const std::vector<std::string> & included_weights,
162
+ const std::vector<std::string> & excluded_weights,
163
+ std::unordered_map<std::string, std::vector<float >> & imatrix_data) {
160
164
if (!imatrix_file.empty ()) {
161
165
load_imatrix (imatrix_file, imatrix_data);
162
166
}
@@ -201,6 +205,43 @@ static ggml_type parse_ggml_type(const char * arg) {
201
205
return result;
202
206
}
203
207
208
+ static bool parse_kv_override (const char * data, std::vector<llama_model_kv_override> & overrides) {
209
+ const char * sep = strchr (data, ' =' );
210
+ if (sep == nullptr || sep - data >= 128 ) {
211
+ fprintf (stderr, " %s: malformed KV override '%s'\n " , __func__, data);
212
+ return false ;
213
+ }
214
+ llama_model_kv_override kvo;
215
+ std::strncpy (kvo.key , data, sep - data);
216
+ kvo.key [sep - data] = 0 ;
217
+ sep++;
218
+ if (strncmp (sep, " int:" , 4 ) == 0 ) {
219
+ sep += 4 ;
220
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
221
+ kvo.int_value = std::atol (sep);
222
+ } else if (strncmp (sep, " float:" , 6 ) == 0 ) {
223
+ sep += 6 ;
224
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
225
+ kvo.float_value = std::atof (sep);
226
+ } else if (strncmp (sep, " bool:" , 5 ) == 0 ) {
227
+ sep += 5 ;
228
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
229
+ if (std::strcmp (sep, " true" ) == 0 ) {
230
+ kvo.bool_value = true ;
231
+ } else if (std::strcmp (sep, " false" ) == 0 ) {
232
+ kvo.bool_value = false ;
233
+ } else {
234
+ fprintf (stderr, " %s: invalid boolean value for KV override '%s'\n " , __func__, data);
235
+ return false ;
236
+ }
237
+ } else {
238
+ fprintf (stderr, " %s: invalid type for KV override '%s'\n " , __func__, data);
239
+ return false ;
240
+ }
241
+ overrides.emplace_back (std::move (kvo));
242
+ return true ;
243
+ }
244
+
204
245
int main (int argc, char ** argv) {
205
246
if (argc < 3 ) {
206
247
usage (argv[0 ]);
@@ -211,6 +252,7 @@ int main(int argc, char ** argv) {
211
252
int arg_idx = 1 ;
212
253
std::string imatrix_file;
213
254
std::vector<std::string> included_weights, excluded_weights;
255
+ std::vector<llama_model_kv_override> kv_overrides;
214
256
215
257
for (; arg_idx < argc && strncmp (argv[arg_idx], " --" , 2 ) == 0 ; arg_idx++) {
216
258
if (strcmp (argv[arg_idx], " --leave-output-tensor" ) == 0 ) {
@@ -227,6 +269,10 @@ int main(int argc, char ** argv) {
227
269
} else {
228
270
usage (argv[0 ]);
229
271
}
272
+ } else if (strcmp (argv[arg_idx], " --override-kv" ) == 0 ) {
273
+ if (arg_idx == argc-1 || !parse_kv_override (argv[++arg_idx], kv_overrides)) {
274
+ usage (argv[0 ]);
275
+ }
230
276
} else if (strcmp (argv[arg_idx], " --allow-requantize" ) == 0 ) {
231
277
params.allow_requantize = true ;
232
278
} else if (strcmp (argv[arg_idx], " --pure" ) == 0 ) {
@@ -267,6 +313,11 @@ int main(int argc, char ** argv) {
267
313
if (!imatrix_data.empty ()) {
268
314
params.imatrix = &imatrix_data;
269
315
}
316
+ if (!kv_overrides.empty ()) {
317
+ kv_overrides.emplace_back ();
318
+ kv_overrides.back ().key [0 ] = 0 ;
319
+ params.kv_overrides = &kv_overrides;
320
+ }
270
321
271
322
llama_backend_init ();
272
323
@@ -288,8 +339,7 @@ int main(int argc, char ** argv) {
288
339
if (ftype_str == " COPY" ) {
289
340
params.only_copy = true ;
290
341
}
291
- }
292
- else {
342
+ } else {
293
343
fname_out = argv[arg_idx];
294
344
arg_idx++;
295
345
0 commit comments