Commit ca32cb9

ikawrakow authored and ggerganov committed
quantize : be able to override metadata by key (ggml-org#6321)
* quantize: be able to override metadata by key

* minor : spacing

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent de9ceb4 commit ca32cb9

File tree

3 files changed: +96 -27 lines changed

examples/quantize/quantize.cpp

+67 -17
@@ -87,13 +87,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
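
Note: for illustration, a combined invocation of the new options could look like the following (file names and key choices are hypothetical; the TYPE tag must be int, float, or bool, as parsed by parse_kv_override below):

    ./quantize --override-kv tokenizer.ggml.add_bos_token=bool:false \
               --override-kv llama.expert_used_count=int:3 \
               model-f32.gguf model-q4_k_m.gguf Q4_K_M 8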
@@ -107,14 +111,14 @@ static void usage(const char * executable) {
     exit(1);
 }
 
-static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
         return;
     }
     int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
+    in.read((char *)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
         printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
         return;
@@ -124,39 +128,39 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
             return;
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[std::move(name)];
         int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
+        in.read((char *)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
             imatrix_data = {};
             return;
         }
         e.resize(nval);
-        in.read((char*)e.data(), nval*sizeof(float));
+        in.read((char *)e.data(), nval*sizeof(float));
         if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
+            printf("%s: failed reading data for entry %d\n", __func__, i);
             imatrix_data = {};
             return;
         }
         if (ncall > 0) {
             for (auto& v : e) v /= ncall;
         }
     }
-    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }
 
-static void prepare_imatrix(const std::string& imatrix_file,
-        const std::vector<std::string>& included_weights,
-        const std::vector<std::string>& excluded_weights,
-        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void prepare_imatrix(const std::string & imatrix_file,
+        const std::vector<std::string> & included_weights,
+        const std::vector<std::string> & excluded_weights,
+        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     if (!imatrix_file.empty()) {
         load_imatrix(imatrix_file, imatrix_data);
     }
@@ -201,6 +205,43 @@ static ggml_type parse_ggml_type(const char * arg) {
     return result;
 }
 
+static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char* sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.int_value = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.float_value = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.bool_value = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.bool_value = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
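
Note: a minimal sketch of what parse_kv_override accepts and rejects (key names are illustrative). The sep - data >= 128 check guards the fixed 128-byte key buffer of llama_model_kv_override, and only int, float, and bool tags exist; there is no string override in this change:

    std::vector<llama_model_kv_override> overrides;
    parse_kv_override("llama.expert_used_count=int:3", overrides);           // true: int override
    parse_kv_override("some.scale=float:0.5", overrides);                    // true: float override
    parse_kv_override("tokenizer.ggml.add_bos_token=bool:false", overrides); // true: bool override
    parse_kv_override("general.name=str:foo", overrides);                    // false: unknown type tag
    parse_kv_override("no-separator", overrides);                            // false: missing '='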
@@ -211,6 +252,7 @@ int main(int argc, char ** argv) {
     int arg_idx = 1;
     std::string imatrix_file;
     std::vector<std::string> included_weights, excluded_weights;
+    std::vector<llama_model_kv_override> kv_overrides;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -227,6 +269,10 @@
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
+            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -267,6 +313,11 @@
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
     }
+    if (!kv_overrides.empty()) {
+        kv_overrides.emplace_back();
+        kv_overrides.back().key[0] = 0;
+        params.kv_overrides = &kv_overrides;
+    }
 
     llama_backend_init();
 
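Note: the empty-key element appended above is a sentinel. params.kv_overrides crosses the C API as an opaque void *, so consumers that only receive a raw pointer (llama_model_loader, and the loop added to llama_model_quantize_internal below) walk the array until they hit an entry whose key is empty; a condensed sketch of the convention:

    // producer: terminate the array with an empty-key entry
    kv_overrides.emplace_back();
    kv_overrides.back().key[0] = 0;

    // consumer: iterate until the sentinel
    for (const llama_model_kv_override * p = kv_overrides.data(); p->key[0] != 0; ++p) {
        // apply *p
    }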
@@ -288,8 +339,7 @@
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
-    }
-    else {
+    } else {
         fname_out = argv[arg_idx];
         arg_idx++;

llama.cpp

+28 -10
@@ -12776,7 +12776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap, NULL);
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
     ml.init_mappings(false); // no prefetching?
 
     llama_model model;
@@ -12805,6 +12810,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            } else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
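Note: to check that an override actually landed in the output file, the metadata can be read back with ggml's gguf API; a minimal standalone sketch (file and key names are hypothetical):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct gguf_init_params gparams = {
            /*.no_alloc =*/ true,   // metadata only, do not load tensor data
            /*.ctx      =*/ nullptr,
        };
        struct gguf_context * ctx = gguf_init_from_file("model-q4_k_m.gguf", gparams);
        if (!ctx) {
            return 1;
        }
        const int key_id = gguf_find_key(ctx, "tokenizer.ggml.add_bos_token");
        if (key_id >= 0) {
            printf("add_bos_token = %d\n", gguf_get_val_bool(ctx, key_id) ? 1 : 0);
        }
        gguf_free(ctx);
        return 0;
    }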
@@ -12813,21 +12834,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
+        } else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_ffn_down;
-        }
-        else if (name.find("ffn_gate") != std::string::npos) {
+        } else if (name.find("ffn_gate") != std::string::npos) {
             ++qs.n_ffn_gate;
-        }
-        else if (name.find("ffn_up") != std::string::npos) {
+        } else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
-        }
-        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
     }
-    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
                 __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
@@ -13363,6 +13380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.imatrix                     =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };
 
     return result;

llama.h

+1
@@ -284,6 +284,7 @@ extern "C" {
         bool only_copy;               // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                    // quantize all tensors to the default type
         void * imatrix;               // pointer to importance matrix data
+        void * kv_overrides;          // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
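
Note: since kv_overrides is exposed on llama_model_quantize_params, the feature is also reachable programmatically; a minimal sketch against the C API (file names are hypothetical; the vector must end with an empty-key sentinel, mirroring quantize.cpp above):

    #include "llama.h"
    #include <cstring>
    #include <vector>

    int main() {
        std::vector<llama_model_kv_override> kv_overrides;

        llama_model_kv_override kvo;
        std::strncpy(kvo.key, "tokenizer.ggml.add_bos_token", sizeof(kvo.key) - 1);
        kvo.key[sizeof(kvo.key) - 1] = 0;
        kvo.tag        = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        kvo.bool_value = false;
        kv_overrides.push_back(kvo);

        // sentinel entry: the internal loop stops at key[0] == 0
        kv_overrides.emplace_back();
        kv_overrides.back().key[0] = 0;

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.kv_overrides = &kv_overrides; // opaque void *, cast back inside llama.cpp

        llama_backend_init();
        const uint32_t rc = llama_model_quantize("model-f32.gguf", "model-q4_k_m.gguf", &params);
        llama_backend_free();
        return rc == 0 ? 0 : 1;
    }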
