Commit cf5fb24

phymbert authored and slaren committed
llama : cleanup unused mmq flags (ggml-org#5772)
* cleanup unused --no-mul-mat-q, -nommq, -mmq, --mul-mat-q, mul_mat_q
* remove: mul_mat_q in compare-llama-bench and usage
* update llama-bench

Co-authored-by: slaren <[email protected]>
1 parent ade2ef5 commit cf5fb24

File tree: 9 files changed, +10 -56 lines


common/common.cpp

Lines changed: 0 additions & 2 deletions
@@ -1281,7 +1281,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_batch = params.n_batch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.mul_mat_q = params.mul_mat_q;
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embedding = params.embedding;
@@ -1725,7 +1724,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
     fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

common/common.h

Lines changed: 0 additions & 1 deletion
@@ -115,7 +115,6 @@ struct gpt_params {

    bool kl_divergence = false; // compute KL-divergence

-   bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
    bool random_prompt = false; // do not randomize prompt if none provided
    bool use_color = false; // use color to distinguish generations and inputs
    bool interactive = false; // interactive mode

examples/batched-bench/batched-bench.cpp

Lines changed: 6 additions & 12 deletions
@@ -32,16 +32,15 @@ int main(int argc, char ** argv) {
     gpt_params params;

     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
     }

     int n_kv_max = 2048;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
-    int mmq = 0;

     std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
     std::vector<int> n_tg = { 128, 256, };
@@ -65,19 +64,15 @@ int main(int argc, char ** argv) {
     }

     if (argc >= 6) {
-        mmq = std::atoi(argv[5]);
+        n_pp = parse_list(argv[5]);
     }

     if (argc >= 7) {
-        n_pp = parse_list(argv[6]);
+        n_tg = parse_list(argv[6]);
     }

     if (argc >= 8) {
-        n_tg = parse_list(argv[7]);
-    }
-
-    if (argc >= 9) {
-        n_pl = parse_list(argv[8]);
+        n_pl = parse_list(argv[7]);
     }

     // init LLM
@@ -106,7 +101,6 @@ int main(int argc, char ** argv) {
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_max;
     ctx_params.n_batch = 512;
-    ctx_params.mul_mat_q = mmq;

     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -159,7 +153,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");

     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

examples/llama-bench/README.md

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ options:
  -mg, --main-gpu <i> (default: 0)
  -nkvo, --no-kv-offload <0|1> (default: 0)
  -mmp, --mmap <0|1> (default: 1)
- -mmq, --mul-mat-q <0|1> (default: 1)
  -ts, --tensor_split <ts0/ts1/..> (default: 0)
  -r, --repetitions <n> (default: 5)
  -o, --output <csv|json|md|sql> (default: md)

examples/llama-bench/llama-bench.cpp

Lines changed: 3 additions & 27 deletions
@@ -176,7 +176,6 @@ struct cmd_params {
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
-   std::vector<bool> mul_mat_q;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    int reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
-   /* mul_mat_q     */ {true},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* reps          */ 5,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-   printf("  -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
    printf("  -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
    printf("  -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-       } else if (arg == "-mmq" || arg == "--mul-mat-q") {
-           if (++i >= argc) {
-               invalid_param = true;
-               break;
-           }
-           auto p = split<bool>(argv[i], split_delim);
-           params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-   if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
-   bool mul_mat_q;
    std::vector<float> tensor_split;
    bool use_mmap;

@@ -518,7 +506,6 @@ struct cmd_params_instance {
        cparams.n_batch = n_batch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
-       cparams.mul_mat_q = mul_mat_q;
        cparams.offload_kqv = !no_kv_offload;

        return cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & nb : params.n_batch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
-   for (const auto & mmq : params.mul_mat_q)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .split_mode   = */ sm,
            /* .main_gpu     = */ mg,
            /* .no_kv_offload= */ nkvo,
-           /* .mul_mat_q    = */ mmq,
            /* .tensor_split = */ ts,
            /* .use_mmap     = */ mmp,
        };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .split_mode   = */ sm,
            /* .main_gpu     = */ mg,
            /* .no_kv_offload= */ nkvo,
-           /* .mul_mat_q    = */ mmq,
            /* .tensor_split = */ ts,
            /* .use_mmap     = */ mmp,
        };
@@ -616,7 +600,6 @@ struct test {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
-   bool mul_mat_q;
    std::vector<float> tensor_split;
    bool use_mmap;
    int n_prompt;
@@ -639,7 +622,6 @@ struct test {
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
-       mul_mat_q = inst.mul_mat_q;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        n_prompt = inst.n_prompt;
@@ -713,7 +695,7 @@ struct test {
        "n_batch", "n_threads", "type_k", "type_v",
        "n_gpu_layers", "split_mode",
        "main_gpu", "no_kv_offload",
-       "mul_mat_q", "tensor_split", "use_mmap",
+       "tensor_split", "use_mmap",
        "n_prompt", "n_gen", "test_time",
        "avg_ns", "stddev_ns",
        "avg_ts", "stddev_ts"
@@ -733,7 +715,7 @@ struct test {
        }
        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-           field == "mul_mat_q" || field == "use_mmap") {
+           field == "use_mmap") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -767,7 +749,7 @@ struct test {
        std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
        std::to_string(n_gpu_layers), split_mode_str(split_mode),
        std::to_string(main_gpu), std::to_string(no_kv_offload),
-       std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
+       tensor_split_str, std::to_string(use_mmap),
        std::to_string(n_prompt), std::to_string(n_gen), test_time,
        std::to_string(avg_ns()), std::to_string(stdev_ns()),
        std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -931,9 +913,6 @@ struct markdown_printer : public printer {
        if (field == "n_threads") {
            return "threads";
        }
-       if (field == "mul_mat_q") {
-           return "mmq";
-       }
        if (field == "no_kv_offload") {
            return "nkvo";
        }
@@ -974,9 +953,6 @@ struct markdown_printer : public printer {
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
-       if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
-           fields.emplace_back("mul_mat_q");
-       }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
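Note on the change above: the surviving list-valued flags such as -nkvo and -mmp still go through the split<bool>(argv[i], split_delim) call visible in the parse loop. Below is only a minimal sketch of that split<T> pattern, assuming a straightforward stream-based implementation (the repository's actual helper may differ):

    // Illustrative sketch of the split<T>(value, delimiter) pattern used by
    // parse_cmd_params above: one flag value like "0,1" expands into several
    // benchmark configurations. Not the repository's exact implementation.
    #include <sstream>
    #include <string>
    #include <vector>

    template <typename T>
    static std::vector<T> split_sketch(const std::string & str, char delim) {
        std::vector<T> values;
        std::stringstream ss(str);
        std::string item;
        while (std::getline(ss, item, delim)) {
            T value;
            std::stringstream(item) >> value; // for bool: "0" -> false, "1" -> true
            values.push_back(value);
        }
        return values;
    }

    // Usage sketch: split_sketch<bool>("0,1", ',') yields {false, true}, so a
    // single "-nkvo 0,1" argument benchmarks both KV-offload settings in one run.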

examples/server/server.cpp

Lines changed: 0 additions & 8 deletions
@@ -2390,14 +2390,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
 #else
        LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
-    }
-    else if (arg == "--no-mul-mat-q" || arg == "-nommq")
-    {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-        params.mul_mat_q = false;
-#else
-        LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
    }
    else if (arg == "--main-gpu" || arg == "-mg")

llama.cpp

Lines changed: 0 additions & 3 deletions
@@ -1645,7 +1645,6 @@ struct llama_cparams {
    float yarn_beta_slow;
    float defrag_thold;

-   bool mul_mat_q;
    bool offload_kqv;
    bool do_pooling;

@@ -11633,7 +11632,6 @@ struct llama_context_params llama_context_default_params() {
        /*.cb_eval_user_data =*/ nullptr,
        /*.type_k            =*/ GGML_TYPE_F16,
        /*.type_v            =*/ GGML_TYPE_F16,
-       /*.mul_mat_q         =*/ true,
        /*.logits_all        =*/ false,
        /*.embedding         =*/ false,
        /*.offload_kqv       =*/ true,
@@ -11785,7 +11783,6 @@ struct llama_context * llama_new_context_with_model(
    cparams.yarn_beta_fast = params.yarn_beta_fast;
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.defrag_thold = params.defrag_thold;
-   cparams.mul_mat_q = params.mul_mat_q;
    cparams.offload_kqv = params.offload_kqv;
    cparams.do_pooling = params.do_pooling;


llama.h

Lines changed: 0 additions & 1 deletion
@@ -255,7 +255,6 @@ extern "C" {
        enum ggml_type type_v; // data type for V cache

        // Keep the booleans together to avoid misalignment during copy-by-value.
-       bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embedding;   // embedding mode only
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

scripts/compare-llama-bench.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@
    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
    "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
    "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
-   "mul_mat_q": "MMQ", "tensor_split": "Tensor split"
+   "tensor_split": "Tensor split"
 }

 DEFAULT_SHOW = ["model_type"] # Always show these properties by default.
