
Commit 2e64897

cleanup unused --no-mul-mat-q, -nommq, -mmq, --mul-mat-q, mul_mat_q
1 parent 78aacf3 commit 2e64897

8 files changed: +4 −46 lines changed


common/common.cpp

Lines changed: 0 additions & 2 deletions
@@ -1281,7 +1281,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_batch         = params.n_batch;
     cparams.n_threads       = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.mul_mat_q       = params.mul_mat_q;
     cparams.seed            = params.seed;
     cparams.logits_all      = params.logits_all;
     cparams.embedding       = params.embedding;
@@ -1725,7 +1724,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
     fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

common/common.h

Lines changed: 0 additions & 1 deletion
@@ -115,7 +115,6 @@ struct gpt_params {
 
     bool kl_divergence   = false; // compute KL-divergence
 
-    bool mul_mat_q       = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt   = false; // do not randomize prompt if none provided
     bool use_color       = false; // use color to distinguish generations and inputs
     bool interactive     = false; // interactive mode
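
Note: downstream code that assigned the removed field breaks at compile time, which is the intended migration signal. A minimal sketch of an affected (hypothetical) caller:

    gpt_params params;
    // params.mul_mat_q = false;  // no longer compiles: the member was removed
    llama_context_params cparams = llama_context_params_from_gpt_params(params);

There is nothing to set in its place; the mul_mat_q kernels are now always used.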

examples/batched-bench/batched-bench.cpp

Lines changed: 4 additions & 10 deletions
@@ -41,7 +41,6 @@ int main(int argc, char ** argv) {
     int n_kv_max     = 2048;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
-    int mmq          = 0;
 
     std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
     std::vector<int> n_tg = { 128, 256, };
@@ -65,19 +64,15 @@ int main(int argc, char ** argv) {
     }
 
     if (argc >= 6) {
-        mmq = std::atoi(argv[5]);
+        n_pp = parse_list(argv[5]);
     }
 
     if (argc >= 7) {
-        n_pp = parse_list(argv[6]);
+        n_tg = parse_list(argv[6]);
     }
 
     if (argc >= 8) {
-        n_tg = parse_list(argv[7]);
-    }
-
-    if (argc >= 9) {
-        n_pl = parse_list(argv[8]);
+        n_pl = parse_list(argv[7]);
     }
 
     // init LLM
@@ -106,7 +101,6 @@ int main(int argc, char ** argv) {
     ctx_params.seed    = 1234;
     ctx_params.n_ctx   = n_kv_max;
     ctx_params.n_batch = 512;
-    ctx_params.mul_mat_q = mmq;
 
     ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -159,7 +153,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

examples/llama-bench/README.md

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ options:
   -mg, --main-gpu <i>                 (default: 0)
   -nkvo, --no-kv-offload <0|1>        (default: 0)
   -mmp, --mmap <0|1>                  (default: 1)
-  -mmq, --mul-mat-q <0|1>             (default: 1)
   -ts, --tensor_split <ts0/ts1/..>    (default: 0)
   -r, --repetitions <n>               (default: 5)
   -o, --output <csv|json|md|sql>      (default: md)

examples/llama-bench/llama-bench.cpp

Lines changed: 0 additions & 20 deletions
@@ -176,7 +176,6 @@ struct cmd_params {
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
-    std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     int reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
     /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu      */ {0},
     /* no_kv_offload */ {false},
-    /* mul_mat_q     */ {true},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap      */ {true},
     /* reps          */ 5,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
     printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
         } else if (arg == "-mmp" || arg == "--mmap") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
-    bool mul_mat_q;
     std::vector<float> tensor_split;
     bool use_mmap;
 
@@ -518,7 +506,6 @@ struct cmd_params_instance {
         cparams.n_batch = n_batch;
         cparams.type_k  = type_k;
         cparams.type_v  = type_v;
-        cparams.mul_mat_q   = mul_mat_q;
         cparams.offload_kqv = !no_kv_offload;
 
         return cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & nb : params.n_batch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
             };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
             };
@@ -639,7 +623,6 @@ struct test {
         split_mode    = inst.split_mode;
         main_gpu      = inst.main_gpu;
         no_kv_offload = inst.no_kv_offload;
-        mul_mat_q     = inst.mul_mat_q;
         tensor_split  = inst.tensor_split;
         use_mmap      = inst.use_mmap;
         n_prompt      = inst.n_prompt;
@@ -974,9 +957,6 @@ struct markdown_printer : public printer {
         if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
             fields.emplace_back("split_mode");
         }
-        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
-            fields.emplace_back("mul_mat_q");
-        }
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
             fields.emplace_back("no_kv_offload");
         }
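
Note: `mul_mat_q` was one axis of the nested sweep in get_cmd_params_instances, so `-mmq 0,1` used to double the number of benchmark instances. Both the axis and the flag are gone, and the markdown printer no longer emits a mul_mat_q column. Illustrative invocations (model path is a placeholder):

    # before: benchmark both kernel paths explicitly
    ./llama-bench -m model.gguf -mmq 0,1
    # after: the flag is no longer recognized and must simply be dropped
    ./llama-bench -m model.gguf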

examples/server/server.cpp

Lines changed: 0 additions & 8 deletions
@@ -2432,14 +2432,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
 #else
         LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
-    }
-    else if (arg == "--no-mul-mat-q" || arg == "-nommq")
-    {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-        params.mul_mat_q = false;
-#else
-        LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--main-gpu" || arg == "-mg")
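
Note: the server previously accepted `--no-mul-mat-q` / `-nommq` (effective only when built with GGML_USE_CUBLAS or GGML_USE_SYCL); launch scripts still passing it will now hit the parser's unknown-argument path instead. For example (model path is a placeholder):

    # before: ./server -m model.gguf -nommq   (disabled the mul_mat_q kernels)
    # after:  ./server -m model.gguf          (the kernels are always on; -nommq is no longer a valid flag)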

llama.cpp

Lines changed: 0 additions & 3 deletions
@@ -1645,7 +1645,6 @@ struct llama_cparams {
     float yarn_beta_slow;
     float defrag_thold;
 
-    bool mul_mat_q;
     bool offload_kqv;
     bool do_pooling;
 
@@ -11668,7 +11667,6 @@ struct llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data =*/ nullptr,
         /*.type_k            =*/ GGML_TYPE_F16,
         /*.type_v            =*/ GGML_TYPE_F16,
-        /*.mul_mat_q         =*/ true,
         /*.logits_all        =*/ false,
         /*.embedding         =*/ false,
         /*.offload_kqv       =*/ true,
@@ -11829,7 +11827,6 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.defrag_thold   = params.defrag_thold;
-    cparams.mul_mat_q      = params.mul_mat_q;
     cparams.offload_kqv    = params.offload_kqv;
     cparams.do_pooling     = params.do_pooling;
 

llama.h

Lines changed: 0 additions & 1 deletion
@@ -255,7 +255,6 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
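
Note: this is the public C API, so deleting a bool from the middle of llama_context_params changes the struct's size and the offsets of the fields after it. C and C++ callers that still set the member fail to compile, while FFI bindings that mirror the struct layout must be regenerated against the new header. A minimal sketch of an affected (hypothetical) caller:

    llama_context_params cparams = llama_context_default_params();
    // cparams.mul_mat_q = true;   // no longer compiles: the member was removed
    cparams.n_ctx       = 2048;    // unrelated fields are unaffected
    cparams.offload_kqv = true;
    llama_context * ctx = llama_new_context_with_model(model, cparams);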
