@@ -176,7 +176,6 @@ struct cmd_params {
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
-    std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     int reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
     /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu      */ {0},
     /* no_kv_offload */ {false},
-    /* mul_mat_q     */ {true},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap      */ {true},
     /* reps          */ 5,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
     printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
         } else if (arg == "-mmp" || arg == "--mmap") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
-    bool mul_mat_q;
     std::vector<float> tensor_split;
     bool use_mmap;

@@ -518,7 +506,6 @@ struct cmd_params_instance {
         cparams.n_batch = n_batch;
         cparams.type_k = type_k;
         cparams.type_v = type_v;
-        cparams.mul_mat_q = mul_mat_q;
         cparams.offload_kqv = !no_kv_offload;

         return cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & nb : params.n_batch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
             };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
-                /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .use_mmap     = */ mmp,
             };
@@ -616,7 +600,6 @@ struct test {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
-    bool mul_mat_q;
     std::vector<float> tensor_split;
     bool use_mmap;
     int n_prompt;
@@ -639,7 +622,6 @@ struct test {
         split_mode = inst.split_mode;
         main_gpu = inst.main_gpu;
         no_kv_offload = inst.no_kv_offload;
-        mul_mat_q = inst.mul_mat_q;
         tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
         n_prompt = inst.n_prompt;
@@ -713,7 +695,7 @@ struct test {
             "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split", "use_mmap",
+            "tensor_split", "use_mmap",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -733,7 +715,7 @@ struct test {
         }
         if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "mul_mat_q" || field == "use_mmap") {
+            field == "use_mmap") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -767,7 +749,7 @@ struct test {
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
+            tensor_split_str, std::to_string(use_mmap),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -931,9 +913,6 @@ struct markdown_printer : public printer {
         if (field == "n_threads") {
             return "threads";
         }
-        if (field == "mul_mat_q") {
-            return "mmq";
-        }
         if (field == "no_kv_offload") {
             return "nkvo";
         }
@@ -974,9 +953,6 @@ struct markdown_printer : public printer {
         if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
             fields.emplace_back("split_mode");
         }
-        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
-            fields.emplace_back("mul_mat_q");
-        }
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
             fields.emplace_back("no_kv_offload");
         }