@@ -671,12 +671,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "                        number of layers to store in VRAM\n");
     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
     fprintf(stdout, "  -m FNAME, --model FNAME\n");
     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -867,12 +866,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
         }
-        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
         {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--main-gpu" || arg == "-mg")
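
For reference, a hedged usage sketch of the inverted flag: the `-m` and `-ngl` options are the ones documented in the help text above, while the `server` binary name and the model path are illustrative assumptions, not part of this diff.

    # before this change: the custom CUDA kernels were opt-in (assumed binary/model names)
    ./server -m model.bin -ngl 35 --mul-mat-q
    # after this change: mul_mat_q is the default; opt out to fall back to cuBLAS
    ./server -m model.bin -ngl 35 --no-mul-mat-q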