Skip to content

Commit 54f9831

Browse files
m18coppola, Michael Coppola
authored and committed
server : add parameter -tb N, --threads-batch N (ggml-org#3584)
Co-authored-by: Michael Coppola <[email protected]>
1 parent b2f7e04 commit 54f9831

File tree

1 file changed

+19
-9
lines changed

1 file changed

+19
-9
lines changed

examples/server/server.cpp

+19 -9
Original file line number | Diff line number | Diff line change
@@ -1766,15 +1766,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
17661766
printf("usage: %s [options]\n", argv0);
17671767
printf("\n");
17681768
printf("options:\n");
1769-
printf(" -h, --help show this help message and exit\n");
1770-
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
1771-
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
1772-
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
1773-
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
1774-
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
1775-
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
1776-
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
1777-
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
1769+
printf(" -h, --help show this help message and exit\n");
1770+
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
1771+
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
1772+
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
1773+
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
1774+
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
1775+
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
1776+
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
1777+
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
1778+
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
17781779
if (llama_mlock_supported())
17791780
{
17801781
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -1924,6 +1925,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
19241925
}
19251926
params.n_threads = std::stoi(argv[i]);
19261927
}
1928+
else if (arg == "--threads-batch" || arg == "-tb")
1929+
{
1930+
if (++i >= argc)
1931+
{
1932+
invalid_param = true;
1933+
break;
1934+
}
1935+
params.n_threads_batch = std::stoi(argv[i]);
1936+
}
19271937
else if (arg == "-b" || arg == "--batch-size")
19281938
{
19291939
if (++i >= argc)

0 commit comments

Comments
 (0)