Skip to content

Commit 52c76d5

Browse files
committed
server : add defrag thold parameter
1 parent 5d25f74 commit 52c76d5

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

examples/server/server.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2133,6 +2133,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
21332133
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
21342134
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
21352135
printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
2136+
printf(" -dt N, --defrag-thold N\n");
2137+
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
21362138
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
21372139
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
21382140
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
@@ -2354,6 +2356,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
23542356
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
23552357
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
23562358
else { invalid_param = true; break; }
2359+
} else if (arg == "--defrag-thold" || arg == "-dt") {
2360+
if (++i >= argc) {
2361+
invalid_param = true;
2362+
break;
2363+
}
2364+
params.defrag_thold = std::stof(argv[i]);
23572365
} else if (arg == "--threads" || arg == "-t") {
23582366
if (++i >= argc)
23592367
{

0 commit comments

Comments
 (0)