Skip to content

Commit 1e2f78a

Browse files
authored
server : add speculative decoding presets for FIM (#12287)
1 parent 0fd7ca7 commit 1e2f78a

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

common/arg.cpp

+38
Original file line number | Diff line number | Diff line change
@@ -2571,5 +2571,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25712571
}
25722572
).set_examples({LLAMA_EXAMPLE_SERVER}));
25732573

2574+
// Preset option `--fim-qwen-7b-spec`: registers a flag whose handler fills in
// a fixed configuration on `params`:
//   - main model repo/file:  ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF / qwen2.5-coder-7b-q8_0.gguf
//   - draft model repo/file: ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF / qwen2.5-coder-0.5b-q8_0.gguf
//   - port 8012, flash_attn on, n_batch = n_ubatch = 1024, n_ctx = 0, n_cache_reuse = 256
//   - n_gpu_layers = 99 for both the main and the speculative (draft) settings
// The handler takes no value argument; it only assigns fields. As the help text
// notes, using this preset can download weights from the internet.
// The option is tagged for LLAMA_EXAMPLE_SERVER via set_examples().
add_opt(common_arg(
2575+
{"--fim-qwen-7b-spec"},
2576+
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
2577+
[](common_params & params) {
2578+
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
2579+
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
2580+
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
2581+
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
2582+
params.speculative.n_gpu_layers = 99; // NOTE(review): presumably "large enough to offload every layer" - confirm
2583+
params.port = 8012;
2584+
params.n_gpu_layers = 99; // same value as the draft-model setting above
2585+
params.flash_attn = true;
2586+
params.n_ubatch = 1024;
2587+
params.n_batch = 1024;
2588+
params.n_ctx = 0; // NOTE(review): presumably 0 means "take the context size from the model" - confirm
2589+
params.n_cache_reuse = 256;
2590+
}
2591+
).set_examples({LLAMA_EXAMPLE_SERVER}));
2592+
2593+
// Preset option `--fim-qwen-14b-spec`: registers a flag whose handler fills in
// a fixed configuration on `params`:
//   - main model repo/file:  ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF / qwen2.5-coder-14b-q8_0.gguf
//   - draft model repo/file: ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF / qwen2.5-coder-0.5b-q8_0.gguf
//   - port 8012, flash_attn on, n_batch = n_ubatch = 1024, n_ctx = 0, n_cache_reuse = 256
//   - n_gpu_layers = 99 for both the main and the speculative (draft) settings
// The handler takes no value argument; it only assigns fields. As the help text
// notes, using this preset can download weights from the internet.
// The option is tagged for LLAMA_EXAMPLE_SERVER via set_examples().
add_opt(common_arg(
2594+
{"--fim-qwen-14b-spec"},
2595+
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
2596+
[](common_params & params) {
2597+
params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
2598+
params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
2599+
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
2600+
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
2601+
params.speculative.n_gpu_layers = 99; // NOTE(review): presumably "large enough to offload every layer" - confirm
2602+
params.port = 8012;
2603+
params.n_gpu_layers = 99; // same value as the draft-model setting above
2604+
params.flash_attn = true;
2605+
params.n_ubatch = 1024;
2606+
params.n_batch = 1024;
2607+
params.n_ctx = 0; // NOTE(review): presumably 0 means "take the context size from the model" - confirm
2608+
params.n_cache_reuse = 256;
2609+
}
2610+
).set_examples({LLAMA_EXAMPLE_SERVER}));
2611+
25742612
return ctx_arg;
25752613
}

0 commit comments

Comments
 (0)