@@ -2571,5 +2571,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2571
2571
}
2572
2572
).set_examples ({LLAMA_EXAMPLE_SERVER}));
2573
2573
2574
+ add_opt (common_arg (
2575
+ {" --fim-qwen-7b-spec" },
2576
+ string_format (" use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)" ),
2577
+ [](common_params & params) {
2578
+ params.hf_repo = " ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" ;
2579
+ params.hf_file = " qwen2.5-coder-7b-q8_0.gguf" ;
2580
+ params.speculative .hf_repo = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ;
2581
+ params.speculative .hf_file = " qwen2.5-coder-0.5b-q8_0.gguf" ;
2582
+ params.speculative .n_gpu_layers = 99 ;
2583
+ params.port = 8012 ;
2584
+ params.n_gpu_layers = 99 ;
2585
+ params.flash_attn = true ;
2586
+ params.n_ubatch = 1024 ;
2587
+ params.n_batch = 1024 ;
2588
+ params.n_ctx = 0 ;
2589
+ params.n_cache_reuse = 256 ;
2590
+ }
2591
+ ).set_examples ({LLAMA_EXAMPLE_SERVER}));
2592
+
2593
+ add_opt (common_arg (
2594
+ {" --fim-qwen-14b-spec" },
2595
+ string_format (" use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)" ),
2596
+ [](common_params & params) {
2597
+ params.hf_repo = " ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF" ;
2598
+ params.hf_file = " qwen2.5-coder-14b-q8_0.gguf" ;
2599
+ params.speculative .hf_repo = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ;
2600
+ params.speculative .hf_file = " qwen2.5-coder-0.5b-q8_0.gguf" ;
2601
+ params.speculative .n_gpu_layers = 99 ;
2602
+ params.port = 8012 ;
2603
+ params.n_gpu_layers = 99 ;
2604
+ params.flash_attn = true ;
2605
+ params.n_ubatch = 1024 ;
2606
+ params.n_batch = 1024 ;
2607
+ params.n_ctx = 0 ;
2608
+ params.n_cache_reuse = 256 ;
2609
+ }
2610
+ ).set_examples ({LLAMA_EXAMPLE_SERVER}));
2611
+
2574
2612
return ctx_arg;
2575
2613
}
0 commit comments