@@ -889,6 +889,8 @@ struct server_context {
         slot.sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
         slot.sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
         slot.sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
+        slot.sparams.infill_p          = json_value(data, "infill_p",          default_sparams.infill_p);
+        slot.sparams.infill_p_eog      = json_value(data, "infill_p_eog",      default_sparams.infill_p_eog);
         slot.sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
         slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
         slot.sparams.penalty_last_n    = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
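
The two new fields are read from the request body with the same json_value fallback pattern as the existing sampling parameters, so a client includes them alongside the rest of the request. A minimal sketch of an /infill request body carrying them (the values shown are illustrative; the defaults and exact semantics of infill_p and infill_p_eog live in the sampling code, not in this hunk):

    {
        "input_prefix": "int main() {\n    int x = ",
        "input_suffix": ";\n    return x;\n}\n",
        "temperature":  0.7,
        "infill_p":     0.8,
        "infill_p_eog": 0.01
    }
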
@@ -1236,6 +1238,8 @@ struct server_context {
             {"min_p",             slot.sparams.min_p},
             {"tfs_z",             slot.sparams.tfs_z},
             {"typical_p",         slot.sparams.typ_p},
+            {"infill_p",          slot.sparams.infill_p},
+            {"infill_p_eog",      slot.sparams.infill_p_eog},
             {"repeat_last_n",     slot.sparams.penalty_last_n},
             {"repeat_penalty",    slot.sparams.penalty_repeat},
             {"presence_penalty",  slot.sparams.penalty_present},
@@ -1964,55 +1968,57 @@ struct server_context {
                     slot.t_start_process_prompt = ggml_time_us();
                     slot.t_start_generation = 0;
 
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
-                        const bool add_bos = llama_add_bos_token(model);
-
-                        auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
-                        auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
-
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
-                        suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
-
-                        auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                        auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
-
-                        if (add_bos) {
-                            embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                        }
-
-                        embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-
-                        const llama_token middle_token = llama_token_fim_mid(model);
-                        if (middle_token >= 0) {
-                            embd_inp.push_back(middle_token);
-                        }
-
-                        prompt_tokens = embd_inp;
-                    } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                        // require slot.prompt to be array of 2 strings
-                        if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
-                            SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
-                            slot.release();
-                            send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
-                            continue;
-                        }
-
-                        // prompt: [BOS]query[EOS][SEP]doc[EOS]
-                        prompt_tokens.clear();
-                        prompt_tokens.push_back(llama_token_bos(model));
-                        {
-                            const auto part = tokenize(slot.prompt[0], false, false);
-                            prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                        }
-                        prompt_tokens.push_back(llama_token_eos(model));
-                        prompt_tokens.push_back(llama_token_sep(model));
-                        {
-                            const auto part = tokenize(slot.prompt[1], false, false);
-                            prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                        }
-                        prompt_tokens.push_back(llama_token_eos(model));
-                    } else {
-                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                    switch (slot.cmpl_type) {
+                        case SERVER_TASK_CMPL_TYPE_NORMAL:
+                        case SERVER_TASK_CMPL_TYPE_EMBEDDING:
+                            {
+                                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                            } break;
+                        case SERVER_TASK_CMPL_TYPE_RERANK:
+                            {
+                                // require slot.prompt to be array of 2 strings
+                                if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
+                                    SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
+                                    slot.release();
+                                    send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
+                                    continue;
+                                }
+
+                                // prompt: [BOS]query[EOS][SEP]doc[EOS]
+                                prompt_tokens.clear();
+                                prompt_tokens.push_back(llama_token_bos(model));
+                                {
+                                    const auto part = tokenize(slot.prompt[0], false, false);
+                                    prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                                }
+                                prompt_tokens.push_back(llama_token_eos(model));
+                                prompt_tokens.push_back(llama_token_sep(model));
+                                {
+                                    const auto part = tokenize(slot.prompt[1], false, false);
+                                    prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
+                                }
+                                prompt_tokens.push_back(llama_token_eos(model));
+                            } break;
+                        case SERVER_TASK_CMPL_TYPE_INFILL:
+                            {
+                                auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
+                                auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
+
+                                prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
+                                suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
+
+                                auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                                auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+
+                                if (llama_add_bos_token(model)) {
+                                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                                }
+
+                                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                                embd_inp.push_back(llama_token_fim_mid(model));
+
+                                prompt_tokens = std::move(embd_inp);
+                            } break;
                     }
 
                     slot.n_past = 0;
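
For reference, a sketch of the prompt layouts the three branches build (token names are illustrative placeholders for the model's actual special tokens; BOS is only prepended when llama_add_bos_token reports it, and the new INFILL branch always appends the FIM middle token):

    // normal / embedding:            [BOS?] <tokenized slot.prompt>
    // infill, default order:         [BOS?] <FIM_PRE> prefix <FIM_SUF> suffix <FIM_MID>
    // infill, params.spm_infill set: [BOS?] <FIM_SUF> suffix <FIM_PRE> prefix <FIM_MID>
    // rerank:                        [BOS] query [EOS] [SEP] doc [EOS]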