@@ -753,12 +753,7 @@ struct server_context {
         metrics.init();
     }

-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
-        // TODO: currently, we tokenize using special tokens by default
-        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
-        //       but it's better compared to completely ignoring ChatML and other chat templates
-        const bool TMP_FORCE_SPECIAL = true;
-
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -771,10 +766,10 @@ struct server_context {

                     std::vector<llama_token> p;
                     if (first) {
-                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_special, parse_special);
                         first = false;
                     } else {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, false, parse_special);
                     }

                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -788,7 +783,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, parse_special);
         }

         return prompt_tokens;
@@ -1220,7 +1215,7 @@ struct server_context {
                     slot.params.n_predict, n_ctx_train);
         }

-        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());

        return slot.has_next_token; // continue
    }
@@ -1488,9 +1483,8 @@ struct server_context {
        if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
            data["index"] = 0;
            create_task(data, false, nullptr);
-        }
-        // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-        else if (prompt.is_array()) {
+        } else if (prompt.is_array()) {
+            // otherwise, it's a multiple-prompt task, we break it into smaller tasks
            std::vector<json> prompts = prompt;
            if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                // prompts[0] is the question
@@ -1515,9 +1509,8 @@ struct server_context {
                    }
                }
            }
-        }
-        // invalid case
-        else {
+        } else {
+            // invalid case
            throw std::runtime_error(error_msg);
        }

@@ -1988,31 +1981,23 @@ struct server_context {

                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
                        const bool add_bos = llama_add_bos_token(model);
-                        bool suff_rm_leading_spc = true;
-                        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                            params.input_suffix.erase(0, 1);
-                            suff_rm_leading_spc = false;
-                        }

-                        auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                        auto suffix_tokens = tokenize(slot.params.input_suffix, false);
+                        auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
+                        auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);

-                        const int space_token = 29871; // TODO: this should not be hardcoded
-                        if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                            suffix_tokens.erase(suffix_tokens.begin());
-                        }
-
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                        suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
+                        suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));

                        auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                        auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+
                        if (add_bos) {
                            embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                        }
+
                        embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-                        const llama_token middle_token = llama_token_middle(model);
+                        const llama_token middle_token = llama_token_fim_mid(model);
                        if (middle_token >= 0) {
                            embd_inp.push_back(middle_token);
                        }
@@ -2031,25 +2016,30 @@ struct server_context {
                        prompt_tokens.clear();
                        prompt_tokens.push_back(llama_token_bos(model));
                        {
-                            const auto part = tokenize(slot.prompt[0], false);
+                            const auto part = tokenize(slot.prompt[0], false, false);
                            prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                        }
                        prompt_tokens.push_back(llama_token_eos(model));
                        prompt_tokens.push_back(llama_token_sep(model));
                        {
-                            const auto part = tokenize(slot.prompt[1], false);
+                            const auto part = tokenize(slot.prompt[1], false, false);
                            prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                        }
                        prompt_tokens.push_back(llama_token_eos(model));
                    } else {
-                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
                    }

                    slot.n_past = 0;
                    slot.n_prompt_tokens = prompt_tokens.size();

                    SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

+                    // print prompt tokens:
+                    for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                        SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], llama_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                    }
+
                    // empty prompt passed -> release the slot and send empty response
                    if (prompt_tokens.empty()) {
                        SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
@@ -2942,7 +2932,23 @@ int main(int argc, char ** argv) {
        return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
    };

-    const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+        std::string err;
+        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "prefix token is missing. ";
+        }
+        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "suffix token is missing. ";
+        }
+        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "middle token is missing. ";
+        }
+
+        if (!err.empty()) {
+            res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
        json data = json::parse(req.body);
        return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
    };
@@ -3028,7 +3034,8 @@ int main(int argc, char ** argv) {
        if (body.count("content") != 0) {
            const bool add_special = json_value(body, "add_special", false);
            const bool with_pieces = json_value(body, "with_pieces", false);
-            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);

            if (with_pieces) {
                for (const auto & token : tokens) {
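For context on the API change: the new `tokenize(json_prompt, add_special, parse_special)` overload and the `llama_token_fim_pre/suf/mid` getters used above can be exercised outside the server too. Below is a minimal sketch of building an infill prompt the way the SERVER_TASK_CMPL_TYPE_INFILL branch now does; `build_infill_prompt` is a hypothetical helper name, and the snippet assumes the `common.h` `llama_tokenize` wrapper and the `llama.h` FIM-token accessors shown in the diff (it omits the `spm_infill` ordering for brevity).

// Minimal sketch (not part of the patch): mirrors the server's infill prompt
// construction after this change. build_infill_prompt is a hypothetical name.
#include <string>
#include <vector>

#include "common.h"   // assumed: llama_tokenize(ctx, text, add_special, parse_special)
#include "llama.h"    // assumed: llama_token_fim_pre/suf/mid, LLAMA_TOKEN_NULL

static std::vector<llama_token> build_infill_prompt(
        llama_context * ctx,
        const llama_model * model,
        const std::string & prefix,
        const std::string & suffix) {
    // infill requires the model to define FIM special tokens
    if (llama_token_fim_pre(model) == LLAMA_TOKEN_NULL ||
        llama_token_fim_suf(model) == LLAMA_TOKEN_NULL) {
        return {};
    }

    // tokenize raw text: add_special = false, parse_special = false,
    // matching the updated tokenize() calls in the infill branch
    std::vector<llama_token> prefix_tokens = llama_tokenize(ctx, prefix, false, false);
    std::vector<llama_token> suffix_tokens = llama_tokenize(ctx, suffix, false, false);

    prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
    suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));

    std::vector<llama_token> embd_inp = prefix_tokens;
    if (llama_add_bos_token(model)) {
        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
    }
    embd_inp.insert(embd_inp.end(), suffix_tokens.begin(), suffix_tokens.end());

    const llama_token middle_token = llama_token_fim_mid(model);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }

    return embd_inp;
}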