@@ -1406,7 +1406,7 @@ struct llama_server_context
1406
1406
task.multitask_id = multitask_id;
1407
1407
1408
1408
// when a completion task's prompt array is not a singleton, we split it into multiple requests
1409
- if (task.data .at (" prompt" ).size () > 1 )
1409
+ if (task.data .count ( " prompt " ) && task. data . at (" prompt" ).size () > 1 )
1410
1410
{
1411
1411
lock.unlock (); // entering new func scope
1412
1412
return split_multiprompt_task (task);
@@ -1577,9 +1577,9 @@ struct llama_server_context
1577
1577
1578
1578
slot->reset ();
1579
1579
1580
- slot->infill = task.infill_mode ;
1581
- slot->embedding = task.embedding_mode ;
1582
- slot->task_id = task.id ;
1580
+ slot->infill = task.infill_mode ;
1581
+ slot->embedding = task.embedding_mode ;
1582
+ slot->task_id = task.id ;
1583
1583
slot->multitask_id = task.multitask_id ;
1584
1584
1585
1585
if (!launch_slot_with_data (slot, task.data ))
@@ -1731,7 +1731,8 @@ struct llama_server_context
1731
1731
const bool has_prompt = slot.prompt .is_array () || (slot.prompt .is_string () && !slot.prompt .get <std::string>().empty ()) || !slot.images .empty ();
1732
1732
1733
1733
// empty prompt passed -> release the slot and send empty response
1734
- if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
1734
+ // note: infill mode allows empty prompt
1735
+ if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill )
1735
1736
{
1736
1737
slot.release ();
1737
1738
slot.print_timings ();
@@ -2609,8 +2610,8 @@ static json format_final_response_oaicompat(const json &request, const task_resu
2609
2610
{" object" , streaming ? " chat.completion.chunk" : " chat.completion" },
2610
2611
{" usage" ,
2611
2612
json{{" completion_tokens" , num_tokens_predicted},
2612
- {" prompt_tokens" , num_prompt_tokens},
2613
- {" total_tokens" , num_tokens_predicted + num_prompt_tokens}}},
2613
+ {" prompt_tokens" , num_prompt_tokens},
2614
+ {" total_tokens" , num_tokens_predicted + num_prompt_tokens}}},
2614
2615
{" id" , gen_chatcmplid ()}};
2615
2616
2616
2617
if (server_verbose) {
0 commit comments