@@ -378,8 +378,8 @@ struct server_queue {
     std::condition_variable condition_tasks;
 
     // callback functions
-    std::function<void(server_task &)> callback_new_task;
-    std::function<void(void)>          callback_update_slots;
+    std::function<void(server_task)> callback_new_task;
+    std::function<void(void)>        callback_update_slots;
 
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
@@ -431,7 +431,7 @@ struct server_queue {
     }
 
     // Register function to process a new task
-    void on_new_task(std::function<void(server_task &)> callback) {
+    void on_new_task(std::function<void(server_task)> callback) {
         callback_new_task = std::move(callback);
     }
 
@@ -481,7 +481,7 @@ struct server_queue {
                 lock.unlock();
 
                 QUE_DBG("processing task, id = %d\n", task.id);
-                callback_new_task(task);
+                callback_new_task(std::move(task));
             }
 
             // all tasks in the current loop is processed, slots data is now ready
@@ -644,17 +644,12 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // reserve one extra sequence (seq_id == 0) for extra features
-        params.n_parallel += 1;
-
         common_init_result llama_init = common_init_from_params(params);
 
         model = llama_init.model;
         ctx   = llama_init.context;
         loras = llama_init.lora_adapters;
 
-        params.n_parallel -= 1; // but be sneaky about it
-
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
             return false;
@@ -1288,16 +1283,16 @@ struct server_context {
 
     void send_embedding(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
-        res.id      = slot.id_task;
-        res.error   = false;
-        res.stop    = true;
+        res.id    = slot.id_task;
+        res.error = false;
+        res.stop  = true;
 
         const int n_embd = llama_n_embd(model);
 
         std::vector<float> embd_res(n_embd, 0.0f);
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
@@ -1332,12 +1327,12 @@ struct server_context {
 
     void send_rerank(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
-        res.id      = slot.id_task;
-        res.error   = false;
-        res.stop    = true;
+        res.id    = slot.id_task;
+        res.error = false;
+        res.stop  = true;
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
@@ -1510,7 +1505,7 @@ struct server_context {
     // Functions to process the task
     //
 
-    void process_single_task(const server_task & task) {
+    void process_single_task(server_task task) {
         switch (task.type) {
             case SERVER_TASK_TYPE_INFERENCE:
                 {
@@ -1808,8 +1803,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
+                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1836,7 +1831,7 @@ struct server_context {
 
             slot.i_batch = batch.n_tokens;
 
-            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
             slot.n_past += 1;
 
@@ -1983,8 +1978,8 @@ struct server_context {
 
                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                            llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
-                            llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
+                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                            llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
 
                             for (size_t i = 0; i < n_match; i++) {
                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -2033,9 +2028,9 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
 
                         // there is no common part left
                         slot.n_past = 0;
@@ -2048,7 +2043,7 @@ struct server_context {
 
                     // add prompt tokens for processing in the current batch
                     while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
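Two threads of change run through this patch. First, the reserved extra sequence is gone: load_model no longer bumps params.n_parallel, so each slot addresses KV-cache sequence slot.id directly instead of slot.id + 1. Second, the queue now hands each task to its callback by value: the task is moved out of the queue, the lock is released, and the callback owns the task outright rather than borrowing a reference into a container the loop no longer holds locked. Below is a minimal sketch of that ownership pattern, with made-up stand-in types (task, queue, drain are illustrative, not the server's real API):

#include <deque>
#include <functional>
#include <iostream>
#include <mutex>
#include <utility>

// Illustrative stand-ins; the real server_queue/server_task carry more state.
struct task {
    int id;
};

struct queue {
    std::deque<task>          tasks;
    std::mutex                mtx;
    std::function<void(task)> on_task; // by value: the callback owns its task

    void drain() {
        while (true) {
            std::unique_lock<std::mutex> lock(mtx);
            if (tasks.empty()) {
                break;
            }
            task t = std::move(tasks.front());
            tasks.pop_front();
            lock.unlock(); // safe: t is a local copy, not a reference into tasks

            on_task(std::move(t)); // move ownership into the callback
        }
    }
};

int main() {
    queue q;
    q.on_task = [](task t) { std::cout << "task " << t.id << "\n"; };
    q.tasks.push_back({1});
    q.tasks.push_back({2});
    q.drain();
}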