@@ -647,6 +647,9 @@ struct server_context {
647
647
648
648
server_metrics metrics;
649
649
650
+ // Minimum prompt similarity (common-prefix length / slot prompt length) a slot must exceed to be reused; 0.0 disables similarity-based slot selection
651
+ float slot_prompt_similarity = 0 .0f ;
652
+
650
653
~server_context () {
651
654
if (ctx) {
652
655
llama_free (ctx);
@@ -795,24 +798,88 @@ struct server_context {
795
798
return prompt_tokens;
796
799
}
797
800
798
- server_slot * get_slot (int id) {
799
- int64_t t_last = ggml_time_us ();
800
-
801
- server_slot * last_used = nullptr ;
802
-
801
+ server_slot * get_slot_by_id (int id) {
803
802
for (server_slot & slot : slots) {
804
- if (slot.id == id && slot. available () ) {
803
+ if (slot.id == id) {
805
804
return &slot;
806
805
}
806
+ }
807
+
808
+ return nullptr ;
809
+ }
810
+
811
+ server_slot * get_available_slot (const std::string & prompt) {
812
+ server_slot * ret = nullptr ;
813
+
814
+ // find the slot that has at least n% prompt similarity
815
+ if (ret == nullptr && slot_prompt_similarity != 0 .0f && !prompt.empty ()) {
816
+ int max_lcp_len = 0 ;
817
+ float similarity = 0 ;
818
+
819
+ for (server_slot & slot : slots) {
820
+ // skip the slot if it is not available
821
+ if (!slot.available ()) {
822
+ continue ;
823
+ }
824
+
825
+ // skip the slot if it does not contains prompt
826
+ if (!slot.prompt .is_string ()) {
827
+ continue ;
828
+ }
829
+
830
+ // current slot's prompt
831
+ std::string slot_prompt = slot.prompt .get <std::string>();
832
+
833
+ // length of the current slot's prompt
834
+ int slot_prompt_len = slot_prompt.size ();
835
+
836
+ // length of the Longest Common Prefix between the current slot's prompt and the input prompt
837
+ int lcp_len = common_part (slot_prompt, prompt);
838
+
839
+ // fraction of the common substring length compared to the current slot's prompt length
840
+ similarity = static_cast <float >(lcp_len) / slot_prompt_len;
841
+
842
+ // select the current slot if the criteria match
843
+ if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
844
+ max_lcp_len = lcp_len;
845
+ ret = &slot;
846
+ }
847
+ }
807
848
808
- // among all available slots, find the one that has been least recently used
809
- if (slot.available () && slot.t_last_used < t_last) {
810
- last_used = &slot;
811
- t_last = slot.t_last_used ;
849
+ if (ret != nullptr ) {
850
+ LOG_VERBOSE (" selected slot by lcp similarity" , {
851
+ {" id_slot" , ret->id },
852
+ {" max_lcp_len" , max_lcp_len},
853
+ {" similarity" , similarity},
854
+ });
812
855
}
813
856
}
814
857
815
- return last_used;
858
+ // find the slot that has been least recently used
859
+ if (ret == nullptr ) {
860
+ int64_t t_last = ggml_time_us ();
861
+ for (server_slot & slot : slots) {
862
+ // skip the slot if it is not available
863
+ if (!slot.available ()) {
864
+ continue ;
865
+ }
866
+
867
+ // select the current slot if the criteria match
868
+ if (slot.t_last_used < t_last) {
869
+ t_last = slot.t_last_used ;
870
+ ret = &slot;
871
+ }
872
+ }
873
+
874
+ if (ret != nullptr ) {
875
+ LOG_VERBOSE (" selected slot by lru" , {
876
+ {" id_slot" , ret->id },
877
+ {" t_last" , t_last},
878
+ });
879
+ }
880
+ }
881
+
882
+ return ret;
816
883
}
817
884
818
885
bool launch_slot_with_task (server_slot & slot, const server_task & task) {
@@ -1515,13 +1582,29 @@ struct server_context {
1515
1582
switch (task.type ) {
1516
1583
case SERVER_TASK_TYPE_COMPLETION:
1517
1584
{
1518
- server_slot * slot = get_slot (json_value (task.data , " id_slot" , -1 ));
1585
+ int id_slot = json_value (task.data , " id_slot" , -1 );
1586
+ std::string prompt = json_value (task.data , " prompt" , std::string ());
1587
+
1588
+ server_slot * slot;
1589
+
1590
+ if (id_slot != -1 ) {
1591
+ slot = get_slot_by_id (id_slot);
1592
+ } else {
1593
+ slot = get_available_slot (prompt);
1594
+ }
1595
+
1519
1596
if (slot == nullptr ) {
1520
1597
// if no slot is available, we defer this task for processing later
1521
1598
LOG_VERBOSE (" no slot is available" , {{" id_task" , task.id }});
1522
1599
queue_tasks.defer (task);
1523
1600
break ;
1524
1601
}
1602
+ if (!slot->available ()) {
1603
+ // if requested slot is unavailable, we defer this task for processing later
1604
+ LOG_VERBOSE (" requested slot is unavailable" , {{" id_task" , task.id }});
1605
+ queue_tasks.defer (task);
1606
+ break ;
1607
+ }
1525
1608
1526
1609
if (task.data .contains (" system_prompt" )) {
1527
1610
std::string sys_prompt = json_value (task.data , " system_prompt" , std::string ());
@@ -1638,11 +1721,17 @@ struct server_context {
1638
1721
case SERVER_TASK_TYPE_SLOT_SAVE:
1639
1722
{
1640
1723
int id_slot = task.data .at (" id_slot" );
1641
- server_slot * slot = get_slot (id_slot);
1724
+ server_slot * slot = get_slot_by_id (id_slot);
1642
1725
if (slot == nullptr ) {
1643
1726
send_error (task, " Invalid slot ID" , ERROR_TYPE_INVALID_REQUEST);
1644
1727
break ;
1645
1728
}
1729
+ if (!slot->available ()) {
1730
+ // if requested slot is unavailable, we defer this task for processing later
1731
+ LOG_VERBOSE (" requested slot is unavailable" , {{" id_task" , task.id }});
1732
+ queue_tasks.defer (task);
1733
+ break ;
1734
+ }
1646
1735
1647
1736
const size_t token_count = slot->cache_tokens .size ();
1648
1737
const int64_t t_start = ggml_time_us ();
@@ -1673,11 +1762,17 @@ struct server_context {
1673
1762
case SERVER_TASK_TYPE_SLOT_RESTORE:
1674
1763
{
1675
1764
int id_slot = task.data .at (" id_slot" );
1676
- server_slot * slot = get_slot (id_slot);
1765
+ server_slot * slot = get_slot_by_id (id_slot);
1677
1766
if (slot == nullptr ) {
1678
1767
send_error (task, " Invalid slot ID" , ERROR_TYPE_INVALID_REQUEST);
1679
1768
break ;
1680
1769
}
1770
+ if (!slot->available ()) {
1771
+ // if requested slot is unavailable, we defer this task for processing later
1772
+ LOG_VERBOSE (" requested slot is unavailable" , {{" id_task" , task.id }});
1773
+ queue_tasks.defer (task);
1774
+ break ;
1775
+ }
1681
1776
1682
1777
const int64_t t_start = ggml_time_us ();
1683
1778
@@ -1715,11 +1810,17 @@ struct server_context {
1715
1810
case SERVER_TASK_TYPE_SLOT_ERASE:
1716
1811
{
1717
1812
int id_slot = task.data .at (" id_slot" );
1718
- server_slot * slot = get_slot (id_slot);
1813
+ server_slot * slot = get_slot_by_id (id_slot);
1719
1814
if (slot == nullptr ) {
1720
1815
send_error (task, " Invalid slot ID" , ERROR_TYPE_INVALID_REQUEST);
1721
1816
break ;
1722
1817
}
1818
+ if (!slot->available ()) {
1819
+ // if requested slot is unavailable, we defer this task for processing later
1820
+ LOG_VERBOSE (" requested slot is unavailable" , {{" id_task" , task.id }});
1821
+ queue_tasks.defer (task);
1822
+ break ;
1823
+ }
1723
1824
1724
1825
// Erase token cache
1725
1826
const size_t n_erased = slot->cache_tokens .size ();
@@ -2467,6 +2568,9 @@ int main(int argc, char ** argv) {
2467
2568
log_data[" api_key" ] = " api_key: " + std::to_string (params.api_keys .size ()) + " keys loaded" ;
2468
2569
}
2469
2570
2571
+ // Pass the configured minimum prompt similarity for slot selection on to the server context
2572
+ ctx_server.slot_prompt_similarity = params.slot_prompt_similarity ;
2573
+
2470
2574
// load the model
2471
2575
if (!ctx_server.load_model (params)) {
2472
2576
state.store (SERVER_STATE_ERROR);
0 commit comments