@@ -13,27 +13,23 @@ import (
 )
 
 const (
-	// TODO Consider making this configurable.
-	kvCacheThreshold = 80
+	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
+	kvCacheThreshold = 0.8
+	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
+	queueThreshold = 5
 )
 
 var (
-	allowAllFilter = &filter{
-		name:   "noop",
-		filter: toFilterFunc(allowAllPredicate),
-	}
-
 	defaultFilter = &filter{
 		name:          "critical request",
 		filter:        toFilterFunc(criticalRequestPredicate),
-		nextOnSuccess: criticalRequestFilter,
+		nextOnSuccess: lowLatencyFilter,
 		nextOnFailure: sheddableRequestFilter,
 	}
 
-	// The goal for scheduling critical requests is to minimize the latency. The heuristic is to
-	// pick a server with least "load" (KV Cache), which typically yields lower latency.
-	// Heuristics for scheduling critical requests:
-	criticalRequestFilter = &filter{
+	// lowLatencyFilter tries to minimize latency. The heuristic is to pick a server with a lower
+	// cost to load an adapter and a low KV cache, which typically yields lower latency.
+	lowLatencyFilter = &filter{
 		name:   "least queuing",
 		filter: leastQueuingFilterFunc,
 		nextOnSuccessOrFailure: &filter{
@@ -46,23 +42,13 @@ var (
 		},
 	}
 
-	// The goal for scheduling sheddable requests is to optimize for throughput while reducing
-	// queuing, and leave low load (KV cache) servers to serve critical requests.
 	sheddableRequestFilter = &filter{
 		// When there is at least one model server that's not queuing requests, and still has KV
 		// cache below a certain threshold, we consider this model server has capacity to handle
 		// a sheddable request without impacting critical requests.
-		name:   "has capacity for sheddable requests",
-		filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(kvCacheThreshold)),
-		nextOnSuccess: &filter{
-			name:   "most KV cache percent",
-			filter: mostKVCacheFilterFunc,
-			nextOnSuccessOrFailure: &filter{
-				name:   "low cost LoRA",
-				filter: toFilterFunc(lowLoRACostPredicate),
-				nextOnFailure: allowAllFilter,
-			},
-		},
+		name:          "has capacity for sheddable requests",
+		filter:        toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold, kvCacheThreshold)),
+		nextOnSuccess: lowLatencyFilter,
 		// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
 		// request to make room for critical requests.
 		nextOnFailure: &filter{
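
For reviewers, a minimal, self-contained sketch of the capacity check that noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold, kvCacheThreshold) represents after this change. The podMetrics type, its fields, and the <= comparisons are illustrative assumptions for this sketch, not the repository's actual definitions.

package main

import "fmt"

// podMetrics is an illustrative stand-in for the per-pod metrics the scheduler
// filters on; the repository defines its own metrics types.
type podMetrics struct {
	waitingQueueSize    int
	kvCacheUsagePercent float64
}

// hasCapacityForSheddable mirrors the intent of
// noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold, kvCacheThreshold):
// a pod can absorb a sheddable request only when its queue is short and its
// KV cache utilization is below the threshold (0.8 here means 80%).
func hasCapacityForSheddable(p podMetrics, queueThreshold int, kvCacheThreshold float64) bool {
	return p.waitingQueueSize <= queueThreshold && p.kvCacheUsagePercent <= kvCacheThreshold
}

func main() {
	pods := []podMetrics{
		{waitingQueueSize: 0, kvCacheUsagePercent: 0.30},  // has capacity
		{waitingQueueSize: 12, kvCacheUsagePercent: 0.30}, // too much queuing
		{waitingQueueSize: 2, kvCacheUsagePercent: 0.95},  // KV cache too full
	}
	for i, p := range pods {
		fmt.Printf("pod %d: capacity for sheddable = %v\n", i, hasCapacityForSheddable(p, 5, 0.8))
	}
}

If no pod passes this check, the sheddable request is dropped (the nextOnFailure branch), while requests that do find capacity fall through to the same lowLatencyFilter used for critical requests.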