Skip to content

Commit c493a18

Browse files
committed
Use the least kv cache for sheddable requests when there is capacity
1 parent 08dbd5a commit c493a18

File tree

5 files changed

+17
-93
lines changed

5 files changed

+17
-93
lines changed

pkg/ext-proc/backend/vllm/metrics.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
8585
}
8686
*/
8787

88-
// TODO: Read from vLLM metrics once the metric is available.
88+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the metric is available.
8989
updated.MaxActiveModels = 4
9090

9191
// Update active loras

pkg/ext-proc/handlers/server.go

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
8585
if err != nil {
8686
klog.Errorf("failed to process request: %v", err)
8787
switch status.Code(err) {
88+
// This code can be returned by the scheduler when there is no capacity for sheddable
89+
// requests.
8890
case codes.ResourceExhausted:
8991
resp = &extProcPb.ProcessingResponse{
9092
Response: &extProcPb.ProcessingResponse_ImmediateResponse{

pkg/ext-proc/scheduling/filter.go

+2-32
Original file line numberDiff line numberDiff line change
@@ -149,32 +149,6 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac
149149
return filtered, nil
150150
}
151151

152-
// mostKVCacheFilterFunc is similar to leastKVCacheFilterFunc but prefers pods with higher KV cache.
153-
func mostKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
154-
min := math.MaxFloat64
155-
var max float64 = 0
156-
filtered := []*backend.PodMetrics{}
157-
158-
for _, pod := range pods {
159-
if pod.KVCacheUsagePercent <= min {
160-
min = pod.KVCacheUsagePercent
161-
}
162-
if pod.KVCacheUsagePercent >= max {
163-
max = pod.KVCacheUsagePercent
164-
}
165-
}
166-
167-
klog.V(3).Infof("mostKVCacheFilterFunc, max=%v, min=%v", max, min)
168-
for _, pod := range pods {
169-
klog.V(3).Infof("Evaluating pod %v", pod.KVCacheUsagePercent)
170-
if pod.KVCacheUsagePercent <= max && pod.KVCacheUsagePercent >= max-(max-min)/float64(len(pods)) {
171-
klog.V(3).Infof("Selected pod %v", pod.KVCacheUsagePercent)
172-
filtered = append(filtered, pod)
173-
}
174-
}
175-
return filtered, nil
176-
}
177-
178152
// podPredicate is a filter function to check whether a pod is desired.
179153
type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool
180154

@@ -189,12 +163,8 @@ func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
189163
return req.Critical
190164
}
191165

192-
func noQueueAndLessThanKVCacheThresholdPredicate(threshold float64) podPredicate {
166+
func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate {
193167
return func(req *LLMRequest, pod *backend.PodMetrics) bool {
194-
return pod.WaitingQueueSize <= 0 && pod.KVCacheUsagePercent <= threshold
168+
return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold
195169
}
196170
}
197-
198-
func allowAllPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
199-
return true
200-
}

pkg/ext-proc/scheduling/filter_test.go

+1-35
Original file line numberDiff line numberDiff line change
@@ -301,43 +301,9 @@ func TestFilterFunc(t *testing.T) {
301301
},
302302
},
303303
},
304-
{
305-
name: "most kv cache empty input",
306-
f: mostKVCacheFilterFunc,
307-
input: []*backend.PodMetrics{},
308-
output: []*backend.PodMetrics{},
309-
},
310-
{
311-
name: "most kv cache",
312-
f: mostKVCacheFilterFunc,
313-
input: []*backend.PodMetrics{
314-
{
315-
Metrics: backend.Metrics{
316-
KVCacheUsagePercent: 0,
317-
},
318-
},
319-
{
320-
Metrics: backend.Metrics{
321-
KVCacheUsagePercent: 0.3,
322-
},
323-
},
324-
{
325-
Metrics: backend.Metrics{
326-
KVCacheUsagePercent: 1.0,
327-
},
328-
},
329-
},
330-
output: []*backend.PodMetrics{
331-
{
332-
Metrics: backend.Metrics{
333-
KVCacheUsagePercent: 1.0,
334-
},
335-
},
336-
},
337-
},
338304
{
339305
name: "noQueueAndLessThanKVCacheThresholdPredicate",
340-
f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0.8)),
306+
f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)),
341307
input: []*backend.PodMetrics{
342308
{
343309
// This pod should be returned.

pkg/ext-proc/scheduling/scheduler.go

+11-25
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,23 @@ import (
1313
)
1414

1515
const (
16-
// TODO Consider making this configurable.
17-
kvCacheThreshold = 80
16+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
17+
kvCacheThreshold = 0.8
18+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
19+
queueThreshold = 5
1820
)
1921

2022
var (
21-
allowAllFilter = &filter{
22-
name: "noop",
23-
filter: toFilterFunc(allowAllPredicate),
24-
}
25-
2623
defaultFilter = &filter{
2724
name: "critical request",
2825
filter: toFilterFunc(criticalRequestPredicate),
29-
nextOnSuccess: criticalRequestFilter,
26+
nextOnSuccess: lowLatencyFilter,
3027
nextOnFailure: sheddableRequestFilter,
3128
}
3229

33-
// The goal for scheduling critical requests is to minimize the latency. The heuristic is to
34-
// pick a server with least "load" (KV Cache), which typically yields lower latency.
35-
// Heuristics for scheduling critical requests:
36-
criticalRequestFilter = &filter{
30+
// lowLatencyFilter tries to minimize the latency. The heuristic is to pick a server with lower
31+
// cost to load an adapter and has low KV cache, which typically yields lower latency.
32+
lowLatencyFilter = &filter{
3733
name: "least queuing",
3834
filter: leastQueuingFilterFunc,
3935
nextOnSuccessOrFailure: &filter{
@@ -46,23 +42,13 @@ var (
4642
},
4743
}
4844

49-
// The goal for scheduling sheddable requests is to optimize for throughput while reducing
50-
// queuing, and leave low load (KV cache) servers to serve critical requests.
5145
sheddableRequestFilter = &filter{
5246
// When there is at least one model server that's not queuing requests, and still has KV
5347
// cache below a certain threshold, we consider this model server has capacity to handle
5448
// a sheddable request without impacting critical requests.
55-
name: "has capacity for sheddable requests",
56-
filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(kvCacheThreshold)),
57-
nextOnSuccess: &filter{
58-
name: "most KV cache percent",
59-
filter: mostKVCacheFilterFunc,
60-
nextOnSuccessOrFailure: &filter{
61-
name: "low cost LoRA",
62-
filter: toFilterFunc(lowLoRACostPredicate),
63-
nextOnFailure: allowAllFilter,
64-
},
65-
},
49+
name: "has capacity for sheddable requests",
50+
filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold, kvCacheThreshold)),
51+
nextOnSuccess: lowLatencyFilter,
6652
// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
6753
// request to make room for critical requests.
6854
nextOnFailure: &filter{

0 commit comments

Comments
 (0)