Skip to content

Commit e466ea2

Browse files
liu-cong authored and kfswain committed
Add priority based scheduling (kubernetes-sigs#25)
* Add priority based scheduling
* Use the least kv cache for sheddable requests when there is capacity
1 parent e4d65e2 commit e466ea2

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

pkg/ext-proc/backend/vllm/metrics.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -85,18 +85,21 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
8585
}
8686
*/
8787

88+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once it is available.
89+
updated.MaxActiveModels = 4
90+
8891
// Update active loras
8992
mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
9093
if ok {
9194
// IMPORTANT: replace the map entries instead of appending to it.
92-
updated.CachedModels = make(map[string]int)
95+
updated.ActiveModels = make(map[string]int)
9396
for _, metric := range mf.GetMetric() {
9497
for _, label := range metric.GetLabel() {
9598
if label.GetName() == "active_adapters" {
9699
if label.GetValue() != "" {
97100
adapterList := strings.Split(label.GetValue(), ",")
98101
for _, adapter := range adapterList {
99-
updated.CachedModels[adapter] = 0
102+
updated.ActiveModels[adapter] = 0
100103
}
101104
}
102105
}

0 commit comments

Comments
 (0)