Skip to content

Commit 3f5c7bb

Browse files
committed
Get running adapters from latest series in new metric, add table driven test function, delete old metrics
Signed-off-by: Kunjan Patel <[email protected]> Signed-off-by: Kunjan <[email protected]>
1 parent 07c9b9c commit 3f5c7bb

File tree

4 files changed

+127
-57
lines changed

4 files changed

+127
-57
lines changed

examples/poc/manifests/vllm/vllm-lora-deployment.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ spec:
3030
spec:
3131
containers:
3232
- name: lora
33-
image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:ac04a97a9fbc122bb14ff4eb590314d453cdf57c"
33+
image: "vllm/vllm-openai:latest"
3434
imagePullPolicy: Always
3535
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
3636
args:

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ require (
1313
github.com/onsi/gomega v1.36.0
1414
github.com/prometheus/client_model v0.6.1
1515
github.com/prometheus/common v0.61.0
16+
github.com/stretchr/testify v1.10.0
1617
go.uber.org/multierr v1.11.0
1718
google.golang.org/grpc v1.68.0
1819
google.golang.org/protobuf v1.35.2
@@ -70,6 +71,7 @@ require (
7071
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
7172
github.com/pkg/errors v0.9.1 // indirect
7273
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
74+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
7375
github.com/prometheus/client_golang v1.20.4 // indirect
7476
github.com/prometheus/procfs v0.15.1 // indirect
7577
github.com/shopspring/decimal v1.2.0 // indirect

pkg/ext-proc/backend/vllm/metrics.go

+25-47
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,21 @@ import (
55
"context"
66
"fmt"
77
"net/http"
8+
"strconv"
89
"strings"
910
"time"
1011

11-
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
12-
1312
dto "github.com/prometheus/client_model/go"
1413
"github.com/prometheus/common/expfmt"
1514
"go.uber.org/multierr"
15+
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
1616
klog "k8s.io/klog/v2"
1717
)
1818

1919
const (
20-
ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
21-
LoraRequestInfoMetricName = "vllm:lora_requests_info"
22-
LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
20+
LoraRequestInfoMetricName = "vllm:lora_requests_info"
21+
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
22+
LoraRequestInfoMaxAdaptersMetricName = "max_lora"
2323
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
2424
RunningQueueSizeMetricName = "vllm:num_requests_running"
2525
WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -85,8 +85,8 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
8585
updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
8686
}
8787

88-
loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName])
89-
multierr.Append(errs, err)
88+
loraMetrics, _, err := getLatestLoraMetric(metricFamilies)
89+
errs = multierr.Append(errs, err)
9090
/* TODO: uncomment once this is available in vllm.
9191
kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
9292
errs = multierr.Append(errs, err)
@@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
9595
}
9696
*/
9797

98-
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available.
99-
updated.MaxActiveModels = 4
100-
101-
// Update active loras
102-
mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
103-
if ok {
104-
// IMPORTANT: replace the map entries instead of appending to it.
105-
updated.ActiveModels = make(map[string]int)
106-
for _, metric := range mf.GetMetric() {
107-
for _, label := range metric.GetLabel() {
108-
if label.GetName() == "active_adapters" {
109-
if label.GetValue() != "" {
110-
adapterList := strings.Split(label.GetValue(), ",")
111-
for _, adapter := range adapterList {
112-
updated.ActiveModels[adapter] = 0
113-
}
114-
}
115-
}
116-
}
117-
}
118-
} else {
119-
klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
120-
multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
121-
}
122-
12398
if loraMetrics != nil {
124-
updated.Metrics.ActiveModels = make(map[string]int)
99+
updated.ActiveModels = make(map[string]int)
125100
for _, label := range loraMetrics.GetLabel() {
126-
if label.GetName() == "running_lora_adapters" {
101+
if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
127102
if label.GetValue() != "" {
128103
adapterList := strings.Split(label.GetValue(), ",")
129104
for _, adapter := range adapterList {
130-
updated.Metrics.ActiveModels[adapter] = 0
105+
updated.ActiveModels[adapter] = 0
131106
}
132107
}
133108
}
134-
}
135-
136-
}
137-
138-
if loraMetrics != nil {
139-
updated.CachedModels = make(map[string]int)
140-
for _, label := range loraMetrics.GetLabel() {
141-
if label.GetName() == "running_lora_adapters" {
109+
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
142110
if label.GetValue() != "" {
143-
adapterList := strings.Split(label.GetValue(), ",")
144-
for _, adapter := range adapterList {
145-
updated.CachedModels[adapter] = 0
111+
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
112+
if err != nil {
113+
errs = multierr.Append(errs, err)
146114
}
147115
}
148116
}
@@ -153,7 +121,16 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
153121
return updated, errs
154122
}
155123

156-
func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
124+
// getLatestLoraMetric gets the latest LoRA metric series in the gauge metric family `vllm:lora_requests_info`.
125+
// The reason it is fetched specially is that each label key-value pair permutation generates a new series
126+
// and only the most recent one is useful. The value of each series is its creation timestamp, so we can
127+
// retrieve the latest by comparing the values.
128+
func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
129+
loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
130+
if !ok {
131+
klog.Warningf("metric family %q not found", LoraRequestInfoMetricName)
132+
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
133+
}
157134
var latestTs float64
158135
var latest *dto.Metric
159136
for _, m := range loraRequests.GetMetric() {
@@ -166,6 +143,7 @@ func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time
166143
}
167144

168145
// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
146+
// Since vLLM doesn't set the timestamp on its metrics, this function essentially gets the first metric.
169147
func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
170148
mf, ok := metricFamilies[metricName]
171149
if !ok {

pkg/ext-proc/backend/vllm/metrics_test.go

+99-9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package vllm
22

33
import (
4+
"fmt"
45
"testing"
56

67
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
@@ -69,32 +70,116 @@ func TestPromToPodMetrics(t *testing.T) {
6970
},
7071
},
7172
},
72-
ActiveLoRAAdaptersMetricName: {
73+
LoraRequestInfoMetricName: {
7374
Metric: []*dto.Metric{
7475
{
7576
Label: []*dto.LabelPair{
7677
{
77-
Name: proto.String("active_adapters"),
78-
Value: proto.String("lora1,lora2"),
78+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
79+
Value: proto.String("lora3,lora4"),
80+
},
81+
{
82+
Name: proto.String(LoraRequestInfoMaxAdaptersMetricName),
83+
Value: proto.String("2"),
7984
},
8085
},
8186
Gauge: &dto.Gauge{
8287
Value: proto.Float64(100),
8388
},
8489
},
90+
{
91+
Label: []*dto.LabelPair{
92+
{
93+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
94+
Value: proto.String("lora2"),
95+
},
96+
{
97+
Name: proto.String(LoraRequestInfoMaxAdaptersMetricName),
98+
Value: proto.String("2"),
99+
},
100+
},
101+
Gauge: &dto.Gauge{
102+
Value: proto.Float64(90),
103+
},
104+
},
105+
},
106+
},
107+
},
108+
expectedMetrics: &backend.Metrics{
109+
RunningQueueSize: 15,
110+
WaitingQueueSize: 25,
111+
KVCacheUsagePercent: 0.9,
112+
ActiveModels: map[string]int{
113+
"lora3": 0,
114+
"lora4": 0,
115+
},
116+
MaxActiveModels: 2,
117+
},
118+
initialPodMetrics: &backend.PodMetrics{},
119+
expectedErr: nil,
120+
},
121+
{
122+
name: "invalid max lora",
123+
metricFamilies: map[string]*dto.MetricFamily{
124+
RunningQueueSizeMetricName: {
125+
Metric: []*dto.Metric{
126+
{
127+
Gauge: &dto.Gauge{
128+
Value: proto.Float64(10),
129+
},
130+
TimestampMs: proto.Int64(100),
131+
},
132+
{
133+
Gauge: &dto.Gauge{
134+
Value: proto.Float64(15),
135+
},
136+
TimestampMs: proto.Int64(200), // This is the latest
137+
},
138+
},
139+
},
140+
WaitingQueueSizeMetricName: {
141+
Metric: []*dto.Metric{
142+
{
143+
Gauge: &dto.Gauge{
144+
Value: proto.Float64(20),
145+
},
146+
TimestampMs: proto.Int64(100),
147+
},
148+
{
149+
Gauge: &dto.Gauge{
150+
Value: proto.Float64(25),
151+
},
152+
TimestampMs: proto.Int64(200), // This is the latest
153+
},
154+
},
155+
},
156+
KVCacheUsagePercentMetricName: {
157+
Metric: []*dto.Metric{
158+
{
159+
Gauge: &dto.Gauge{
160+
Value: proto.Float64(0.8),
161+
},
162+
TimestampMs: proto.Int64(100),
163+
},
164+
{
165+
Gauge: &dto.Gauge{
166+
Value: proto.Float64(0.9),
167+
},
168+
TimestampMs: proto.Int64(200), // This is the latest
169+
},
85170
},
86171
},
87172
LoraRequestInfoMetricName: {
88173
Metric: []*dto.Metric{
89174
{
90175
Label: []*dto.LabelPair{
91176
{
92-
Name: proto.String("running_lora_adapters"),
177+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
93178
Value: proto.String("lora3,lora4"),
94179
},
95180
{
96-
Name: proto.String("waiting_lora_adapters"),
97-
Value: proto.String("lora1,lora4"),
181+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
182+
Value: proto.String("2a"),
98183
},
99184
},
100185
Gauge: &dto.Gauge{
@@ -104,9 +189,13 @@ func TestPromToPodMetrics(t *testing.T) {
104189
{
105190
Label: []*dto.LabelPair{
106191
{
107-
Name: proto.String("running_lora_adapters"),
192+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
108193
Value: proto.String("lora2"),
109194
},
195+
{
196+
Name: proto.String(LoraRequestInfoMaxAdaptersMetricName),
197+
Value: proto.String("2"),
198+
},
110199
},
111200
Gauge: &dto.Gauge{
112201
Value: proto.Float64(90),
@@ -119,13 +208,14 @@ func TestPromToPodMetrics(t *testing.T) {
119208
RunningQueueSize: 15,
120209
WaitingQueueSize: 25,
121210
KVCacheUsagePercent: 0.9,
122-
CachedModels: map[string]int{
211+
ActiveModels: map[string]int{
123212
"lora3": 0,
124213
"lora4": 0,
125214
},
215+
MaxActiveModels: 0,
126216
},
127217
initialPodMetrics: &backend.PodMetrics{},
128-
expectedErr: nil,
218+
expectedErr: fmt.Errorf("strconv.Atoi: parsing '2a': invalid syntax"),
129219
},
130220
}
131221
for _, tc := range testCases {

0 commit comments

Comments
 (0)