Skip to content

Commit 918960c

Browse files
authored
Merge pull request #54 from coolkp/main
switch to using upstream vllm with new metric
2 parents e23aab7 + 3f5c7bb commit 918960c

File tree

4 files changed

+281
-23
lines changed

4 files changed

+281
-23
lines changed

examples/poc/manifests/vllm/vllm-lora-deployment.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ spec:
3030
spec:
3131
containers:
3232
- name: lora
33-
image: "ghcr.io/tomatillo-and-multiverse/vllm:demo"
33+
image: "vllm/vllm-openai:latest"
3434
imagePullPolicy: Always
3535
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
3636
args:
@@ -40,7 +40,6 @@ spec:
4040
- "1"
4141
- "--port"
4242
- "8000"
43-
- "--disable-log-requests"
4443
- "--enable-lora"
4544
- "--max-loras"
4645
- "4"

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ require (
1313
github.com/onsi/gomega v1.36.0
1414
github.com/prometheus/client_model v0.6.1
1515
github.com/prometheus/common v0.61.0
16+
github.com/stretchr/testify v1.10.0
1617
go.uber.org/multierr v1.11.0
1718
google.golang.org/grpc v1.68.0
1819
google.golang.org/protobuf v1.35.2
@@ -70,6 +71,7 @@ require (
7071
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
7172
github.com/pkg/errors v0.9.1 // indirect
7273
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
74+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
7375
github.com/prometheus/client_golang v1.20.4 // indirect
7476
github.com/prometheus/procfs v0.15.1 // indirect
7577
github.com/shopspring/decimal v1.2.0 // indirect

pkg/ext-proc/backend/vllm/metrics.go

+46-21
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,21 @@ import (
55
"context"
66
"fmt"
77
"net/http"
8+
"strconv"
89
"strings"
910
"time"
1011

11-
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
12-
1312
dto "github.com/prometheus/client_model/go"
1413
"github.com/prometheus/common/expfmt"
1514
"go.uber.org/multierr"
15+
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
1616
klog "k8s.io/klog/v2"
1717
)
1818

1919
const (
20-
ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
20+
LoraRequestInfoMetricName = "vllm:lora_requests_info"
21+
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
22+
LoraRequestInfoMaxAdaptersMetricName = "max_lora"
2123
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
2224
RunningQueueSizeMetricName = "vllm:num_requests_running"
2325
WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -82,6 +84,9 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
8284
if err == nil {
8385
updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
8486
}
87+
88+
loraMetrics, _, err := getLatestLoraMetric(metricFamilies)
89+
errs = multierr.Append(errs, err)
8590
/* TODO: uncomment once this is available in vllm.
8691
kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
8792
errs = multierr.Append(errs, err)
@@ -90,35 +95,55 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
9095
}
9196
*/
9297

93-
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available.
94-
updated.MaxActiveModels = 4
95-
96-
// Update active loras
97-
mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
98-
if ok {
99-
// IMPORTANT: replace the map entries instead of appending to it.
98+
if loraMetrics != nil {
10099
updated.ActiveModels = make(map[string]int)
101-
for _, metric := range mf.GetMetric() {
102-
for _, label := range metric.GetLabel() {
103-
if label.GetName() == "active_adapters" {
104-
if label.GetValue() != "" {
105-
adapterList := strings.Split(label.GetValue(), ",")
106-
for _, adapter := range adapterList {
107-
updated.ActiveModels[adapter] = 0
108-
}
100+
for _, label := range loraMetrics.GetLabel() {
101+
if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
102+
if label.GetValue() != "" {
103+
adapterList := strings.Split(label.GetValue(), ",")
104+
for _, adapter := range adapterList {
105+
updated.ActiveModels[adapter] = 0
106+
}
107+
}
108+
}
109+
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
110+
if label.GetValue() != "" {
111+
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
112+
if err != nil {
113+
errs = multierr.Append(errs, err)
109114
}
110115
}
111116
}
112117
}
113-
} else {
114-
klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
115-
errs = multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
118+
116119
}
117120

118121
return updated, errs
119122
}
120123

124+
// getLatestLoraMetric gets the latest lora metric series in the gauge metric
// family `vllm:lora_requests_info`.
//
// The reason it is specially fetched is that each label key/value pair
// permutation generates a new series, and only the most recent one is useful.
// The value of each series is its creation timestamp, so we can retrieve the
// latest series by picking the one with the largest value.
//
// Returns the latest metric series, the time derived from its gauge value, and
// an error if the metric family is absent (in which case the metric is nil and
// the time is the zero value).
func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
	if !ok {
		klog.Warningf("metric family %q not found", LoraRequestInfoMetricName)
		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
	}
	var latestTs float64
	var latest *dto.Metric
	// The gauge value encodes the series' creation timestamp; keep the series
	// with the largest value. If all gauge values are <= 0, latest stays nil.
	for _, m := range loraRequests.GetMetric() {
		if m.GetGauge().GetValue() > latestTs {
			latestTs = m.GetGauge().GetValue()
			latest = m
		}
	}
	// NOTE(review): latestTs*1000 is passed to time.Unix as *nanoseconds*; if
	// the gauge value is a Unix timestamp in seconds this yields microseconds
	// since the epoch — confirm the intended unit. The only caller visible in
	// this change discards the returned time, so nothing depends on it yet.
	return latest, time.Unix(0, int64(latestTs*1000)), nil
}
144+
121145
// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
146+
// Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric.
122147
func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
123148
mf, ok := metricFamilies[metricName]
124149
if !ok {
+232
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
package vllm
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
7+
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
8+
9+
dto "github.com/prometheus/client_model/go"
10+
"github.com/stretchr/testify/assert"
11+
"google.golang.org/protobuf/proto"
12+
)
13+
14+
func TestPromToPodMetrics(t *testing.T) {
15+
testCases := []struct {
16+
name string
17+
metricFamilies map[string]*dto.MetricFamily
18+
expectedMetrics *backend.Metrics
19+
expectedErr error
20+
initialPodMetrics *backend.PodMetrics
21+
}{
22+
{
23+
name: "all metrics available",
24+
metricFamilies: map[string]*dto.MetricFamily{
25+
RunningQueueSizeMetricName: {
26+
Metric: []*dto.Metric{
27+
{
28+
Gauge: &dto.Gauge{
29+
Value: proto.Float64(10),
30+
},
31+
TimestampMs: proto.Int64(100),
32+
},
33+
{
34+
Gauge: &dto.Gauge{
35+
Value: proto.Float64(15),
36+
},
37+
TimestampMs: proto.Int64(200), // This is the latest
38+
},
39+
},
40+
},
41+
WaitingQueueSizeMetricName: {
42+
Metric: []*dto.Metric{
43+
{
44+
Gauge: &dto.Gauge{
45+
Value: proto.Float64(20),
46+
},
47+
TimestampMs: proto.Int64(100),
48+
},
49+
{
50+
Gauge: &dto.Gauge{
51+
Value: proto.Float64(25),
52+
},
53+
TimestampMs: proto.Int64(200), // This is the latest
54+
},
55+
},
56+
},
57+
KVCacheUsagePercentMetricName: {
58+
Metric: []*dto.Metric{
59+
{
60+
Gauge: &dto.Gauge{
61+
Value: proto.Float64(0.8),
62+
},
63+
TimestampMs: proto.Int64(100),
64+
},
65+
{
66+
Gauge: &dto.Gauge{
67+
Value: proto.Float64(0.9),
68+
},
69+
TimestampMs: proto.Int64(200), // This is the latest
70+
},
71+
},
72+
},
73+
LoraRequestInfoMetricName: {
74+
Metric: []*dto.Metric{
75+
{
76+
Label: []*dto.LabelPair{
77+
{
78+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
79+
Value: proto.String("lora3,lora4"),
80+
},
81+
{
82+
Name: proto.String(LoraRequestInfoMaxAdaptersMetricName),
83+
Value: proto.String("2"),
84+
},
85+
},
86+
Gauge: &dto.Gauge{
87+
Value: proto.Float64(100),
88+
},
89+
},
90+
{
91+
Label: []*dto.LabelPair{
92+
{
93+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
94+
Value: proto.String("lora2"),
95+
},
96+
{
97+
Name: proto.String(LoraRequestInfoMaxAdaptersMetricName),
98+
Value: proto.String("2"),
99+
},
100+
},
101+
Gauge: &dto.Gauge{
102+
Value: proto.Float64(90),
103+
},
104+
},
105+
},
106+
},
107+
},
108+
expectedMetrics: &backend.Metrics{
109+
RunningQueueSize: 15,
110+
WaitingQueueSize: 25,
111+
KVCacheUsagePercent: 0.9,
112+
ActiveModels: map[string]int{
113+
"lora3": 0,
114+
"lora4": 0,
115+
},
116+
MaxActiveModels: 2,
117+
},
118+
initialPodMetrics: &backend.PodMetrics{},
119+
expectedErr: nil,
120+
},
121+
{
122+
name: "invalid max lora",
123+
metricFamilies: map[string]*dto.MetricFamily{
124+
RunningQueueSizeMetricName: {
125+
Metric: []*dto.Metric{
126+
{
127+
Gauge: &dto.Gauge{
128+
Value: proto.Float64(10),
129+
},
130+
TimestampMs: proto.Int64(100),
131+
},
132+
{
133+
Gauge: &dto.Gauge{
134+
Value: proto.Float64(15),
135+
},
136+
TimestampMs: proto.Int64(200), // This is the latest
137+
},
138+
},
139+
},
140+
WaitingQueueSizeMetricName: {
141+
Metric: []*dto.Metric{
142+
{
143+
Gauge: &dto.Gauge{
144+
Value: proto.Float64(20),
145+
},
146+
TimestampMs: proto.Int64(100),
147+
},
148+
{
149+
Gauge: &dto.Gauge{
150+
Value: proto.Float64(25),
151+
},
152+
TimestampMs: proto.Int64(200), // This is the latest
153+
},
154+
},
155+
},
156+
KVCacheUsagePercentMetricName: {
157+
Metric: []*dto.Metric{
158+
{
159+
Gauge: &dto.Gauge{
160+
Value: proto.Float64(0.8),
161+
},
162+
TimestampMs: proto.Int64(100),
163+
},
164+
{
165+
Gauge: &dto.Gauge{
166+
Value: proto.Float64(0.9),
167+
},
168+
TimestampMs: proto.Int64(200), // This is the latest
169+
},
170+
},
171+
},
172+
LoraRequestInfoMetricName: {
173+
Metric: []*dto.Metric{
174+
{
175+
Label: []*dto.LabelPair{
176+
{
177+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
178+
Value: proto.String("lora3,lora4"),
179+
},
180+
{
181+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
182+
Value: proto.String("2a"),
183+
},
184+
},
185+
Gauge: &dto.Gauge{
186+
Value: proto.Float64(100),
187+
},
188+
},
189+
{
190+
Label: []*dto.LabelPair{
191+
{
192+
Name: proto.String(LoraRequestInfoRunningAdaptersMetricName),
193+
Value: proto.String("lora2"),
194+
},
195+
{
196+
Name: proto.String(LoraRequestInfoMaxAdaptersMetricName),
197+
Value: proto.String("2"),
198+
},
199+
},
200+
Gauge: &dto.Gauge{
201+
Value: proto.Float64(90),
202+
},
203+
},
204+
},
205+
},
206+
},
207+
expectedMetrics: &backend.Metrics{
208+
RunningQueueSize: 15,
209+
WaitingQueueSize: 25,
210+
KVCacheUsagePercent: 0.9,
211+
ActiveModels: map[string]int{
212+
"lora3": 0,
213+
"lora4": 0,
214+
},
215+
MaxActiveModels: 0,
216+
},
217+
initialPodMetrics: &backend.PodMetrics{},
218+
expectedErr: fmt.Errorf("strconv.Atoi: parsing '2a': invalid syntax"),
219+
},
220+
}
221+
for _, tc := range testCases {
222+
t.Run(tc.name, func(t *testing.T) {
223+
updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics)
224+
if tc.expectedErr != nil {
225+
assert.Error(t, err)
226+
} else {
227+
assert.NoError(t, err)
228+
assert.Equal(t, tc.expectedMetrics, &updated.Metrics)
229+
}
230+
})
231+
}
232+
}

0 commit comments

Comments
 (0)