Skip to content

Commit 07c9b9c

Browse files
committed
Get running adapters from latest series in new metric
Signed-off-by: Kunjan Patel <[email protected]>
1 parent b52a0fd commit 07c9b9c

File tree

3 files changed

+192
-4
lines changed

3 files changed

+192
-4
lines changed

Diff for: examples/poc/manifests/vllm/vllm-lora-deployment.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ spec:
3030
spec:
3131
containers:
3232
- name: lora
33-
image: "ghcr.io/tomatillo-and-multiverse/vllm:demo"
33+
image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:ac04a97a9fbc122bb14ff4eb590314d453cdf57c"
3434
imagePullPolicy: Always
3535
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
3636
args:
@@ -40,7 +40,6 @@ spec:
4040
- "1"
4141
- "--port"
4242
- "8000"
43-
- "--disable-log-requests"
4443
- "--enable-lora"
4544
- "--max-loras"
4645
- "4"

Diff for: pkg/ext-proc/backend/vllm/metrics.go

+49-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ import (
1717
)
1818

1919
const (
20-
ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
20+
ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
21+
LoraRequestInfoMetricName = "vllm:lora_requests_info"
22+
LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
2123
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
2224
RunningQueueSizeMetricName = "vllm:num_requests_running"
2325
WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -82,6 +84,9 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
8284
if err == nil {
8385
updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
8486
}
87+
88+
loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName])
89+
multierr.Append(errs, err)
8590
/* TODO: uncomment once this is available in vllm.
8691
kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
8792
errs = multierr.Append(errs, err)
@@ -112,12 +117,54 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
112117
}
113118
} else {
114119
klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
115-
errs = multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
120+
multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
121+
}
122+
123+
if loraMetrics != nil {
124+
updated.Metrics.ActiveModels = make(map[string]int)
125+
for _, label := range loraMetrics.GetLabel() {
126+
if label.GetName() == "running_lora_adapters" {
127+
if label.GetValue() != "" {
128+
adapterList := strings.Split(label.GetValue(), ",")
129+
for _, adapter := range adapterList {
130+
updated.Metrics.ActiveModels[adapter] = 0
131+
}
132+
}
133+
}
134+
}
135+
136+
}
137+
138+
if loraMetrics != nil {
139+
updated.CachedModels = make(map[string]int)
140+
for _, label := range loraMetrics.GetLabel() {
141+
if label.GetName() == "running_lora_adapters" {
142+
if label.GetValue() != "" {
143+
adapterList := strings.Split(label.GetValue(), ",")
144+
for _, adapter := range adapterList {
145+
updated.CachedModels[adapter] = 0
146+
}
147+
}
148+
}
149+
}
150+
116151
}
117152

118153
return updated, errs
119154
}
120155

156+
func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
157+
var latestTs float64
158+
var latest *dto.Metric
159+
for _, m := range loraRequests.GetMetric() {
160+
if m.GetGauge().GetValue() > latestTs {
161+
latestTs = m.GetGauge().GetValue()
162+
latest = m
163+
}
164+
}
165+
return latest, time.Unix(0, int64(latestTs*1000)), nil
166+
}
167+
121168
// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
122169
func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
123170
mf, ok := metricFamilies[metricName]

Diff for: pkg/ext-proc/backend/vllm/metrics_test.go

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
package vllm
2+
3+
import (
4+
"testing"
5+
6+
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
7+
8+
dto "github.com/prometheus/client_model/go"
9+
"github.com/stretchr/testify/assert"
10+
"google.golang.org/protobuf/proto"
11+
)
12+
13+
func TestPromToPodMetrics(t *testing.T) {
14+
testCases := []struct {
15+
name string
16+
metricFamilies map[string]*dto.MetricFamily
17+
expectedMetrics *backend.Metrics
18+
expectedErr error
19+
initialPodMetrics *backend.PodMetrics
20+
}{
21+
{
22+
name: "all metrics available",
23+
metricFamilies: map[string]*dto.MetricFamily{
24+
RunningQueueSizeMetricName: {
25+
Metric: []*dto.Metric{
26+
{
27+
Gauge: &dto.Gauge{
28+
Value: proto.Float64(10),
29+
},
30+
TimestampMs: proto.Int64(100),
31+
},
32+
{
33+
Gauge: &dto.Gauge{
34+
Value: proto.Float64(15),
35+
},
36+
TimestampMs: proto.Int64(200), // This is the latest
37+
},
38+
},
39+
},
40+
WaitingQueueSizeMetricName: {
41+
Metric: []*dto.Metric{
42+
{
43+
Gauge: &dto.Gauge{
44+
Value: proto.Float64(20),
45+
},
46+
TimestampMs: proto.Int64(100),
47+
},
48+
{
49+
Gauge: &dto.Gauge{
50+
Value: proto.Float64(25),
51+
},
52+
TimestampMs: proto.Int64(200), // This is the latest
53+
},
54+
},
55+
},
56+
KVCacheUsagePercentMetricName: {
57+
Metric: []*dto.Metric{
58+
{
59+
Gauge: &dto.Gauge{
60+
Value: proto.Float64(0.8),
61+
},
62+
TimestampMs: proto.Int64(100),
63+
},
64+
{
65+
Gauge: &dto.Gauge{
66+
Value: proto.Float64(0.9),
67+
},
68+
TimestampMs: proto.Int64(200), // This is the latest
69+
},
70+
},
71+
},
72+
ActiveLoRAAdaptersMetricName: {
73+
Metric: []*dto.Metric{
74+
{
75+
Label: []*dto.LabelPair{
76+
{
77+
Name: proto.String("active_adapters"),
78+
Value: proto.String("lora1,lora2"),
79+
},
80+
},
81+
Gauge: &dto.Gauge{
82+
Value: proto.Float64(100),
83+
},
84+
},
85+
},
86+
},
87+
LoraRequestInfoMetricName: {
88+
Metric: []*dto.Metric{
89+
{
90+
Label: []*dto.LabelPair{
91+
{
92+
Name: proto.String("running_lora_adapters"),
93+
Value: proto.String("lora3,lora4"),
94+
},
95+
{
96+
Name: proto.String("waiting_lora_adapters"),
97+
Value: proto.String("lora1,lora4"),
98+
},
99+
},
100+
Gauge: &dto.Gauge{
101+
Value: proto.Float64(100),
102+
},
103+
},
104+
{
105+
Label: []*dto.LabelPair{
106+
{
107+
Name: proto.String("running_lora_adapters"),
108+
Value: proto.String("lora2"),
109+
},
110+
},
111+
Gauge: &dto.Gauge{
112+
Value: proto.Float64(90),
113+
},
114+
},
115+
},
116+
},
117+
},
118+
expectedMetrics: &backend.Metrics{
119+
RunningQueueSize: 15,
120+
WaitingQueueSize: 25,
121+
KVCacheUsagePercent: 0.9,
122+
CachedModels: map[string]int{
123+
"lora3": 0,
124+
"lora4": 0,
125+
},
126+
},
127+
initialPodMetrics: &backend.PodMetrics{},
128+
expectedErr: nil,
129+
},
130+
}
131+
for _, tc := range testCases {
132+
t.Run(tc.name, func(t *testing.T) {
133+
updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics)
134+
if tc.expectedErr != nil {
135+
assert.Error(t, err)
136+
} else {
137+
assert.NoError(t, err)
138+
assert.Equal(t, tc.expectedMetrics, &updated.Metrics)
139+
}
140+
})
141+
}
142+
}

0 commit comments

Comments
 (0)