@@ -5,21 +5,21 @@ import (
5
5
"context"
6
6
"fmt"
7
7
"net/http"
8
+ "strconv"
8
9
"strings"
9
10
"time"
10
11
11
- "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
12
-
13
12
dto "github.com/prometheus/client_model/go"
14
13
"github.com/prometheus/common/expfmt"
15
14
"go.uber.org/multierr"
15
+ "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
16
16
klog "k8s.io/klog/v2"
17
17
)
18
18
19
19
const (
20
- ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info "
21
- LoraRequestInfoMetricName = "vllm:lora_requests_info "
22
- LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters "
20
+ LoraRequestInfoMetricName = "vllm:lora_requests_info "
21
+ LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters "
22
+ LoraRequestInfoMaxAdaptersMetricName = "max_lora "
23
23
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
24
24
RunningQueueSizeMetricName = "vllm:num_requests_running"
25
25
WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -85,8 +85,8 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
85
85
updated .KVCacheUsagePercent = cachePercent .GetGauge ().GetValue ()
86
86
}
87
87
88
- loraMetrics , _ , err := getLatestLoraMetric (metricFamilies [ LoraRequestInfoMetricName ] )
89
- multierr .Append (errs , err )
88
+ loraMetrics , _ , err := getLatestLoraMetric (metricFamilies )
89
+ errs = multierr .Append (errs , err )
90
90
/* TODO: uncomment once this is available in vllm.
91
91
kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
92
92
errs = multierr.Append(errs, err)
@@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
95
95
}
96
96
*/
97
97
98
- // TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available.
99
- updated .MaxActiveModels = 4
100
-
101
- // Update active loras
102
- mf , ok := metricFamilies [ActiveLoRAAdaptersMetricName ]
103
- if ok {
104
- // IMPORTANT: replace the map entries instead of appending to it.
105
- updated .ActiveModels = make (map [string ]int )
106
- for _ , metric := range mf .GetMetric () {
107
- for _ , label := range metric .GetLabel () {
108
- if label .GetName () == "active_adapters" {
109
- if label .GetValue () != "" {
110
- adapterList := strings .Split (label .GetValue (), "," )
111
- for _ , adapter := range adapterList {
112
- updated .ActiveModels [adapter ] = 0
113
- }
114
- }
115
- }
116
- }
117
- }
118
- } else {
119
- klog .Warningf ("metric family %q not found" , ActiveLoRAAdaptersMetricName )
120
- multierr .Append (errs , fmt .Errorf ("metric family %q not found" , ActiveLoRAAdaptersMetricName ))
121
- }
122
-
123
98
if loraMetrics != nil {
124
- updated .Metrics . ActiveModels = make (map [string ]int )
99
+ updated .ActiveModels = make (map [string ]int )
125
100
for _ , label := range loraMetrics .GetLabel () {
126
- if label .GetName () == "running_lora_adapters" {
101
+ if label .GetName () == LoraRequestInfoRunningAdaptersMetricName {
127
102
if label .GetValue () != "" {
128
103
adapterList := strings .Split (label .GetValue (), "," )
129
104
for _ , adapter := range adapterList {
130
- updated .Metrics . ActiveModels [adapter ] = 0
105
+ updated .ActiveModels [adapter ] = 0
131
106
}
132
107
}
133
108
}
134
- }
135
-
136
- }
137
-
138
- if loraMetrics != nil {
139
- updated .CachedModels = make (map [string ]int )
140
- for _ , label := range loraMetrics .GetLabel () {
141
- if label .GetName () == "running_lora_adapters" {
109
+ if label .GetName () == LoraRequestInfoMaxAdaptersMetricName {
142
110
if label .GetValue () != "" {
143
- adapterList := strings . Split (label .GetValue (), "," )
144
- for _ , adapter := range adapterList {
145
- updated . CachedModels [ adapter ] = 0
111
+ updated . MaxActiveModels , err = strconv . Atoi (label .GetValue ())
112
+ if err != nil {
113
+ errs = multierr . Append ( errs , err )
146
114
}
147
115
}
148
116
}
@@ -153,7 +121,16 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
153
121
return updated , errs
154
122
}
155
123
156
- func getLatestLoraMetric (loraRequests * dto.MetricFamily ) (* dto.Metric , time.Time , error ) {
124
+ // getLatestLoraMetric gets the latest lora metric series in the gauge metric family `vllm:lora_requests_info`.
125
+ // It is fetched specially because each permutation of label key/value pairs generates a new series
126
+ // and only the most recent one is useful. The value of each series is its creation timestamp, so we can
127
+ // retrieve the latest by sorting on the value.
128
+ func getLatestLoraMetric (metricFamilies map [string ]* dto.MetricFamily ) (* dto.Metric , time.Time , error ) {
129
+ loraRequests , ok := metricFamilies [LoraRequestInfoMetricName ]
130
+ if ! ok {
131
+ klog .Warningf ("metric family %q not found" , LoraRequestInfoMetricName )
132
+ return nil , time.Time {}, fmt .Errorf ("metric family %q not found" , LoraRequestInfoMetricName )
133
+ }
157
134
var latestTs float64
158
135
var latest * dto.Metric
159
136
for _ , m := range loraRequests .GetMetric () {
@@ -166,6 +143,7 @@ func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time
166
143
}
167
144
168
145
// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
146
+ // Since vllm doesn't set the timestamp in its metrics, this function essentially returns the first metric.
169
147
func getLatestMetric (metricFamilies map [string ]* dto.MetricFamily , metricName string ) (* dto.Metric , time.Time , error ) {
170
148
mf , ok := metricFamilies [metricName ]
171
149
if ! ok {
0 commit comments