Get running adapters from the latest series in the new metric, add a table-driven test function, and delete the old metrics

coolkp · coolkp · commit 3f5c7bb4de35 · 2024-12-11T12:04:18.000-08:00
Signed-off-by: Kunjan Patel <kunjanp@google.com>
Signed-off-by: Kunjan <kunjanp@google.com>
diff --git a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -30,7 +30,7 @@ spec:
     spec:
       containers:
         - name: lora
-          image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:ac04a97a9fbc122bb14ff4eb590314d453cdf57c"
+          image: "vllm/vllm-openai:latest"
           imagePullPolicy: Always
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args:
diff --git a/go.mod b/go.mod
@@ -13,6 +13,7 @@ require (
 	github.com/onsi/gomega v1.36.0
 	github.com/prometheus/client_model v0.6.1
 	github.com/prometheus/common v0.61.0
+	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
 	google.golang.org/grpc v1.68.0
 	google.golang.org/protobuf v1.35.2
@@ -70,6 +71,7 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/client_golang v1.20.4 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
 	github.com/shopspring/decimal v1.2.0 // indirect
diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go
@@ -5,21 +5,21 @@ import (
 	"context"
 	"fmt"
 	"net/http"
+	"strconv"
 	"strings"
 	"time"
 
-	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
-
 	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/expfmt"
 	"go.uber.org/multierr"
+	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
 	klog "k8s.io/klog/v2"
 )
 
 const (
-	ActiveLoRAAdaptersMetricName        = "vllm:info_active_adapters_info"
-	LoraRequestInfoMetricName           = "vllm:lora_requests_info"
-	LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
+	LoraRequestInfoMetricName                = "vllm:lora_requests_info"
+	LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
+	LoraRequestInfoMaxAdaptersMetricName     = "max_lora"
 	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
 	RunningQueueSizeMetricName = "vllm:num_requests_running"
 	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -85,8 +85,8 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 		updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
 	}
 
-	loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName])
-	multierr.Append(errs, err)
+	loraMetrics, _, err := getLatestLoraMetric(metricFamilies)
+	errs = multierr.Append(errs, err)
 	/* TODO: uncomment once this is available in vllm.
 	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
 	errs = multierr.Append(errs, err)
@@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	}
 	*/
 
-	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available.
-	updated.MaxActiveModels = 4
-
-	// Update active loras
-	mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
-	if ok {
-		// IMPORTANT: replace the map entries instead of appending to it.
-		updated.ActiveModels = make(map[string]int)
-		for _, metric := range mf.GetMetric() {
-			for _, label := range metric.GetLabel() {
-				if label.GetName() == "active_adapters" {
-					if label.GetValue() != "" {
-						adapterList := strings.Split(label.GetValue(), ",")
-						for _, adapter := range adapterList {
-							updated.ActiveModels[adapter] = 0
-						}
-					}
-				}
-			}
-		}
-	} else {
-		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
-		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
-	}
-
 	if loraMetrics != nil {
-		updated.Metrics.ActiveModels = make(map[string]int)
+		updated.ActiveModels = make(map[string]int)
 		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == "running_lora_adapters" {
+			if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
 				if label.GetValue() != "" {
 					adapterList := strings.Split(label.GetValue(), ",")
 					for _, adapter := range adapterList {
-						updated.Metrics.ActiveModels[adapter] = 0
+						updated.ActiveModels[adapter] = 0
 					}
 				}
 			}
-		}
-
-	}
-
-	if loraMetrics != nil {
-		updated.CachedModels = make(map[string]int)
-		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == "running_lora_adapters" {
+			if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
 				if label.GetValue() != "" {
-					adapterList := strings.Split(label.GetValue(), ",")
-					for _, adapter := range adapterList {
-						updated.CachedModels[adapter] = 0
+					updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
+					if err != nil {
+						errs = multierr.Append(errs, err)
 					}
 				}
 			}
@@ -153,7 +121,16 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	return updated, errs
 }
 
-func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
+// getLatestLoraMetric gets the latest LoRA metric series in the gauge metric family `vllm:lora_requests_info`.
+// It is fetched specially because each permutation of label key-value pairs generates a new series,
+// and only the most recent one is useful. The value of each series is its creation timestamp, so we can
+// retrieve the latest series by ordering on the value.
+func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
+	if !ok {
+		klog.Warningf("metric family %q not found", LoraRequestInfoMetricName)
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
+	}
 	var latestTs float64
 	var latest *dto.Metric
 	for _, m := range loraRequests.GetMetric() {
@@ -166,6 +143,7 @@ func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time
 }
 
 // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
+// Since vLLM doesn't set the timestamp on its metrics, this function essentially returns the first metric.
 func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
 	mf, ok := metricFamilies[metricName]
 	if !ok {
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -1,6 +1,7 @@
 package vllm
 
 import (
+	"fmt"
 	"testing"
 
 	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
@@ -69,32 +70,116 @@ func TestPromToPodMetrics(t *testing.T) {
 						},
 					},
 				},
-				ActiveLoRAAdaptersMetricName: {
+				LoraRequestInfoMetricName: {
 					Metric: []*dto.Metric{
 						{
 							Label: []*dto.LabelPair{
 								{
-									Name:  proto.String("active_adapters"),
-									Value: proto.String("lora1,lora2"),
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("lora3,lora4"),
+								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
 								},
 							},
 							Gauge: &dto.Gauge{
 								Value: proto.Float64(100),
 							},
 						},
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("lora2"),
+								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(90),
+							},
+						},
+					},
+				},
+			},
+			expectedMetrics: &backend.Metrics{
+				RunningQueueSize:    15,
+				WaitingQueueSize:    25,
+				KVCacheUsagePercent: 0.9,
+				ActiveModels: map[string]int{
+					"lora3": 0,
+					"lora4": 0,
+				},
+				MaxActiveModels: 2,
+			},
+			initialPodMetrics: &backend.PodMetrics{},
+			expectedErr:       nil,
+		},
+		{
+			name: "invalid max lora",
+			metricFamilies: map[string]*dto.MetricFamily{
+				RunningQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(10),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(15),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				WaitingQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(20),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(25),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				KVCacheUsagePercentMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.8),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.9),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
 					},
 				},
 				LoraRequestInfoMetricName: {
 					Metric: []*dto.Metric{
 						{
 							Label: []*dto.LabelPair{
 								{
-									Name:  proto.String("running_lora_adapters"),
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
 									Value: proto.String("lora3,lora4"),
 								},
 								{
-									Name:  proto.String("waiting_lora_adapters"),
-									Value: proto.String("lora1,lora4"),
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("2a"),
 								},
 							},
 							Gauge: &dto.Gauge{
@@ -104,9 +189,13 @@ func TestPromToPodMetrics(t *testing.T) {
 						{
 							Label: []*dto.LabelPair{
 								{
-									Name:  proto.String("running_lora_adapters"),
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
 									Value: proto.String("lora2"),
 								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
+								},
 							},
 							Gauge: &dto.Gauge{
 								Value: proto.Float64(90),
@@ -119,13 +208,14 @@ func TestPromToPodMetrics(t *testing.T) {
 				RunningQueueSize:    15,
 				WaitingQueueSize:    25,
 				KVCacheUsagePercent: 0.9,
-				CachedModels: map[string]int{
+				ActiveModels: map[string]int{
 					"lora3": 0,
 					"lora4": 0,
 				},
+				MaxActiveModels: 0,
 			},
 			initialPodMetrics: &backend.PodMetrics{},
-			expectedErr:       nil,
+			expectedErr:       fmt.Errorf("strconv.Atoi: parsing '2a': invalid syntax"),
 		},
 	}
 	for _, tc := range testCases {

-Original file line number
+Diff line change
 	github.com/onsi/gomega v1.36.0
 	github.com/prometheus/client_model v0.6.1
 	github.com/prometheus/common v0.61.0
 +	github.com/stretchr/testify v1.10.0
 	go.uber.org/multierr v1.11.0
 	google.golang.org/grpc v1.68.0
 	google.golang.org/protobuf v1.35.2
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
 +	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/client_golang v1.20.4 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
 	github.com/shopspring/decimal v1.2.0 // indirect