From 07c9b9c37eb8896094ebe9782d4254cc3266dd20 Mon Sep 17 00:00:00 2001
From: Kunjan Patel
Date: Sun, 10 Nov 2024 13:03:07 -0800
Subject: [PATCH 1/2] Get running adapters from latest series in new metric

Signed-off-by: Kunjan Patel
---
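Note: the scraping code in this series keys off the new
`vllm:lora_requests_info` gauge family. As an illustration only (the HELP
line and the full label set below are assumptions, not taken from this patch
or verified against vLLM), a /metrics scrape exposes one such series per
label-value combination, and the gauge value is the creation timestamp of
the series:

    # HELP vllm:lora_requests_info Running stats on lora requests.
    # TYPE vllm:lora_requests_info gauge
    vllm:lora_requests_info{running_lora_adapters="lora1,lora2",waiting_lora_adapters=""} 1.731e+09

This patch reads only the `running_lora_adapters` label; patch 2/2 also
reads `max_lora`.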
 .../manifests/vllm/vllm-lora-deployment.yaml |   3 +-
 pkg/ext-proc/backend/vllm/metrics.go         |  51 ++++++-
 pkg/ext-proc/backend/vllm/metrics_test.go    | 142 ++++++++++++++++++
 3 files changed, 192 insertions(+), 4 deletions(-)
 create mode 100644 pkg/ext-proc/backend/vllm/metrics_test.go

diff --git a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
index 4f990c9e..1996c153 100644
--- a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
+++ b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -30,7 +30,7 @@ spec:
     spec:
       containers:
       - name: lora
-        image: "ghcr.io/tomatillo-and-multiverse/vllm:demo"
+        image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:ac04a97a9fbc122bb14ff4eb590314d453cdf57c"
         imagePullPolicy: Always
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
@@ -40,7 +40,6 @@ spec:
         - "1"
         - "--port"
         - "8000"
-        - "--disable-log-requests"
        - "--enable-lora"
        - "--max-loras"
        - "4"
diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go
index f4540adf..0246bca7 100644
--- a/pkg/ext-proc/backend/vllm/metrics.go
+++ b/pkg/ext-proc/backend/vllm/metrics.go
@@ -17,7 +17,9 @@ import (
 )
 
 const (
-	ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
+	ActiveLoRAAdaptersMetricName        = "vllm:info_active_adapters_info"
+	LoraRequestInfoMetricName           = "vllm:lora_requests_info"
+	LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
 	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
 	RunningQueueSizeMetricName = "vllm:num_requests_running"
 	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -82,6 +84,9 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	if err == nil {
 		updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
 	}
+
+	loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName])
+	multierr.Append(errs, err)
 	/* TODO: uncomment once this is available in vllm.
 	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
 	errs = multierr.Append(errs, err)
@@ -112,12 +117,54 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 		}
 	} else {
 		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
-		errs = multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+	}
+
+	if loraMetrics != nil {
+		updated.Metrics.ActiveModels = make(map[string]int)
+		for _, label := range loraMetrics.GetLabel() {
+			if label.GetName() == "running_lora_adapters" {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.Metrics.ActiveModels[adapter] = 0
+					}
+				}
+			}
+		}
+
+	}
+
+	if loraMetrics != nil {
+		updated.CachedModels = make(map[string]int)
+		for _, label := range loraMetrics.GetLabel() {
+			if label.GetName() == "running_lora_adapters" {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.CachedModels[adapter] = 0
+					}
+				}
+			}
+		}
 	}
 
 	return updated, errs
 }
 
+func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	var latestTs float64
+	var latest *dto.Metric
+	for _, m := range loraRequests.GetMetric() {
+		if m.GetGauge().GetValue() > latestTs {
+			latestTs = m.GetGauge().GetValue()
+			latest = m
+		}
+	}
+	return latest, time.Unix(0, int64(latestTs*1000)), nil
+}
+
 // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
 func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
 	mf, ok := metricFamilies[metricName]
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
new file mode 100644
index 00000000..1e0da622
--- /dev/null
+++ b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -0,0 +1,142 @@
+package vllm
+
+import (
+	"testing"
+
+	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
+
+	dto "github.com/prometheus/client_model/go"
+	"github.com/stretchr/testify/assert"
+	"google.golang.org/protobuf/proto"
+)
+
+func TestPromToPodMetrics(t *testing.T) {
+	testCases := []struct {
+		name              string
+		metricFamilies    map[string]*dto.MetricFamily
+		expectedMetrics   *backend.Metrics
+		expectedErr       error
+		initialPodMetrics *backend.PodMetrics
+	}{
+		{
+			name: "all metrics available",
+			metricFamilies: map[string]*dto.MetricFamily{
+				RunningQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(10),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(15),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				WaitingQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(20),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(25),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				KVCacheUsagePercentMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.8),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.9),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				ActiveLoRAAdaptersMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("active_adapters"),
+									Value: proto.String("lora1,lora2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(100),
+							},
+						},
+					},
+				},
+				LoraRequestInfoMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("running_lora_adapters"),
+									Value: proto.String("lora3,lora4"),
+								},
+								{
+									Name:  proto.String("waiting_lora_adapters"),
+									Value: proto.String("lora1,lora4"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(100),
+							},
+						},
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("running_lora_adapters"),
+									Value: proto.String("lora2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(90),
+							},
+						},
+					},
+				},
+			},
+			expectedMetrics: &backend.Metrics{
+				RunningQueueSize:    15,
+				WaitingQueueSize:    25,
+				KVCacheUsagePercent: 0.9,
+				CachedModels: map[string]int{
+					"lora3": 0,
+					"lora4": 0,
+				},
+			},
+			initialPodMetrics: &backend.PodMetrics{},
+			expectedErr:       nil,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics)
+			if tc.expectedErr != nil {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.Equal(t, tc.expectedMetrics, &updated.Metrics)
+			}
+		})
+	}
+}
From 3f5c7bb4de35c3d7cc02d9eacba8e3ac81944741 Mon Sep 17 00:00:00 2001
From: Kunjan Patel
Date: Sun, 10 Nov 2024 14:01:52 -0800
Subject: [PATCH 2/2] Get running adapters from latest series in new metric,
 add table-driven test function, delete old metrics

Signed-off-by: Kunjan Patel
Signed-off-by: Kunjan
---
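Note: a minimal, illustrative sketch of how the parsing path changed below
is exercised end to end. The address, the standalone package main, and the
panic-based error handling are assumptions for illustration; promToPodMetrics
is the unexported function modified in this patch, so outside the vllm
package it appears only in a comment:

	package main

	import (
		"fmt"
		"net/http"

		"github.com/prometheus/common/expfmt"
	)

	func main() {
		// Scrape the pod's Prometheus endpoint (address assumed).
		resp, err := http.Get("http://localhost:8000/metrics")
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		// Parse the exposition text into the same map[string]*dto.MetricFamily
		// that promToPodMetrics consumes.
		var parser expfmt.TextParser
		metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
		if err != nil {
			panic(err)
		}
		fmt.Printf("scraped %d metric families\n", len(metricFamilies))

		// From here the code in this patch takes over, e.g.:
		//   updated, errs := promToPodMetrics(metricFamilies, existing)
		// which reads running_lora_adapters and max_lora from the latest
		// vllm:lora_requests_info series.
	}

An info-style gauge like vllm:lora_requests_info emits a fresh series for
every distinct label-value combination, and stale series linger until the
exporter drops them; that is why getLatestLoraMetric below selects the single
series with the largest creation-timestamp value instead of merging them all.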
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" - dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" + "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" klog "k8s.io/klog/v2" ) const ( - ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info" - LoraRequestInfoMetricName = "vllm:lora_requests_info" - LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters" + LoraRequestInfoMetricName = "vllm:lora_requests_info" + LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoMaxAdaptersMetricName = "max_lora" // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. RunningQueueSizeMetricName = "vllm:num_requests_running" WaitingQueueSizeMetricName = "vllm:num_requests_waiting" @@ -85,8 +85,8 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() } - loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName]) - multierr.Append(errs, err) + loraMetrics, _, err := getLatestLoraMetric(metricFamilies) + errs = multierr.Append(errs, err) /* TODO: uncomment once this is available in vllm. kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName) errs = multierr.Append(errs, err) @@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac } */ - // TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available. - updated.MaxActiveModels = 4 - - // Update active loras - mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName] - if ok { - // IMPORTANT: replace the map entries instead of appending to it. 
@@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	}
 	*/
 
-	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available.
-	updated.MaxActiveModels = 4
-
-	// Update active loras
-	mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
-	if ok {
-		// IMPORTANT: replace the map entries instead of appending to it.
-		updated.ActiveModels = make(map[string]int)
-		for _, metric := range mf.GetMetric() {
-			for _, label := range metric.GetLabel() {
-				if label.GetName() == "active_adapters" {
-					if label.GetValue() != "" {
-						adapterList := strings.Split(label.GetValue(), ",")
-						for _, adapter := range adapterList {
-							updated.ActiveModels[adapter] = 0
-						}
-					}
-				}
-			}
-		}
-	} else {
-		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
-		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
-	}
-
 	if loraMetrics != nil {
-		updated.Metrics.ActiveModels = make(map[string]int)
+		updated.ActiveModels = make(map[string]int)
 		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == "running_lora_adapters" {
+			if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
 				if label.GetValue() != "" {
 					adapterList := strings.Split(label.GetValue(), ",")
 					for _, adapter := range adapterList {
-						updated.Metrics.ActiveModels[adapter] = 0
+						updated.ActiveModels[adapter] = 0
 					}
 				}
 			}
-		}
-
-	}
-
-	if loraMetrics != nil {
-		updated.CachedModels = make(map[string]int)
-		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == "running_lora_adapters" {
+			if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
 				if label.GetValue() != "" {
-					adapterList := strings.Split(label.GetValue(), ",")
-					for _, adapter := range adapterList {
-						updated.CachedModels[adapter] = 0
+					updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
+					if err != nil {
+						errs = multierr.Append(errs, err)
 					}
 				}
 			}
@@ -153,7 +121,16 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	return updated, errs
 }
 
-func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
+// getLatestLoraMetric gets the latest series of the gauge metric family `vllm:lora_requests_info`.
+// It needs special handling because each distinct combination of label values creates a new
+// series, and only the most recent one is useful. The value of each series is its creation
+// timestamp, so the latest series is the one with the largest value.
+func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
+	if !ok {
+		klog.Warningf("metric family %q not found", LoraRequestInfoMetricName)
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
+	}
 	var latestTs float64
 	var latest *dto.Metric
 	for _, m := range loraRequests.GetMetric() {
@@ -166,6 +143,7 @@ func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time
 }
 
 // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
+// Since vLLM does not set timestamps on its metrics, this function effectively returns the first metric in the family.
 func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
 	mf, ok := metricFamilies[metricName]
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
index 1e0da622..f6ac403f 100644
--- a/pkg/ext-proc/backend/vllm/metrics_test.go
+++ b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -1,6 +1,7 @@
 package vllm
 
 import (
+	"fmt"
 	"testing"
 
 	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
@@ -69,19 +70,103 @@ func TestPromToPodMetrics(t *testing.T) {
 						},
 					},
 				},
-				ActiveLoRAAdaptersMetricName: {
+				LoraRequestInfoMetricName: {
 					Metric: []*dto.Metric{
 						{
 							Label: []*dto.LabelPair{
 								{
-									Name:  proto.String("active_adapters"),
-									Value: proto.String("lora1,lora2"),
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("lora3,lora4"),
+								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
 								},
 							},
 							Gauge: &dto.Gauge{
 								Value: proto.Float64(100),
 							},
 						},
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("lora2"),
+								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(90),
+							},
+						},
+					},
+				},
+			},
+			expectedMetrics: &backend.Metrics{
+				RunningQueueSize:    15,
+				WaitingQueueSize:    25,
+				KVCacheUsagePercent: 0.9,
+				ActiveModels: map[string]int{
+					"lora3": 0,
+					"lora4": 0,
+				},
+				MaxActiveModels: 2,
+			},
+			initialPodMetrics: &backend.PodMetrics{},
+			expectedErr:       nil,
+		},
+		{
+			name: "invalid max lora",
+			metricFamilies: map[string]*dto.MetricFamily{
+				RunningQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(10),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(15),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				WaitingQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(20),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(25),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				KVCacheUsagePercentMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.8),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.9),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
 					},
 				},
 				LoraRequestInfoMetricName: {
@@ -89,12 +174,12 @@ func TestPromToPodMetrics(t *testing.T) {
 					{
 						Label: []*dto.LabelPair{
 							{
-								Name:  proto.String("running_lora_adapters"),
+								Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
 								Value: proto.String("lora3,lora4"),
 							},
 							{
-								Name:  proto.String("waiting_lora_adapters"),
-								Value: proto.String("lora1,lora4"),
+								Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+								Value: proto.String("2a"),
 							},
 						},
 						Gauge: &dto.Gauge{
@@ -104,9 +189,13 @@ func TestPromToPodMetrics(t *testing.T) {
 					{
 						Label: []*dto.LabelPair{
 							{
-								Name:  proto.String("running_lora_adapters"),
+								Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
 								Value: proto.String("lora2"),
 							},
+							{
+								Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+								Value: proto.String("2"),
+							},
 						},
 						Gauge: &dto.Gauge{
 							Value: proto.Float64(90),
@@ -119,13 +208,14 @@ func TestPromToPodMetrics(t *testing.T) {
 				RunningQueueSize:    15,
 				WaitingQueueSize:    25,
 				KVCacheUsagePercent: 0.9,
-				CachedModels: map[string]int{
+				ActiveModels: map[string]int{
 					"lora3": 0,
 					"lora4": 0,
 				},
+				MaxActiveModels: 0,
 			},
 			initialPodMetrics: &backend.PodMetrics{},
-			expectedErr:       nil,
+			expectedErr:       fmt.Errorf("strconv.Atoi: parsing \"2a\": invalid syntax"),
 		},
 	}
 	for _, tc := range testCases {