Get running adapters from latest series in new metric

coolkp · coolkp · commit 07c9b9c37eb8 · 2024-12-11T10:21:48.000-08:00
Signed-off-by: Kunjan Patel <kunjanp@google.com>
diff --git a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -30,7 +30,7 @@ spec:
     spec:
       containers:
         - name: lora
-          image: "ghcr.io/tomatillo-and-multiverse/vllm:demo"
+          image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:ac04a97a9fbc122bb14ff4eb590314d453cdf57c"
           imagePullPolicy: Always
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args:
@@ -40,7 +40,6 @@ spec:
           - "1"
           - "--port"
           - "8000"
-          - "--disable-log-requests"
           - "--enable-lora"
           - "--max-loras"
           - "4"
diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go
@@ -17,7 +17,9 @@ import (
 )
 
 const (
-	ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
+	ActiveLoRAAdaptersMetricName        = "vllm:info_active_adapters_info"
+	LoraRequestInfoMetricName           = "vllm:lora_requests_info"
+	LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
 	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
 	RunningQueueSizeMetricName = "vllm:num_requests_running"
 	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -82,6 +84,9 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	if err == nil {
 		updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
 	}
+
+	loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName])
+	multierr.Append(errs, err)
 	/* TODO: uncomment once this is available in vllm.
 	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
 	errs = multierr.Append(errs, err)
@@ -112,12 +117,54 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 		}
 	} else {
 		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
-		errs = multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+	}
+
+	if loraMetrics != nil {
+		updated.Metrics.ActiveModels = make(map[string]int)
+		for _, label := range loraMetrics.GetLabel() {
+			if label.GetName() == "running_lora_adapters" {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.Metrics.ActiveModels[adapter] = 0
+					}
+				}
+			}
+		}
+
+	}
+
+	if loraMetrics != nil {
+		updated.CachedModels = make(map[string]int)
+		for _, label := range loraMetrics.GetLabel() {
+			if label.GetName() == "running_lora_adapters" {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.CachedModels[adapter] = 0
+					}
+				}
+			}
+		}
+
 	}
 
 	return updated, errs
 }
 
+// getLatestLoraMetric returns the series with the largest gauge value (nil if none is > 0), that value as a time, and a nil error.
+func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	var newest *dto.Metric
+	var newestTs float64
+	for _, series := range loraRequests.GetMetric() {
+		if ts := series.GetGauge().GetValue(); ts > newestTs {
+			newest, newestTs = series, ts
+		}
+	}
+	return newest, time.Unix(0, int64(newestTs*1000)), nil // NOTE(review): value*1000 is passed as nanoseconds — confirm the gauge's unit
+}
+
 // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
 func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
 	mf, ok := metricFamilies[metricName]
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -0,0 +1,142 @@
+package vllm
+
+import (
+	"testing"
+
+	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
+
+	dto "github.com/prometheus/client_model/go"
+	"github.com/stretchr/testify/assert"
+	"google.golang.org/protobuf/proto"
+)
+
+func TestPromToPodMetrics(t *testing.T) { // Table-driven: runs promToPodMetrics on each case's metric families and compares the returned Metrics.
+	testCases := []struct {
+		name              string
+		metricFamilies    map[string]*dto.MetricFamily
+		expectedMetrics   *backend.Metrics
+		expectedErr       error
+		initialPodMetrics *backend.PodMetrics
+	}{
+		{
+			name: "all metrics available",
+			metricFamilies: map[string]*dto.MetricFamily{
+				RunningQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(10),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(15),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				WaitingQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(20),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(25),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				KVCacheUsagePercentMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.8),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.9),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				ActiveLoRAAdaptersMetricName: { // lora1,lora2 from this family are absent from expectedMetrics below
+					Metric: []*dto.Metric{
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("active_adapters"),
+									Value: proto.String("lora1,lora2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(100),
+							},
+						},
+					},
+				},
+				LoraRequestInfoMetricName: { // two series; getLatestLoraMetric keeps the one with the larger gauge value (100)
+					Metric: []*dto.Metric{
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("running_lora_adapters"),
+									Value: proto.String("lora3,lora4"),
+								},
+								{
+									Name:  proto.String("waiting_lora_adapters"),
+									Value: proto.String("lora1,lora4"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(100),
+							},
+						},
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("running_lora_adapters"),
+									Value: proto.String("lora2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(90),
+							},
+						},
+					},
+				},
+			},
+			expectedMetrics: &backend.Metrics{
+				RunningQueueSize:    15,
+				WaitingQueueSize:    25,
+				KVCacheUsagePercent: 0.9,
+				CachedModels: map[string]int{ // the running_lora_adapters of the gauge-100 series above
+					"lora3": 0,
+					"lora4": 0,
+				},
+			},
+			initialPodMetrics: &backend.PodMetrics{},
+			expectedErr:       nil,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics)
+			if tc.expectedErr != nil { // only the presence of an error is asserted, not its value
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.Equal(t, tc.expectedMetrics, &updated.Metrics) // compares only the Metrics field of the result
+			}
+		})
+	}
+}