From 07c9b9c37eb8896094ebe9782d4254cc3266dd20 Mon Sep 17 00:00:00 2001
From: Kunjan Patel
Date: Sun, 10 Nov 2024 13:03:07 -0800
Subject: [PATCH 1/2] Get running adapters from latest series in new metric

Signed-off-by: Kunjan Patel
---
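Note: the scraping code in this series keys off the new
`vllm:lora_requests_info` gauge family. As an illustration only (the HELP
line and the full label set below are assumptions, not taken from this patch
or verified against vLLM), a /metrics scrape exposes one such series per
label-value combination, and the gauge value is the creation timestamp of
the series:

    # HELP vllm:lora_requests_info Running stats on lora requests.
    # TYPE vllm:lora_requests_info gauge
    vllm:lora_requests_info{running_lora_adapters="lora1,lora2",waiting_lora_adapters=""} 1.731e+09

This patch reads only the `running_lora_adapters` label; patch 2/2 also
reads `max_lora`.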
 .../manifests/vllm/vllm-lora-deployment.yaml |   3 +-
 pkg/ext-proc/backend/vllm/metrics.go         |  51 ++++++-
 pkg/ext-proc/backend/vllm/metrics_test.go    | 142 ++++++++++++++++++
 3 files changed, 192 insertions(+), 4 deletions(-)
 create mode 100644 pkg/ext-proc/backend/vllm/metrics_test.go

diff --git a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
index 4f990c9e..1996c153 100644
--- a/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
+++ b/examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -30,7 +30,7 @@ spec:
     spec:
       containers:
       - name: lora
-        image: "ghcr.io/tomatillo-and-multiverse/vllm:demo"
+        image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:ac04a97a9fbc122bb14ff4eb590314d453cdf57c"
         imagePullPolicy: Always
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
@@ -40,7 +40,6 @@ spec:
         - "1"
         - "--port"
         - "8000"
-        - "--disable-log-requests"
        - "--enable-lora"
        - "--max-loras"
        - "4"
diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go
index f4540adf..0246bca7 100644
--- a/pkg/ext-proc/backend/vllm/metrics.go
+++ b/pkg/ext-proc/backend/vllm/metrics.go
@@ -17,7 +17,9 @@ import (
 )
 
 const (
-	ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info"
+	ActiveLoRAAdaptersMetricName        = "vllm:info_active_adapters_info"
+	LoraRequestInfoMetricName           = "vllm:lora_requests_info"
+	LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
 	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
 	RunningQueueSizeMetricName = "vllm:num_requests_running"
 	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
@@ -82,6 +84,9 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	if err == nil {
 		updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
 	}
+
+	loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName])
+	multierr.Append(errs, err)
 	/* TODO: uncomment once this is available in vllm.
 	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
 	errs = multierr.Append(errs, err)
@@ -112,12 +117,54 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 		}
 	} else {
 		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
-		errs = multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+	}
+
+	if loraMetrics != nil {
+		updated.Metrics.ActiveModels = make(map[string]int)
+		for _, label := range loraMetrics.GetLabel() {
+			if label.GetName() == "running_lora_adapters" {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.Metrics.ActiveModels[adapter] = 0
+					}
+				}
+			}
+		}
+
+	}
+
+	if loraMetrics != nil {
+		updated.CachedModels = make(map[string]int)
+		for _, label := range loraMetrics.GetLabel() {
+			if label.GetName() == "running_lora_adapters" {
+				if label.GetValue() != "" {
+					adapterList := strings.Split(label.GetValue(), ",")
+					for _, adapter := range adapterList {
+						updated.CachedModels[adapter] = 0
+					}
+				}
+			}
+		}
 	}
 
 	return updated, errs
 }
 
+func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	var latestTs float64
+	var latest *dto.Metric
+	for _, m := range loraRequests.GetMetric() {
+		if m.GetGauge().GetValue() > latestTs {
+			latestTs = m.GetGauge().GetValue()
+			latest = m
+		}
+	}
+	return latest, time.Unix(0, int64(latestTs*1000)), nil
+}
+
 // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
 func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
 	mf, ok := metricFamilies[metricName]
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
new file mode 100644
index 00000000..1e0da622
--- /dev/null
+++ b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -0,0 +1,142 @@
+package vllm
+
+import (
+	"testing"
+
+	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
+
+	dto "github.com/prometheus/client_model/go"
+	"github.com/stretchr/testify/assert"
+	"google.golang.org/protobuf/proto"
+)
+
+func TestPromToPodMetrics(t *testing.T) {
+	testCases := []struct {
+		name              string
+		metricFamilies    map[string]*dto.MetricFamily
+		expectedMetrics   *backend.Metrics
+		expectedErr       error
+		initialPodMetrics *backend.PodMetrics
+	}{
+		{
+			name: "all metrics available",
+			metricFamilies: map[string]*dto.MetricFamily{
+				RunningQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(10),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(15),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				WaitingQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(20),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(25),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				KVCacheUsagePercentMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.8),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.9),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				ActiveLoRAAdaptersMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("active_adapters"),
+									Value: proto.String("lora1,lora2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(100),
+							},
+						},
+					},
+				},
+				LoraRequestInfoMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("running_lora_adapters"),
+									Value: proto.String("lora3,lora4"),
+								},
+								{
+									Name:  proto.String("waiting_lora_adapters"),
+									Value: proto.String("lora1,lora4"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(100),
+							},
+						},
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String("running_lora_adapters"),
+									Value: proto.String("lora2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(90),
+							},
+						},
+					},
+				},
+			},
+			expectedMetrics: &backend.Metrics{
+				RunningQueueSize:    15,
+				WaitingQueueSize:    25,
+				KVCacheUsagePercent: 0.9,
+				CachedModels: map[string]int{
+					"lora3": 0,
+					"lora4": 0,
+				},
+			},
+			initialPodMetrics: &backend.PodMetrics{},
+			expectedErr:       nil,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics)
+			if tc.expectedErr != nil {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.Equal(t, tc.expectedMetrics, &updated.Metrics)
+			}
+		})
+	}
+}
From 3f5c7bb4de35c3d7cc02d9eacba8e3ac81944741 Mon Sep 17 00:00:00 2001
From: Kunjan Patel
Date: Sun, 10 Nov 2024 14:01:52 -0800
Subject: [PATCH 2/2] Get running adapters from latest series in new metric,
 add table-driven test function, delete old metrics

Signed-off-by: Kunjan Patel
Signed-off-by: Kunjan
---
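Note: a minimal, illustrative sketch of how the parsing path changed below
is exercised end to end. The address, the standalone package main, and the
panic-based error handling are assumptions for illustration; promToPodMetrics
is the unexported function modified in this patch, so outside the vllm
package it appears only in a comment:

	package main

	import (
		"fmt"
		"net/http"

		"github.com/prometheus/common/expfmt"
	)

	func main() {
		// Scrape the pod's Prometheus endpoint (address assumed).
		resp, err := http.Get("http://localhost:8000/metrics")
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		// Parse the exposition text into the same map[string]*dto.MetricFamily
		// that promToPodMetrics consumes.
		var parser expfmt.TextParser
		metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
		if err != nil {
			panic(err)
		}
		fmt.Printf("scraped %d metric families\n", len(metricFamilies))

		// From here the code in this patch takes over, e.g.:
		//   updated, errs := promToPodMetrics(metricFamilies, existing)
		// which reads running_lora_adapters and max_lora from the latest
		// vllm:lora_requests_info series.
	}

An info-style gauge like vllm:lora_requests_info emits a fresh series for
every distinct label-value combination, and stale series linger until the
exporter drops them; that is why getLatestLoraMetric below selects the single
series with the largest creation-timestamp value instead of merging them all.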
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" - dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" + "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" klog "k8s.io/klog/v2" ) const ( - ActiveLoRAAdaptersMetricName = "vllm:info_active_adapters_info" - LoraRequestInfoMetricName = "vllm:lora_requests_info" - LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters" + LoraRequestInfoMetricName = "vllm:lora_requests_info" + LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoMaxAdaptersMetricName = "max_lora" // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. RunningQueueSizeMetricName = "vllm:num_requests_running" WaitingQueueSizeMetricName = "vllm:num_requests_waiting" @@ -85,8 +85,8 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() } - loraMetrics, _, err := getLatestLoraMetric(metricFamilies[LoraRequestInfoMetricName]) - multierr.Append(errs, err) + loraMetrics, _, err := getLatestLoraMetric(metricFamilies) + errs = multierr.Append(errs, err) /* TODO: uncomment once this is available in vllm. kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName) errs = multierr.Append(errs, err) @@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac } */ - // TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available. - updated.MaxActiveModels = 4 - - // Update active loras - mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName] - if ok { - // IMPORTANT: replace the map entries instead of appending to it. 
@@ -95,54 +95,22 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	}
 	*/
 
-	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the is available.
-	updated.MaxActiveModels = 4
-
-	// Update active loras
-	mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
-	if ok {
-		// IMPORTANT: replace the map entries instead of appending to it.
-		updated.ActiveModels = make(map[string]int)
-		for _, metric := range mf.GetMetric() {
-			for _, label := range metric.GetLabel() {
-				if label.GetName() == "active_adapters" {
-					if label.GetValue() != "" {
-						adapterList := strings.Split(label.GetValue(), ",")
-						for _, adapter := range adapterList {
-							updated.ActiveModels[adapter] = 0
-						}
-					}
-				}
-			}
-		}
-	} else {
-		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
-		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
-	}
-
 	if loraMetrics != nil {
-		updated.Metrics.ActiveModels = make(map[string]int)
+		updated.ActiveModels = make(map[string]int)
 		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == "running_lora_adapters" {
+			if label.GetName() == LoraRequestInfoRunningAdaptersMetricName {
 				if label.GetValue() != "" {
 					adapterList := strings.Split(label.GetValue(), ",")
 					for _, adapter := range adapterList {
-						updated.Metrics.ActiveModels[adapter] = 0
+						updated.ActiveModels[adapter] = 0
 					}
 				}
 			}
-		}
-
-	}
-
-	if loraMetrics != nil {
-		updated.CachedModels = make(map[string]int)
-		for _, label := range loraMetrics.GetLabel() {
-			if label.GetName() == "running_lora_adapters" {
+			if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
 				if label.GetValue() != "" {
-					adapterList := strings.Split(label.GetValue(), ",")
-					for _, adapter := range adapterList {
-						updated.CachedModels[adapter] = 0
+					updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
+					if err != nil {
+						errs = multierr.Append(errs, err)
 					}
 				}
 			}
@@ -153,7 +121,16 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
 	return updated, errs
 }
 
-func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time, error) {
+// getLatestLoraMetric gets the latest series of the gauge metric family `vllm:lora_requests_info`.
+// It needs special handling because each distinct combination of label values creates a new
+// series, and only the most recent one is useful. The value of each series is its creation
+// timestamp, so the latest series is the one with the largest value.
+func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
+	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
+	if !ok {
+		klog.Warningf("metric family %q not found", LoraRequestInfoMetricName)
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
+	}
 	var latestTs float64
 	var latest *dto.Metric
 	for _, m := range loraRequests.GetMetric() {
@@ -166,6 +143,7 @@ func getLatestLoraMetric(loraRequests *dto.MetricFamily) (*dto.Metric, time.Time
 }
 
 // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
+// Since vLLM does not set timestamps on its metrics, this function effectively returns the first metric in the family.
 func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
 	mf, ok := metricFamilies[metricName]
diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go
index 1e0da622..f6ac403f 100644
--- a/pkg/ext-proc/backend/vllm/metrics_test.go
+++ b/pkg/ext-proc/backend/vllm/metrics_test.go
@@ -1,6 +1,7 @@
 package vllm
 
 import (
+	"fmt"
 	"testing"
 
 	"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
@@ -69,19 +70,103 @@ func TestPromToPodMetrics(t *testing.T) {
 						},
 					},
 				},
-				ActiveLoRAAdaptersMetricName: {
+				LoraRequestInfoMetricName: {
 					Metric: []*dto.Metric{
 						{
 							Label: []*dto.LabelPair{
 								{
-									Name:  proto.String("active_adapters"),
-									Value: proto.String("lora1,lora2"),
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("lora3,lora4"),
+								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
 								},
 							},
 							Gauge: &dto.Gauge{
 								Value: proto.Float64(100),
 							},
 						},
+						{
+							Label: []*dto.LabelPair{
+								{
+									Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
+									Value: proto.String("lora2"),
+								},
+								{
+									Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+									Value: proto.String("2"),
+								},
+							},
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(90),
+							},
+						},
+					},
+				},
+			},
+			expectedMetrics: &backend.Metrics{
+				RunningQueueSize:    15,
+				WaitingQueueSize:    25,
+				KVCacheUsagePercent: 0.9,
+				ActiveModels: map[string]int{
+					"lora3": 0,
+					"lora4": 0,
+				},
+				MaxActiveModels: 2,
+			},
+			initialPodMetrics: &backend.PodMetrics{},
+			expectedErr:       nil,
+		},
+		{
+			name: "invalid max lora",
+			metricFamilies: map[string]*dto.MetricFamily{
+				RunningQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(10),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(15),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				WaitingQueueSizeMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(20),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(25),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
+					},
+				},
+				KVCacheUsagePercentMetricName: {
+					Metric: []*dto.Metric{
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.8),
+							},
+							TimestampMs: proto.Int64(100),
+						},
+						{
+							Gauge: &dto.Gauge{
+								Value: proto.Float64(0.9),
+							},
+							TimestampMs: proto.Int64(200), // This is the latest
+						},
 					},
 				},
 				LoraRequestInfoMetricName: {
@@ -89,12 +174,12 @@ func TestPromToPodMetrics(t *testing.T) {
 					{
 						Label: []*dto.LabelPair{
 							{
-								Name:  proto.String("running_lora_adapters"),
+								Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
 								Value: proto.String("lora3,lora4"),
 							},
 							{
-								Name:  proto.String("waiting_lora_adapters"),
-								Value: proto.String("lora1,lora4"),
+								Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+								Value: proto.String("2a"),
 							},
 						},
 						Gauge: &dto.Gauge{
@@ -104,9 +189,13 @@ func TestPromToPodMetrics(t *testing.T) {
 					{
 						Label: []*dto.LabelPair{
 							{
-								Name:  proto.String("running_lora_adapters"),
+								Name:  proto.String(LoraRequestInfoRunningAdaptersMetricName),
 								Value: proto.String("lora2"),
 							},
+							{
+								Name:  proto.String(LoraRequestInfoMaxAdaptersMetricName),
+								Value: proto.String("2"),
+							},
 						},
 						Gauge: &dto.Gauge{
 							Value: proto.Float64(90),
@@ -119,13 +208,14 @@ func TestPromToPodMetrics(t *testing.T) {
 				RunningQueueSize:    15,
 				WaitingQueueSize:    25,
 				KVCacheUsagePercent: 0.9,
-				CachedModels: map[string]int{
+				ActiveModels: map[string]int{
 					"lora3": 0,
 					"lora4": 0,
 				},
+				MaxActiveModels: 0,
 			},
 			initialPodMetrics: &backend.PodMetrics{},
-			expectedErr:       nil,
+			expectedErr:       fmt.Errorf("strconv.Atoi: parsing \"2a\": invalid syntax"),
 		},
 	}
 	for _, tc := range testCases {