From ff259d80f5da49cdfef8d8ec85c18505870ae263 Mon Sep 17 00:00:00 2001 From: Jie Wu Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH 01/25] add request metrics --- go.mod | 3 + go.sum | 4 + pkg/ext-proc/metrics/metrics.go | 68 ++++++++++++++++ pkg/ext-proc/metrics/metrics_test.go | 78 +++++++++++++++++++ .../testdata/request_duration_seconds_metic | 0 .../metrics/testdata/request_sizes_metric | 0 .../metrics/testdata/request_total_metric | 5 ++ pkg/ext-proc/scheduling/filter.go | 6 +- 8 files changed, 161 insertions(+), 3 deletions(-) create mode 100644 pkg/ext-proc/metrics/metrics.go create mode 100644 pkg/ext-proc/metrics/metrics_test.go create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metic create mode 100644 pkg/ext-proc/metrics/testdata/request_sizes_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_total_metric diff --git a/go.mod b/go.mod index 5df490da..e3b330c4 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,7 @@ require ( k8s.io/apimachinery v0.31.3 k8s.io/client-go v0.31.3 k8s.io/code-generator v0.31.3 + k8s.io/component-base v0.31.3 k8s.io/klog/v2 v2.130.1 sigs.k8s.io/controller-runtime v0.19.3 sigs.k8s.io/structured-merge-diff/v4 v4.4.3 @@ -35,6 +36,7 @@ require ( github.com/Masterminds/sprig/v3 v3.2.3 // indirect github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect github.com/bufbuild/protocompile v0.14.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -63,6 +65,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.9 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/copystructure v1.0.0 // indirect github.com/mitchellh/reflectwalk v1.0.1 // 
indirect diff --git a/go.sum b/go.sum index 66145152..8abb6124 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafo github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo= github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE= github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw= @@ -264,6 +266,8 @@ k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4= k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs= k8s.io/code-generator v0.31.3 h1:Pj0fYOBms+ZrsulLi4DMsCEx1jG8fWKRLy44onHsLBI= k8s.io/code-generator v0.31.3/go.mod h1:/umCIlT84g1+Yu5ZXtP1KGSRTnGiIzzX5AzUAxsNlts= +k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ= +k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go new file mode 100644 index 00000000..cae38cf0 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics.go @@ -0,0 +1,68 @@ +package metrics + +import ( + "sync" + "time" + + compbasemetrics "k8s.io/component-base/metrics" + 
"k8s.io/component-base/metrics/legacyregistry" +) + +const ( + LLMServiceModelComponent = "llmservice_model" +) + +var ( + requestCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_total", + Help: "Counter of LLM service requests broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestLatencies = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_duration_seconds", + Help: "LLM service response latency distribution in seconds for each model and target model.", + Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestSizes = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_sizes", + Help: "LLM service requests size distribution in bytes for each model and target model.", + // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). + Buckets: compbasemetrics.ExponentialBuckets(1000, 10.0, 7), + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) +) + +var registerMetrics sync.Once + +// Register all metrics. +func Register() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestLatencies) + legacyregistry.MustRegister(requestSizes) + }) +} + +// MonitorRequest handles monitoring requests. 
+func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) { + elapsedSeconds := elapsed.Seconds() + requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc() + requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds) + requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize)) +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go new file mode 100644 index 00000000..d980a1cf --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -0,0 +1,78 @@ +package metrics + +import ( + "os" + "testing" + "time" + + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" +) + +const RequestTotalMetric = LLMServiceModelComponent + "_request_total" + +func TestMonitorRequest(t *testing.T) { + type requests struct { + llmserviceName string + modelName string + targetModelName string + reqSize int + elapsed time.Duration + } + scenarios := []struct { + name string + reqs []requests + }{{ + name: "multiple requests", + reqs: []requests{ + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 10, + elapsed: time.Millisecond * 10, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 20, + elapsed: time.Millisecond * 20, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t11", + reqSize: 30, + elapsed: time.Millisecond * 30, + }, + { + llmserviceName: "s20", + modelName: "m20", + targetModelName: "t20", + reqSize: 40, + elapsed: time.Millisecond * 40, + }, + }, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed) + } + wantRequestTotal, err := os.Open("testdata/request_total_metric") + 
defer func() { + if err := wantRequestTotal.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil { + t.Error(err) + } + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic new file mode 100644 index 00000000..e69de29b diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/ext-proc/metrics/testdata/request_sizes_metric new file mode 100644 index 00000000..e69de29b diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric new file mode 100644 index 00000000..f31feb65 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -0,0 +1,5 @@ +# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model. +# TYPE llmservice_model_request_total counter +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1 +llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1 diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index 55330884..bcc4432b 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -157,9 +157,9 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool // We consider serving an adapter low cost it the adapter is active in the model server, or the -// model server has room to load the adapter. 
The lowLoRACostPredicate ensures weak affinity by spreading the -// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod. -// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters. +// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by spreading the +// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod. +// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters. func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { _, ok := pod.ActiveModels[req.ResolvedTargetModel] return ok || len(pod.ActiveModels) < pod.MaxActiveModels From bb98cdc996d87cd36f37ebcc5998619f87c98d0a Mon Sep 17 00:00:00 2001 From: Jie Wu Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH 02/25] add request metrics --- go.mod | 3 + go.sum | 4 + pkg/ext-proc/metrics/metrics.go | 72 +++++++++++ pkg/ext-proc/metrics/metrics_test.go | 105 ++++++++++++++++ .../testdata/request_duration_seconds_metric | 116 ++++++++++++++++++ .../metrics/testdata/request_sizes_metric | 86 +++++++++++++ .../metrics/testdata/request_total_metric | 5 + pkg/ext-proc/scheduling/filter.go | 6 +- 8 files changed, 394 insertions(+), 3 deletions(-) create mode 100644 pkg/ext-proc/metrics/metrics.go create mode 100644 pkg/ext-proc/metrics/metrics_test.go create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_sizes_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_total_metric diff --git a/go.mod b/go.mod index 5df490da..e3b330c4 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,7 @@ require ( k8s.io/apimachinery v0.31.3 k8s.io/client-go v0.31.3 k8s.io/code-generator v0.31.3 + k8s.io/component-base v0.31.3 k8s.io/klog/v2 v2.130.1 sigs.k8s.io/controller-runtime 
v0.19.3 sigs.k8s.io/structured-merge-diff/v4 v4.4.3 @@ -35,6 +36,7 @@ require ( github.com/Masterminds/sprig/v3 v3.2.3 // indirect github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect github.com/bufbuild/protocompile v0.14.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -63,6 +65,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.9 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/copystructure v1.0.0 // indirect github.com/mitchellh/reflectwalk v1.0.1 // indirect diff --git a/go.sum b/go.sum index 66145152..8abb6124 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafo github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo= github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE= github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw= @@ -264,6 +266,8 @@ k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4= k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs= k8s.io/code-generator v0.31.3 h1:Pj0fYOBms+ZrsulLi4DMsCEx1jG8fWKRLy44onHsLBI= 
k8s.io/code-generator v0.31.3/go.mod h1:/umCIlT84g1+Yu5ZXtP1KGSRTnGiIzzX5AzUAxsNlts= +k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ= +k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go new file mode 100644 index 00000000..fe879724 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics.go @@ -0,0 +1,72 @@ +package metrics + +import ( + "sync" + "time" + + compbasemetrics "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +const ( + LLMServiceModelComponent = "llmservice_model" +) + +var ( + requestCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_total", + Help: "Counter of LLM service requests broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestLatencies = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_duration_seconds", + Help: "LLM service response latency distribution in seconds for each model and target model.", + Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestSizes = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: 
"request_sizes", + Help: "LLM service requests size distribution in bytes for each model and target model.", + // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). + Buckets: []float64{ + 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, // Exponential up to 8MB + 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, // Exponential up to 1GB + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) +) + +var registerMetrics sync.Once + +// Register all metrics. +func Register() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestLatencies) + legacyregistry.MustRegister(requestSizes) + }) +} + +// MonitorRequest handles monitoring requests. +func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) { + elapsedSeconds := elapsed.Seconds() + requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc() + requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds) + requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize)) +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go new file mode 100644 index 00000000..4d33c6f0 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -0,0 +1,105 @@ +package metrics + +import ( + "os" + "testing" + "time" + + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" +) + +const RequestTotalMetric = LLMServiceModelComponent + "_request_total" +const RequestLatenciesMetric = LLMServiceModelComponent + "_request_duration_seconds" +const RequestSizesMetric = LLMServiceModelComponent + "_request_sizes" + +func TestMonitorRequest(t *testing.T) { + 
type requests struct { + llmserviceName string + modelName string + targetModelName string + reqSize int + elapsed time.Duration + } + scenarios := []struct { + name string + reqs []requests + }{{ + name: "multiple requests", + reqs: []requests{ + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 1200, + elapsed: time.Millisecond * 10, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 500, + elapsed: time.Millisecond * 1600, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t11", + reqSize: 2480, + elapsed: time.Millisecond * 60, + }, + { + llmserviceName: "s20", + modelName: "m20", + targetModelName: "t20", + reqSize: 80, + elapsed: time.Millisecond * 120, + }, + }, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed) + } + wantRequestTotal, err := os.Open("testdata/request_total_metric") + defer func() { + if err := wantRequestTotal.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil { + t.Error(err) + } + wantRequestLatencies, err := os.Open("testdata/request_duration_seconds_metric") + defer func() { + if err := wantRequestLatencies.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestLatencies, RequestLatenciesMetric); err != nil { + t.Error(err) + } + wantRequestSizes, err := os.Open("testdata/request_sizes_metric") + defer func() { + if err := wantRequestSizes.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, 
wantRequestSizes, RequestSizesMetric); err != nil { + t.Error(err) + } + + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric new file mode 100644 index 00000000..921a03df --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric @@ -0,0 +1,116 @@ +# HELP llmservice_model_request_duration_seconds [ALPHA] LLM service response latency distribution in seconds for each model and target model. +# TYPE llmservice_model_request_duration_seconds histogram +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.025"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.05"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.0"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.5"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="4"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="5"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="6"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="8"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="10"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="15"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="20"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="30"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="45"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="60"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="120"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="180"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="240"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", 
target_model_name="t10", le="300"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="360"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="480"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="600"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="900"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1200"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1800"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2700"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3600"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="Inf"} 2 +llmservice_model_request_duration_seconds_sum{llmservice_name="s10", model_name="m10", target_model_name="t10"} 1.61 +llmservice_model_request_duration_seconds_count{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.025"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.05"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.1"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="10"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="15"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="20"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="30"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="45"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="60"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="120"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="180"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="240"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="300"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="360"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="480"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="900"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1200"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1800"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2700"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 
1 +llmservice_model_request_duration_seconds_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 0.06 +llmservice_model_request_duration_seconds_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.025"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.05"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.1"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="10"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="15"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="20"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="30"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="45"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="60"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="120"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="180"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="240"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="300"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="360"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="480"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="600"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="900"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1200"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1800"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2700"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 +llmservice_model_request_duration_seconds_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 0.12 +llmservice_model_request_duration_seconds_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/ext-proc/metrics/testdata/request_sizes_metric new file mode 100644 index 00000000..54f92c99 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_sizes_metric @@ -0,0 +1,86 @@ +# HELP llmservice_model_request_sizes [ALPHA] LLM service requests size distribution in bytes for each model and target model. 
+# TYPE llmservice_model_request_sizes histogram +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="128"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="256"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="512"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1024"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2048"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4096"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8192"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="16384"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="32768"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="65536"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="131072"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="262144"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="524288"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="+Inf"} 2 +llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t10"} 1700 +llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t10"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="128"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="256"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="512"} 0 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1024"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2048"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4096"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8192"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="16384"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="32768"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="65536"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="131072"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="262144"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="524288"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 +llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 2480 +llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="128"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="256"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="512"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1024"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2048"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4096"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8192"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="16384"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="32768"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="65536"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="131072"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="262144"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="524288"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 +llmservice_model_request_sizes_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 80 +llmservice_model_request_sizes_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric new file mode 100644 index 00000000..f31feb65 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -0,0 +1,5 @@ +# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model. +# TYPE llmservice_model_request_total counter +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1 +llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1 diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index 55330884..bcc4432b 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -157,9 +157,9 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool // We consider serving an adapter low cost it the adapter is active in the model server, or the -// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by spreading the -// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod. -// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters. +// model server has room to load the adapter. 
The lowLoRACostPredicate ensures weak affinity by spreading the +// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod. +// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters. func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { _, ok := pod.ActiveModels[req.ResolvedTargetModel] return ok || len(pod.ActiveModels) < pod.MaxActiveModels From 7791550aaebd8a00abef89bfb953b826739fd43c Mon Sep 17 00:00:00 2001 From: Jie Wu Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH 03/25] add request metrics --- pkg/ext-proc/metrics/metrics.go | 72 +++++++++++ pkg/ext-proc/metrics/metrics_test.go | 105 ++++++++++++++++ .../testdata/request_duration_seconds_metric | 116 ++++++++++++++++++ .../metrics/testdata/request_sizes_metric | 86 +++++++++++++ .../metrics/testdata/request_total_metric | 5 + 5 files changed, 384 insertions(+) create mode 100644 pkg/ext-proc/metrics/metrics.go create mode 100644 pkg/ext-proc/metrics/metrics_test.go create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_sizes_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_total_metric diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go new file mode 100644 index 00000000..fe879724 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics.go @@ -0,0 +1,72 @@ +package metrics + +import ( + "sync" + "time" + + compbasemetrics "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +const ( + LLMServiceModelComponent = "llmservice_model" +) + +var ( + requestCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_total", + Help: "Counter of LLM service requests broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + 
[]string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestLatencies = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_duration_seconds", + Help: "LLM service response latency distribution in seconds for each model and target model.", + Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestSizes = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_sizes", + Help: "LLM service requests size distribution in bytes for each model and target model.", + // Use buckets ranging from 64 bytes to 2^30 bytes (1GB). + Buckets: []float64{ + 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, // Exponential up to 8MB + 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, // Exponential up to 1GB + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) +) + +var registerMetrics sync.Once + +// Register all metrics. +func Register() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestLatencies) + legacyregistry.MustRegister(requestSizes) + }) +} + +// MonitorRequest records the request count, latency, and request-size metrics for a single request. 
+func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) { + elapsedSeconds := elapsed.Seconds() + requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc() + requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds) + requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize)) +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go new file mode 100644 index 00000000..4d33c6f0 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -0,0 +1,105 @@ +package metrics + +import ( + "os" + "testing" + "time" + + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" +) + +const RequestTotalMetric = LLMServiceModelComponent + "_request_total" +const RequestLatenciesMetric = LLMServiceModelComponent + "_request_duration_seconds" +const RequestSizesMetric = LLMServiceModelComponent + "_request_sizes" + +func TestMonitorRequest(t *testing.T) { + type requests struct { + llmserviceName string + modelName string + targetModelName string + reqSize int + elapsed time.Duration + } + scenarios := []struct { + name string + reqs []requests + }{{ + name: "multiple requests", + reqs: []requests{ + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 1200, + elapsed: time.Millisecond * 10, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 500, + elapsed: time.Millisecond * 1600, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t11", + reqSize: 2480, + elapsed: time.Millisecond * 60, + }, + { + llmserviceName: "s20", + modelName: "m20", + targetModelName: "t20", + reqSize: 80, + elapsed: time.Millisecond * 120, + }, + }, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + 
MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed) + } + wantRequestTotal, err := os.Open("testdata/request_total_metric") + defer func() { + if err := wantRequestTotal.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil { + t.Error(err) + } + wantRequestLatencies, err := os.Open("testdata/request_duration_seconds_metric") + defer func() { + if err := wantRequestLatencies.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestLatencies, RequestLatenciesMetric); err != nil { + t.Error(err) + } + wantRequestSizes, err := os.Open("testdata/request_sizes_metric") + defer func() { + if err := wantRequestSizes.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil { + t.Error(err) + } + + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric new file mode 100644 index 00000000..921a03df --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric @@ -0,0 +1,116 @@ +# HELP llmservice_model_request_duration_seconds [ALPHA] LLM service response latency distribution in seconds for each model and target model. 
+# TYPE llmservice_model_request_duration_seconds histogram +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.025"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.05"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="4"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="5"} 2 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="6"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="8"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="10"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="15"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="20"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="30"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="45"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="60"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="120"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="180"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="240"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="300"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="360"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="480"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="600"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", 
model_name="m10", target_model_name="t10", le="900"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1200"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1800"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2700"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3600"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="+Inf"} 2 +llmservice_model_request_duration_seconds_sum{llmservice_name="s10", model_name="m10", target_model_name="t10"} 1.61 +llmservice_model_request_duration_seconds_count{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.025"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.05"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.8"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="10"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="15"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="20"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="30"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="45"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="60"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="120"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="180"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="240"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="300"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="360"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="480"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="900"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1200"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1800"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2700"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 +llmservice_model_request_duration_seconds_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 0.06 +llmservice_model_request_duration_seconds_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.025"} 0 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.05"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.1"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="10"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="15"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="20"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="30"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="45"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="60"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="120"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="180"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="240"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="300"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="360"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="480"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="900"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1200"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1800"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2700"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 +llmservice_model_request_duration_seconds_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 0.12 +llmservice_model_request_duration_seconds_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/ext-proc/metrics/testdata/request_sizes_metric new file mode 100644 index 00000000..54f92c99 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_sizes_metric @@ -0,0 +1,86 @@ +# HELP llmservice_model_request_sizes [ALPHA] LLM service requests size distribution in bytes for each model and target model. +# TYPE llmservice_model_request_sizes histogram +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="128"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="256"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="512"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1024"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2048"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4096"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8192"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="16384"} 2 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="32768"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="65536"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="131072"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="262144"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="524288"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="+Inf"} 2 +llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t10"} 1700 +llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t10"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="128"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="256"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="512"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1024"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2048"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4096"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8192"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="16384"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="32768"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="65536"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="131072"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="262144"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="524288"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 +llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 2480 +llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="128"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="256"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="512"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1024"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2048"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4096"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8192"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="16384"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="32768"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="65536"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="131072"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="262144"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="524288"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 +llmservice_model_request_sizes_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 80 +llmservice_model_request_sizes_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric new file mode 100644 index 00000000..f31feb65 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -0,0 +1,5 @@ +# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model. 
+# TYPE llmservice_model_request_total counter +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1 +llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1 From 129a756d75787e1429f7146abc5764949eee294c Mon Sep 17 00:00:00 2001 From: Jie Wu Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH 04/25] add request metrics --- pkg/ext-proc/metrics/testdata/request_duration_seconds_metic | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metic diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic new file mode 100644 index 00000000..e69de29b From e8d45f0cebf7df87eab4a59e0a0b9e554e3f909a Mon Sep 17 00:00:00 2001 From: Jie WU Date: Mon, 6 Jan 2025 16:39:38 +0000 Subject: [PATCH 05/25] rename api and metrics --- pkg/ext-proc/metrics/metrics.go | 28 +-- pkg/ext-proc/metrics/metrics_test.go | 13 +- .../testdata/request_duration_seconds_metic | 0 .../testdata/request_duration_seconds_metric | 232 +++++++++--------- .../metrics/testdata/request_sizes_metric | 172 ++++++------- .../metrics/testdata/request_total_metric | 10 +- 6 files changed, 225 insertions(+), 230 deletions(-) delete mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metic diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index fe879724..4ed823a6 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -9,37 +9,37 @@ import ( ) const ( - LLMServiceModelComponent = "llmservice_model" + InferenceModelComponent = "inference_model" ) var ( requestCounter = compbasemetrics.NewCounterVec( &compbasemetrics.CounterOpts{ - Subsystem: LLMServiceModelComponent, + Subsystem: InferenceModelComponent, Name: 
"request_total", - Help: "Counter of LLM service requests broken out for each model and target model.", + Help: "Counter of inference model requests broken out for each model and target model.", StabilityLevel: compbasemetrics.ALPHA, }, - []string{"llmservice_name", "model_name", "target_model_name"}, + []string{"model_name", "target_model_name"}, ) requestLatencies = compbasemetrics.NewHistogramVec( &compbasemetrics.HistogramOpts{ - Subsystem: LLMServiceModelComponent, + Subsystem: InferenceModelComponent, Name: "request_duration_seconds", - Help: "LLM service response latency distribution in seconds for each model and target model.", + Help: "Inference model response latency distribution in seconds for each model and target model.", Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, StabilityLevel: compbasemetrics.ALPHA, }, - []string{"llmservice_name", "model_name", "target_model_name"}, + []string{"model_name", "target_model_name"}, ) requestSizes = compbasemetrics.NewHistogramVec( &compbasemetrics.HistogramOpts{ - Subsystem: LLMServiceModelComponent, + Subsystem: InferenceModelComponent, Name: "request_sizes", - Help: "LLM service requests size distribution in bytes for each model and target model.", + Help: "Inference model requests size distribution in bytes for each model and target model.", // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). Buckets: []float64{ 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB @@ -48,7 +48,7 @@ var ( }, StabilityLevel: compbasemetrics.ALPHA, }, - []string{"llmservice_name", "model_name", "target_model_name"}, + []string{"model_name", "target_model_name"}, ) ) @@ -64,9 +64,9 @@ func Register() { } // MonitorRequest handles monitoring requests. 
-func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) { +func MonitorRequest(modelName, targetModelName string, reqSize int, elapsed time.Duration) { elapsedSeconds := elapsed.Seconds() - requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc() - requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds) - requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize)) + requestCounter.WithLabelValues(modelName, targetModelName).Inc() + requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds) + requestSizes.WithLabelValues(modelName, targetModelName).Observe(float64(reqSize)) } diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index 4d33c6f0..df83a5ed 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -9,13 +9,12 @@ import ( "k8s.io/component-base/metrics/testutil" ) -const RequestTotalMetric = LLMServiceModelComponent + "_request_total" -const RequestLatenciesMetric = LLMServiceModelComponent + "_request_duration_seconds" -const RequestSizesMetric = LLMServiceModelComponent + "_request_sizes" +const RequestTotalMetric = InferenceModelComponent + "_request_total" +const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" +const RequestSizesMetric = InferenceModelComponent + "_request_sizes" func TestMonitorRequest(t *testing.T) { type requests struct { - llmserviceName string modelName string targetModelName string reqSize int @@ -28,28 +27,24 @@ func TestMonitorRequest(t *testing.T) { name: "multiple requests", reqs: []requests{ { - llmserviceName: "s10", modelName: "m10", targetModelName: "t10", reqSize: 1200, elapsed: time.Millisecond * 10, }, { - llmserviceName: "s10", modelName: "m10", targetModelName: "t10", reqSize: 500, elapsed: time.Millisecond * 1600, }, { - 
llmserviceName: "s10", modelName: "m10", targetModelName: "t11", reqSize: 2480, elapsed: time.Millisecond * 60, }, { - llmserviceName: "s20", modelName: "m20", targetModelName: "t20", reqSize: 80, @@ -61,7 +56,7 @@ func TestMonitorRequest(t *testing.T) { for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { - MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed) + MonitorRequest(req.modelName, req.targetModelName, req.reqSize, req.elapsed) } wantRequestTotal, err := os.Open("testdata/request_total_metric") defer func() { diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic deleted file mode 100644 index e69de29b..00000000 diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric index 921a03df..6c70b4ba 100644 --- a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric +++ b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric @@ -1,116 +1,116 @@ -# HELP llmservice_model_request_duration_seconds [ALPHA] LLM service response latency distribution in seconds for each model and target model. 
-# TYPE llmservice_model_request_duration_seconds histogram -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.005"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.025"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.05"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.0"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.25"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="4"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="5"} 2 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="6"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="8"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="10"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="15"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="20"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="30"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="45"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="60"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="120"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="180"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="240"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="300"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="360"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="480"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="600"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", 
model_name="m10", target_model_name="t10", le="900"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1200"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1800"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2700"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3600"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="Inf"} 2 -llmservice_model_request_duration_seconds_sum{llmservice_name="s10", model_name="m10", target_model_name="t10"} 1.61 -llmservice_model_request_duration_seconds_count{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.005"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.025"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.05"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.8"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.25"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="10"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="15"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="20"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="30"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="45"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="60"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="120"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="180"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="240"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="300"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="360"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="480"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="900"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1200"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1800"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2700"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 -llmservice_model_request_duration_seconds_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 0.06 -llmservice_model_request_duration_seconds_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.005"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.025"} 0 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.05"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.1"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.25"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="10"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="15"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="20"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="30"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="45"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="60"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="120"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="180"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="240"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="300"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="360"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="480"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="900"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1200"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1800"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2700"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 -llmservice_model_request_duration_seconds_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 0.12 -llmservice_model_request_duration_seconds_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 +# HELP inference_model_request_duration_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. +# TYPE inference_model_request_duration_seconds histogram +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 
+inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="15"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="30"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="240"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 
+inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2700"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 +inference_model_request_duration_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 +inference_model_request_duration_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_duration_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 +inference_model_request_duration_seconds_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_duration_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 +inference_model_request_duration_seconds_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/ext-proc/metrics/testdata/request_sizes_metric index 54f92c99..ceca532e 100644 --- a/pkg/ext-proc/metrics/testdata/request_sizes_metric +++ b/pkg/ext-proc/metrics/testdata/request_sizes_metric @@ -1,86 +1,86 @@ -# HELP llmservice_model_request_sizes [ALPHA] LLM service requests size distribution in bytes for each model and target model. 
-# TYPE llmservice_model_request_sizes histogram -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="64"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="128"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="256"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="512"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1024"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2048"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4096"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8192"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="16384"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="32768"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="65536"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="131072"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="262144"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="524288"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 
-llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="+Inf"} 2 -llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t10"} 1700 -llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t10"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="64"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="128"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="256"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="512"} 0 
-llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1024"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2048"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4096"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8192"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="16384"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="32768"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="65536"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="131072"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="262144"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="524288"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 
-llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 -llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 2480 -llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="64"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="128"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="256"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="512"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1024"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2048"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4096"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8192"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="16384"} 1 
-llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="32768"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="65536"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="131072"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="262144"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="524288"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 
-llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 -llmservice_model_request_sizes_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 80 -llmservice_model_request_sizes_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 +# HELP inference_model_request_sizes [ALPHA] Inference model requests size distribution in bytes for each model and target model. +# TYPE inference_model_request_sizes histogram +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="32768"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 
+inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_model_request_sizes_sum{model_name="m10",target_model_name="t10"} 1700 +inference_model_request_sizes_count{model_name="m10",target_model_name="t10"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 
+inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="32768"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_sizes_sum{model_name="m10",target_model_name="t11"} 2480 +inference_model_request_sizes_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 
+inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="32768"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 
+inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_sizes_sum{model_name="m20",target_model_name="t20"} 80 +inference_model_request_sizes_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric index f31feb65..9c6f48a3 100644 --- a/pkg/ext-proc/metrics/testdata/request_total_metric +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -1,5 +1,5 @@ -# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model. -# TYPE llmservice_model_request_total counter -llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 -llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1 -llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1 +# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+# TYPE inference_model_request_total counter +inference_model_request_total{model_name="m10", target_model_name="t10"} 2 +inference_model_request_total{model_name="m10", target_model_name="t11"} 1 +inference_model_request_total{model_name="m20", target_model_name="t20"} 1 From 28f39996e33eeec34386e8b213ef0001c03a6cc4 Mon Sep 17 00:00:00 2001 From: Jie WU Date: Mon, 6 Jan 2025 16:47:09 +0000 Subject: [PATCH 06/25] fix go mod --- go.mod | 26 ++++++------- go.sum | 64 ++++++++++++++++++------------- pkg/ext-proc/scheduling/filter.go | 14 +++---- 3 files changed, 58 insertions(+), 46 deletions(-) diff --git a/go.mod b/go.mod index e3b330c4..b61512f5 100644 --- a/go.mod +++ b/go.mod @@ -10,26 +10,26 @@ require ( github.com/google/go-cmp v0.6.0 github.com/jhump/protoreflect v1.17.0 github.com/onsi/ginkgo/v2 v2.22.0 - github.com/onsi/gomega v1.36.0 + github.com/onsi/gomega v1.36.1 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.61.0 github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 - google.golang.org/grpc v1.68.0 - google.golang.org/protobuf v1.35.2 - k8s.io/api v0.31.3 - k8s.io/apimachinery v0.31.3 - k8s.io/client-go v0.31.3 - k8s.io/code-generator v0.31.3 - k8s.io/component-base v0.31.3 + google.golang.org/grpc v1.69.0 + google.golang.org/protobuf v1.36.0 + k8s.io/api v0.31.4 + k8s.io/apimachinery v0.31.4 + k8s.io/client-go v0.31.4 + k8s.io/code-generator v0.31.4 + k8s.io/component-base v0.31.4 k8s.io/klog/v2 v2.130.1 sigs.k8s.io/controller-runtime v0.19.3 - sigs.k8s.io/structured-merge-diff/v4 v4.4.3 + sigs.k8s.io/structured-merge-diff/v4 v4.5.0 ) require ( - cel.dev/expr v0.16.1 // indirect - cloud.google.com/go/compute/metadata v0.5.0 // indirect + cel.dev/expr v0.16.2 // indirect + cloud.google.com/go/compute/metadata v0.5.2 // indirect github.com/BurntSushi/toml v1.1.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver/v3 v3.2.0 // indirect @@ -93,8 +93,8 @@ require ( 
golang.org/x/time v0.3.0 // indirect golang.org/x/tools v0.26.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20241015192408-796eee8c2d53 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index 8abb6124..7d4c4e0d 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ -cel.dev/expr v0.16.1 h1:NR0+oFYzR1CqLFhTAqg3ql59G9VfN8fKq1TCHJ6gq1g= -cel.dev/expr v0.16.1/go.mod h1:AsGA5zb3WruAEQeQng1RZdGEXmBj0jvMWh6l5SnNuC8= -cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= -cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= +cel.dev/expr v0.16.2 h1:RwRhoH17VhAu9U5CMvMhH1PDVgf0tuz9FT+24AfMLfU= +cel.dev/expr v0.16.2/go.mod h1:gXngZQMkWJoSbE8mOzehJlXQyubn/Vg0vR9/F3W7iw8= +cloud.google.com/go/compute/metadata v0.5.2 h1:UxK4uu/Tn+I3p2dYWTfiX4wva7aYlKixAHn3fyqngqo= +cloud.google.com/go/compute/metadata v0.5.2/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v1.1.0 h1:ksErzDEI1khOiGPgpwuI7x2ebx/uXQNw7xJpn9Eq1+I= github.com/BurntSushi/toml v1.1.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= @@ -48,6 +48,8 @@ github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod 
h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= @@ -119,8 +121,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.0 h1:Pb12RlruUtj4XUuPUqeEWc6j5DkVVVA49Uf6YLfC95Y= -github.com/onsi/gomega v1.36.0/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= +github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= @@ -161,6 +163,16 @@ github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcY github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opentelemetry.io/otel v1.31.0 h1:NsJcKPIW0D0H3NgzPDHmo0WW6SptzPdqg/L1zsIm2hY= +go.opentelemetry.io/otel v1.31.0/go.mod 
h1:O0C14Yl9FgkjqcCZAsE053C13OaddMYr/hz6clDkEJE= +go.opentelemetry.io/otel/metric v1.31.0 h1:FSErL0ATQAmYHUIzSezZibnyVlft1ybhy4ozRPcF2fE= +go.opentelemetry.io/otel/metric v1.31.0/go.mod h1:C3dEloVbLuYoX41KpmAhOqNriGbA+qqH6PQ5E5mUfnY= +go.opentelemetry.io/otel/sdk v1.31.0 h1:xLY3abVHYZ5HSfOg3l2E5LUj2Cwva5Y7yGxnSW9H5Gk= +go.opentelemetry.io/otel/sdk v1.31.0/go.mod h1:TfRbMdhvxIIr/B2N2LQW2S5v9m3gOQ/08KsbbO5BPT0= +go.opentelemetry.io/otel/sdk/metric v1.31.0 h1:i9hxxLJF/9kkvfHppyLL55aW7iIJz4JjxTeYusH7zMc= +go.opentelemetry.io/otel/sdk/metric v1.31.0/go.mod h1:CRInTMVvNhUKgSAMbKyTMxqOBC0zgyxzW55lZzX43Y8= +go.opentelemetry.io/otel/trace v1.31.0 h1:ffjsj1aRouKewfr85U2aGagJ46+MvodynlQ1HYdmJys= +go.opentelemetry.io/otel/trace v1.31.0/go.mod h1:TXZkRk7SM2ZQLtR6eoAWQFIHPvzQ06FJAsO1tJg480A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -234,14 +246,14 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc= -google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= -google.golang.org/grpc v1.68.0 
h1:aHQeeJbo8zAkAa3pRzrVjZlbz6uSfeOXlJNQM0RAbz0= -google.golang.org/grpc v1.68.0/go.mod h1:fmSPC5AsjSBCK54MyHRx48kpOti1/jRfOlwEWywNjWA= -google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io= -google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/genproto/googleapis/api v0.0.0-20241015192408-796eee8c2d53 h1:fVoAXEKA4+yufmbdVYv+SE73+cPZbbbe8paLsHfkK+U= +google.golang.org/genproto/googleapis/api v0.0.0-20241015192408-796eee8c2d53/go.mod h1:riSXTwQ4+nqmPGtobMFyW5FqVAmIs0St6VPp4Ug7CE4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53 h1:X58yt85/IXCx0Y3ZwN6sEIKZzQtDEYaBWrDvErdXrRE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53/go.mod h1:GX3210XPVPUjJbTUbvwI8f2IpZDMZuPJWDzDuebbviI= +google.golang.org/grpc v1.69.0 h1:quSiOM1GJPmPH5XtU+BCoVXcDVJJAzNcoyfC2cCjGkI= +google.golang.org/grpc v1.69.0/go.mod h1:vyjdE6jLBI76dgpDojsFGNaHlxdjXN9ghpnd2o7JGZ4= +google.golang.org/protobuf v1.36.0 h1:mjIs9gYtt56AzC4ZaffQuh88TZurBGhIJMBZGSxNerQ= +google.golang.org/protobuf v1.36.0/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -256,18 +268,18 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8= -k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE= 
+k8s.io/api v0.31.4 h1:I2QNzitPVsPeLQvexMEsj945QumYraqv9m74isPDKhM= +k8s.io/api v0.31.4/go.mod h1:d+7vgXLvmcdT1BCo79VEgJxHHryww3V5np2OYTr6jdw= k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk= k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk= -k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4= -k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= -k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4= -k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs= -k8s.io/code-generator v0.31.3 h1:Pj0fYOBms+ZrsulLi4DMsCEx1jG8fWKRLy44onHsLBI= -k8s.io/code-generator v0.31.3/go.mod h1:/umCIlT84g1+Yu5ZXtP1KGSRTnGiIzzX5AzUAxsNlts= -k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ= -k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU= +k8s.io/apimachinery v0.31.4 h1:8xjE2C4CzhYVm9DGf60yohpNUh5AEBnPxCryPBECmlM= +k8s.io/apimachinery v0.31.4/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/client-go v0.31.4 h1:t4QEXt4jgHIkKKlx06+W3+1JOwAFU/2OPiOo7H92eRQ= +k8s.io/client-go v0.31.4/go.mod h1:kvuMro4sFYIa8sulL5Gi5GFqUPvfH2O/dXuKstbaaeg= +k8s.io/code-generator v0.31.4 h1:Vu+8fKz+239rKiVDHFVHgjQ162cg5iUQPtTyQbwXeQw= +k8s.io/code-generator v0.31.4/go.mod h1:yMDt13Kn7m4MMZ4LxB1KBzdZjEyxzdT4b4qXq+lnI90= +k8s.io/component-base v0.31.4 h1:wCquJh4ul9O8nNBSB8N/o8+gbfu3BVQkVw9jAUY/Qtw= +k8s.io/component-base v0.31.4/go.mod h1:G4dgtf5BccwiDT9DdejK0qM6zTK0jwDGEKnCmb9+u/s= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= @@ -280,7 +292,7 @@ sigs.k8s.io/controller-runtime v0.19.3 h1:XO2GvC9OPftRst6xWCpTgBZO04S2cbp0Qqkj8b 
sigs.k8s.io/controller-runtime v0.19.3/go.mod h1:j4j87DqtsThvwTv5/Tc5NFRyyF/RF0ip4+62tbTSIUM= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/structured-merge-diff/v4 v4.4.3 h1:sCP7Vv3xx/CWIuTPVN38lUPx0uw0lcLfzaiDa8Ja01A= -sigs.k8s.io/structured-merge-diff/v4 v4.4.3/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= +sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk= +sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index bcc4432b..6ce31fec 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -1,12 +1,11 @@ package scheduling import ( - "fmt" + "errors" "math" - klog "k8s.io/klog/v2" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + klog "k8s.io/klog/v2" ) type Filter interface { @@ -86,7 +85,7 @@ func toFilterFunc(pp podPredicate) filterFunc { } } if len(filtered) == 0 { - return nil, fmt.Errorf("no pods left") + return nil, errors.New("no pods left") } return filtered, nil } @@ -157,9 +156,10 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool // We consider serving an adapter low cost it the adapter is active in the model server, or the -// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by spreading the -// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod. 
-// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters. +// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by +// spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to +// a single pod. This gave good performance in our initial benchmarking results in the scenario +// where # of lora slots > # of lora adapters. func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { _, ok := pod.ActiveModels[req.ResolvedTargetModel] return ok || len(pod.ActiveModels) < pod.MaxActiveModels From 59272c1695a5d7cbc10059c81e80d7caeb9944a8 Mon Sep 17 00:00:00 2001 From: Jie WU Date: Wed, 8 Jan 2025 23:16:44 +0000 Subject: [PATCH 07/25] Adding metrics handler --- pkg/manifests/ext_proc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 7dc65bb7..0c8e38df 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -49,7 +49,6 @@ spec: containers: - name: inference-gateway-ext-proc # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args. 
- image: us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway/epp:main args: - -serverPoolName - "vllm-llama2-7b-pool" @@ -59,7 +58,8 @@ spec: - "vllm-llama2-7b-pool" ports: - containerPort: 9002 - + - name: metrics + containerPort: 9090 - name: curl image: curlimages/curl command: ["sleep", "3600"] From c04fc29a04b9e37ef70752686c87263c92d67c0c Mon Sep 17 00:00:00 2001 From: Jie WU Date: Wed, 8 Jan 2025 23:17:15 +0000 Subject: [PATCH 08/25] Adding metrics handler --- pkg/ext-proc/handlers/request.go | 4 ++++ pkg/ext-proc/main.go | 5 +++++ pkg/ext-proc/metrics/metrics_handler.go | 29 +++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 pkg/ext-proc/metrics/metrics_handler.go diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 83ab46d0..cef6834e 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -5,10 +5,12 @@ import ( "errors" "fmt" "strconv" + "time" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/scheduling" klog "k8s.io/klog/v2" ) @@ -18,6 +20,7 @@ import ( // Envoy sends the request body to ext proc before sending the request to the backend server. func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { klog.V(3).Infof("Handling request body") + requestReceivedTimestamp := time.Now() // Unmarshal request body (must be JSON). 
v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) @@ -116,6 +119,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces }, }, } + metrics.MonitorRequest(llmReq.Model, llmReq.ResolvedTargetModel, len(v.RequestBody.Body), time.Since(requestReceivedTimestamp)) return resp, nil } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index f7c76e6b..806c375c 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -19,6 +19,7 @@ import ( "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend/vllm" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/handlers" + "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/scheduling" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -32,6 +33,8 @@ var ( "port", 9002, "gRPC port") + metricsPort = flag.Int( + "metricsPort", 9090, "metrics port") targetPodHeader = flag.String( "targetPodHeader", "target-pod", @@ -103,6 +106,8 @@ func main() { klog.Fatalf("failed to listen: %v", err) } + metrics.Register() + go metrics.StartMetricsHandler(*metricsPort) datastore := backend.NewK8sDataStore() mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ diff --git a/pkg/ext-proc/metrics/metrics_handler.go b/pkg/ext-proc/metrics/metrics_handler.go new file mode 100644 index 00000000..7cc7b5f4 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_handler.go @@ -0,0 +1,29 @@ +package metrics + +import ( + "net" + "net/http" + "strconv" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/klog/v2" +) + +func StartMetricsHandler(port int) { + klog.Info("Starting metrics HTTP handler ...") + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor( + legacyregistry.DefaultGatherer, + 
promhttp.HandlerOpts{}, + )) + + server := &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } + if err := server.ListenAndServe(); err != http.ErrServerClosed { + klog.Fatalf("failed to start metrics HTTP handler: %v", err) + } +} From 48124e438c58abd01629d6ab5326126f236775b1 Mon Sep 17 00:00:00 2001 From: Jie WU Date: Wed, 8 Jan 2025 23:18:44 +0000 Subject: [PATCH 09/25] typo --- pkg/manifests/ext_proc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 0c8e38df..37858afc 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -49,6 +49,7 @@ spec: containers: - name: inference-gateway-ext-proc # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args. + image: us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway/epp:main args: - -serverPoolName - "vllm-llama2-7b-pool" From 80039efb1a62c805c76c066982b2dd1a81ae874a Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:28:30 -0700 Subject: [PATCH 10/25] updating the boilerplate template (#156) --- api/v1alpha1/zz_generated.deepcopy.go | 2 +- client-go/applyconfiguration/api/v1alpha1/inferencemodel.go | 2 +- client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go | 2 +- .../applyconfiguration/api/v1alpha1/inferencemodelstatus.go | 2 +- client-go/applyconfiguration/api/v1alpha1/inferencepool.go | 2 +- client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go | 2 +- .../applyconfiguration/api/v1alpha1/inferencepoolstatus.go | 2 +- .../applyconfiguration/api/v1alpha1/poolobjectreference.go | 2 +- client-go/applyconfiguration/api/v1alpha1/targetmodel.go | 2 +- client-go/applyconfiguration/internal/internal.go | 2 +- client-go/applyconfiguration/utils.go | 2 +- client-go/clientset/versioned/clientset.go | 2 +- client-go/clientset/versioned/fake/clientset_generated.go | 2 +- 
client-go/clientset/versioned/fake/doc.go | 2 +- client-go/clientset/versioned/fake/register.go | 2 +- client-go/clientset/versioned/scheme/doc.go | 2 +- client-go/clientset/versioned/scheme/register.go | 2 +- client-go/clientset/versioned/typed/api/v1alpha1/api_client.go | 2 +- client-go/clientset/versioned/typed/api/v1alpha1/doc.go | 2 +- client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go | 2 +- .../versioned/typed/api/v1alpha1/fake/fake_api_client.go | 2 +- .../versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go | 2 +- .../versioned/typed/api/v1alpha1/fake/fake_inferencepool.go | 2 +- .../versioned/typed/api/v1alpha1/generated_expansion.go | 2 +- .../clientset/versioned/typed/api/v1alpha1/inferencemodel.go | 2 +- .../clientset/versioned/typed/api/v1alpha1/inferencepool.go | 2 +- client-go/informers/externalversions/api/interface.go | 2 +- .../informers/externalversions/api/v1alpha1/inferencemodel.go | 2 +- .../informers/externalversions/api/v1alpha1/inferencepool.go | 2 +- client-go/informers/externalversions/api/v1alpha1/interface.go | 2 +- client-go/informers/externalversions/factory.go | 2 +- client-go/informers/externalversions/generic.go | 2 +- .../externalversions/internalinterfaces/factory_interfaces.go | 2 +- client-go/listers/api/v1alpha1/expansion_generated.go | 2 +- client-go/listers/api/v1alpha1/inferencemodel.go | 2 +- client-go/listers/api/v1alpha1/inferencepool.go | 2 +- hack/boilerplate.go.txt | 2 +- 37 files changed, 37 insertions(+), 37 deletions(-) diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 4f17fbd0..861f7901 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1,7 +1,7 @@ //go:build !ignore_autogenerated /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go index 7684e2bb..b9aeb0ee 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go index acaead74..dbdeff3d 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go index 06f29cc8..b0b003bb 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go b/client-go/applyconfiguration/api/v1alpha1/inferencepool.go index c7c74239..9a70ca29 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go index 24ac03ea..5f8276d0 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go b/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go index 54fbe9c7..f61a81b3 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go b/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go index 07d91c44..692a185e 100644 --- a/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go +++ b/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/client-go/applyconfiguration/api/v1alpha1/targetmodel.go b/client-go/applyconfiguration/api/v1alpha1/targetmodel.go index cdaff583..f6ac83f8 100644 --- a/client-go/applyconfiguration/api/v1alpha1/targetmodel.go +++ b/client-go/applyconfiguration/api/v1alpha1/targetmodel.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/internal/internal.go b/client-go/applyconfiguration/internal/internal.go index 69b66345..682718ab 100644 --- a/client-go/applyconfiguration/internal/internal.go +++ b/client-go/applyconfiguration/internal/internal.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 2348ccc1..20109099 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index dac47fdc..41f5a207 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index fd4ad406..42a1defb 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/fake/doc.go b/client-go/clientset/versioned/fake/doc.go index 57c8c1bc..634bd02c 100644 --- a/client-go/clientset/versioned/fake/doc.go +++ b/client-go/clientset/versioned/fake/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index 6c06233c..5221f221 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/scheme/doc.go b/client-go/clientset/versioned/scheme/doc.go index b00a2a62..40e42c29 100644 --- a/client-go/clientset/versioned/scheme/doc.go +++ b/client-go/clientset/versioned/scheme/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index b2c6025d..10d2e02f 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go index b16eef53..bad3301f 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/doc.go b/client-go/clientset/versioned/typed/api/v1alpha1/doc.go index c0c36e9b..28991e22 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/doc.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go index 3514f7d2..fbfccbb9 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go index ab06f492..be7f7d78 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go index 0f55a18b..3306d915 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go index 3a6ae23c..93f15d33 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go b/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go index f7907e15..65c88eb1 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go index 3912ef82..aae9252d 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go index 988b005b..56f37a32 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index 1a47e2c7..01f7fa63 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go index 2fe7a72b..74b8c0c0 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go +++ b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go index b675adfe..c995f28f 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go +++ b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/api/v1alpha1/interface.go b/client-go/informers/externalversions/api/v1alpha1/interface.go index 9ec36528..5e750459 100644 --- a/client-go/informers/externalversions/api/v1alpha1/interface.go +++ b/client-go/informers/externalversions/api/v1alpha1/interface.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/factory.go b/client-go/informers/externalversions/factory.go index 7c167d1f..9c2ba24a 100644 --- a/client-go/informers/externalversions/factory.go +++ b/client-go/informers/externalversions/factory.go @@ -1,5 +1,5 @@ /* -Copyright 2024. 
+Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index 737fc907..fc1ab17b 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go index 7fa481a4..9ba80db6 100644 --- a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go +++ b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/listers/api/v1alpha1/expansion_generated.go b/client-go/listers/api/v1alpha1/expansion_generated.go index e1344ae1..ffbe67cf 100644 --- a/client-go/listers/api/v1alpha1/expansion_generated.go +++ b/client-go/listers/api/v1alpha1/expansion_generated.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/listers/api/v1alpha1/inferencemodel.go b/client-go/listers/api/v1alpha1/inferencemodel.go index 273478ae..9274ca30 100644 --- a/client-go/listers/api/v1alpha1/inferencemodel.go +++ b/client-go/listers/api/v1alpha1/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2024. 
+Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/client-go/listers/api/v1alpha1/inferencepool.go b/client-go/listers/api/v1alpha1/inferencepool.go index 9feba784..2b5cffb1 100644 --- a/client-go/listers/api/v1alpha1/inferencepool.go +++ b/client-go/listers/api/v1alpha1/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt index ff72ff2a..4ad43857 100644 --- a/hack/boilerplate.go.txt +++ b/hack/boilerplate.go.txt @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 5001fb21d207fd0de6c64f5177567b6f7f5e71df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 23:44:29 +0100 Subject: [PATCH 11/25] Bump github.com/envoyproxy/go-control-plane from 0.13.1 to 0.13.3 (#155) Bumps [github.com/envoyproxy/go-control-plane](https://github.com/envoyproxy/go-control-plane) from 0.13.1 to 0.13.3. - [Release notes](https://github.com/envoyproxy/go-control-plane/releases) - [Changelog](https://github.com/envoyproxy/go-control-plane/blob/main/CHANGELOG.md) - [Commits](https://github.com/envoyproxy/go-control-plane/compare/v0.13.1...v0.13.3) --- updated-dependencies: - dependency-name: github.com/envoyproxy/go-control-plane dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index b61512f5..bf4d4f4a 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ toolchain go1.23.2 require ( github.com/bojand/ghz v0.120.0 - github.com/envoyproxy/go-control-plane v0.13.1 + github.com/envoyproxy/go-control-plane/envoy v1.32.2 github.com/google/go-cmp v0.6.0 github.com/jhump/protoreflect v1.17.0 github.com/onsi/ginkgo/v2 v2.22.0 @@ -38,12 +38,12 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/bufbuild/protocompile v0.14.1 // indirect - github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/envoyproxy/go-control-plane v0.13.3 // indirect github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect diff --git a/go.sum b/go.sum index 7d4c4e0d..a3cad841 100644 --- a/go.sum +++ b/go.sum @@ -21,8 +21,6 @@ github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo= github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE= github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw= github.com/bufbuild/protocompile v0.14.1/go.mod h1:ppVdAIhbr2H8asPk6k4pY7t9zB1OU5DoEw9xY/FUi1c= -github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMrBo8f1j86j5WHzznCCQxV/b8g= -github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod 
h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI= @@ -36,8 +34,12 @@ github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkp github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.1 h1:vPfJZCkob6yTMEgS+0TwfTUfbHjfy/6vOJ8hUWX/uXE= -github.com/envoyproxy/go-control-plane v0.13.1/go.mod h1:X45hY0mufo6Fd0KW3rqsGvQMw58jvjymeCzBU3mWyHw= +github.com/envoyproxy/go-control-plane v0.13.3 h1:F2vYcSF8iRNhfvhZQRZ5Dvuyu0TpXazE9+h53TzkvA4= +github.com/envoyproxy/go-control-plane v0.13.3/go.mod h1:uhvHSBAMSvy2Y+CuAYfByIRH19zcdir1rgmMzKUo3eA= +github.com/envoyproxy/go-control-plane/envoy v1.32.2 h1:zidqwmijfcbyKqVxjQDFx042PgX+p9U+/fu/f9VtSk8= +github.com/envoyproxy/go-control-plane/envoy v1.32.2/go.mod h1:eR2SOX2IedqlPvmiKjUH7Wu//S602JKI7HPC/L3SRq8= +github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= +github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6Uu2PdjCQwWCJ3bM= github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= From c1ac0538c0ecfe82d926d9571b210683760c34a3 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:56:29 -0700 Subject: 
[PATCH 12/25] Updating non-generated docs/ minor formatting (#160) --- api/v1alpha1/groupversion_info.go | 2 +- api/v1alpha1/inferencemodel_types.go | 2 +- api/v1alpha1/inferencepool_types.go | 4 ++-- test/e2e/e2e_suite_test.go | 2 +- test/e2e/e2e_test.go | 2 +- test/utils/utils.go | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go index 7ff9c399..8c0a449f 100644 --- a/api/v1alpha1/groupversion_info.go +++ b/api/v1alpha1/groupversion_info.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go index 8e81b4e8..63103181 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha1/inferencemodel_types.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha1/inferencepool_types.go index 666d0ac1..852c7267 100644 --- a/api/v1alpha1/inferencepool_types.go +++ b/api/v1alpha1/inferencepool_types.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -63,7 +63,7 @@ type InferencePoolSpec struct { TargetPortNumber int32 `json:"targetPortNumber"` } -// Originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 +// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 // Duplicated as to not take an unexpected dependency on gw's API. // // LabelKey is the key of a label. This is used for validation diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 076a0573..2da01a9c 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 54547c04..413c15c6 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/test/utils/utils.go b/test/utils/utils.go index 23fea11a..1f96382e 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -1,5 +1,5 @@ /* -Copyright 2024. +Copyright 2024 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 1252b8f0fee8d858416328cc2e298b87c0198fc1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 00:06:29 +0100 Subject: [PATCH 13/25] Bump github.com/onsi/ginkgo/v2 from 2.22.0 to 2.22.2 (#138) Bumps [github.com/onsi/ginkgo/v2](https://github.com/onsi/ginkgo) from 2.22.0 to 2.22.2. 
- [Release notes](https://github.com/onsi/ginkgo/releases) - [Changelog](https://github.com/onsi/ginkgo/blob/master/CHANGELOG.md) - [Commits](https://github.com/onsi/ginkgo/compare/v2.22.0...v2.22.2) --- updated-dependencies: - dependency-name: github.com/onsi/ginkgo/v2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 16 ++++++++-------- go.sum | 32 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/go.mod b/go.mod index bf4d4f4a..57ae1478 100644 --- a/go.mod +++ b/go.mod @@ -9,14 +9,14 @@ require ( github.com/envoyproxy/go-control-plane/envoy v1.32.2 github.com/google/go-cmp v0.6.0 github.com/jhump/protoreflect v1.17.0 - github.com/onsi/ginkgo/v2 v2.22.0 - github.com/onsi/gomega v1.36.1 + github.com/onsi/ginkgo/v2 v2.22.2 + github.com/onsi/gomega v1.36.2 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.61.0 github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 google.golang.org/grpc v1.69.0 - google.golang.org/protobuf v1.36.0 + google.golang.org/protobuf v1.36.1 k8s.io/api v0.31.4 k8s.io/apimachinery v0.31.4 k8s.io/client-go v0.31.4 @@ -57,7 +57,7 @@ require ( github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect + github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.11 // indirect @@ -81,17 +81,17 @@ require ( github.com/spf13/cast v1.4.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/crypto v0.30.0 // indirect + golang.org/x/crypto v0.31.0 // indirect golang.org/x/exp 
v0.0.0-20230515195305-f3d0a9c9a5cc // indirect - golang.org/x/mod v0.21.0 // indirect - golang.org/x/net v0.32.0 // indirect + golang.org/x/mod v0.22.0 // indirect + golang.org/x/net v0.33.0 // indirect golang.org/x/oauth2 v0.24.0 // indirect golang.org/x/sync v0.10.0 // indirect golang.org/x/sys v0.28.0 // indirect golang.org/x/term v0.27.0 // indirect golang.org/x/text v0.21.0 // indirect golang.org/x/time v0.3.0 // indirect - golang.org/x/tools v0.26.0 // indirect + golang.org/x/tools v0.28.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241015192408-796eee8c2d53 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53 // indirect diff --git a/go.sum b/go.sum index a3cad841..f0f5807d 100644 --- a/go.sum +++ b/go.sum @@ -77,8 +77,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= +github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -121,10 +121,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G 
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= -github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU= +github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk= +github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= +github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= @@ -186,15 +186,15 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.30.0 h1:RwoQn3GkWiMkzlX562cLB7OxWvjH1L8xutO2WoJcRoY= -golang.org/x/crypto v0.30.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod 
h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU= golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= -golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= +golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -202,8 +202,8 @@ golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwY golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= -golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI= -golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= golang.org/x/oauth2 v0.24.0/go.mod 
h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -240,8 +240,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= -golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8= +golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -254,8 +254,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53 h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53/go.mod h1:GX3210XPVPUjJbTUbvwI8f2IpZDMZuPJWDzDuebbviI= google.golang.org/grpc v1.69.0 h1:quSiOM1GJPmPH5XtU+BCoVXcDVJJAzNcoyfC2cCjGkI= google.golang.org/grpc v1.69.0/go.mod h1:vyjdE6jLBI76dgpDojsFGNaHlxdjXN9ghpnd2o7JGZ4= -google.golang.org/protobuf v1.36.0 h1:mjIs9gYtt56AzC4ZaffQuh88TZurBGhIJMBZGSxNerQ= -google.golang.org/protobuf v1.36.0/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= +google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From 6abf2e35e64fb2105c2897ab5041184dc9fca5df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 00:34:29 +0100 Subject: [PATCH 14/25] Bump google.golang.org/grpc from 1.69.0 to 1.69.2 (#133) Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.69.0 to 1.69.2. - [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.69.0...v1.69.2) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 57ae1478..2600b9fc 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/prometheus/common v0.61.0 github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 - google.golang.org/grpc v1.69.0 + google.golang.org/grpc v1.69.2 google.golang.org/protobuf v1.36.1 k8s.io/api v0.31.4 k8s.io/apimachinery v0.31.4 diff --git a/go.sum b/go.sum index f0f5807d..0b4ac1ed 100644 --- a/go.sum +++ b/go.sum @@ -252,8 +252,8 @@ google.golang.org/genproto/googleapis/api v0.0.0-20241015192408-796eee8c2d53 h1: google.golang.org/genproto/googleapis/api v0.0.0-20241015192408-796eee8c2d53/go.mod h1:riSXTwQ4+nqmPGtobMFyW5FqVAmIs0St6VPp4Ug7CE4= google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53 h1:X58yt85/IXCx0Y3ZwN6sEIKZzQtDEYaBWrDvErdXrRE= google.golang.org/genproto/googleapis/rpc 
v0.0.0-20241015192408-796eee8c2d53/go.mod h1:GX3210XPVPUjJbTUbvwI8f2IpZDMZuPJWDzDuebbviI= -google.golang.org/grpc v1.69.0 h1:quSiOM1GJPmPH5XtU+BCoVXcDVJJAzNcoyfC2cCjGkI= -google.golang.org/grpc v1.69.0/go.mod h1:vyjdE6jLBI76dgpDojsFGNaHlxdjXN9ghpnd2o7JGZ4= +google.golang.org/grpc v1.69.2 h1:U3S9QEtbXC0bYNvRtcoklF3xGtLViumSYxWykJS+7AU= +google.golang.org/grpc v1.69.2/go.mod h1:vyjdE6jLBI76dgpDojsFGNaHlxdjXN9ghpnd2o7JGZ4= google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 111864dbe4b6afefc1edcf413894bdfb1959968b Mon Sep 17 00:00:00 2001 From: Madhav Jivrajani <12381034+MadhavJivrajani@users.noreply.github.com> Date: Mon, 6 Jan 2025 20:28:29 -0800 Subject: [PATCH 15/25] .*: change llm-instance-gateway -> gateway-api-inference-extension (#161) Signed-off-by: Madhav Jivrajani --- PROJECT | 8 ++++---- README.md | 2 +- .../applyconfiguration/api/v1alpha1/inferencemodelspec.go | 2 +- .../applyconfiguration/api/v1alpha1/inferencepoolspec.go | 2 +- client-go/applyconfiguration/utils.go | 6 +++--- client-go/clientset/versioned/clientset.go | 2 +- client-go/clientset/versioned/fake/clientset_generated.go | 8 ++++---- client-go/clientset/versioned/fake/register.go | 2 +- client-go/clientset/versioned/scheme/register.go | 2 +- .../clientset/versioned/typed/api/v1alpha1/api_client.go | 4 ++-- .../versioned/typed/api/v1alpha1/fake/fake_api_client.go | 2 +- .../typed/api/v1alpha1/fake/fake_inferencemodel.go | 4 ++-- .../typed/api/v1alpha1/fake/fake_inferencepool.go | 4 ++-- .../versioned/typed/api/v1alpha1/inferencemodel.go | 6 +++--- .../versioned/typed/api/v1alpha1/inferencepool.go | 6 +++--- client-go/informers/externalversions/api/interface.go | 4 ++-- .../externalversions/api/v1alpha1/inferencemodel.go | 8 ++++---- 
.../externalversions/api/v1alpha1/inferencepool.go | 8 ++++---- .../informers/externalversions/api/v1alpha1/interface.go | 2 +- client-go/informers/externalversions/factory.go | 6 +++--- client-go/informers/externalversions/generic.go | 2 +- .../internalinterfaces/factory_interfaces.go | 2 +- client-go/listers/api/v1alpha1/inferencemodel.go | 2 +- client-go/listers/api/v1alpha1/inferencepool.go | 2 +- go.mod | 2 +- hack/update-codegen.sh | 2 +- pkg/ext-proc/backend/datastore.go | 2 +- pkg/ext-proc/backend/datastore_test.go | 2 +- pkg/ext-proc/backend/endpointslice_reconciler.go | 2 +- pkg/ext-proc/backend/endpointslice_reconcilier_test.go | 2 +- pkg/ext-proc/backend/fake.go | 2 +- pkg/ext-proc/backend/inferencemodel_reconciler.go | 2 +- pkg/ext-proc/backend/inferencemodel_reconciler_test.go | 2 +- pkg/ext-proc/backend/inferencepool_reconciler.go | 2 +- pkg/ext-proc/backend/vllm/metrics.go | 4 ++-- pkg/ext-proc/backend/vllm/metrics_test.go | 2 +- pkg/ext-proc/handlers/server.go | 6 +++--- pkg/ext-proc/main.go | 5 +++++ pkg/ext-proc/scheduling/filter.go | 2 +- pkg/ext-proc/scheduling/filter_test.go | 2 +- pkg/ext-proc/scheduling/scheduler.go | 8 ++++---- pkg/ext-proc/test/benchmark/benchmark.go | 6 +++--- pkg/ext-proc/test/hermetic_test.go | 4 ++-- pkg/ext-proc/test/utils.go | 8 ++++---- pkg/manifests/ext_proc.yaml | 2 +- test/e2e/e2e_test.go | 2 +- 46 files changed, 86 insertions(+), 81 deletions(-) diff --git a/PROJECT b/PROJECT index 564bcc13..75c9c9cc 100644 --- a/PROJECT +++ b/PROJECT @@ -5,8 +5,8 @@ domain: x-k8s.io layout: - go.kubebuilder.io/v4 -projectName: llm-instance-gateway -repo: sigs.k8s.io/llm-instance-gateway +projectName: gateway-api-inference-extension +repo: sigs.k8s.io/gateway-api-inference-extension resources: - api: crdVersion: v1 @@ -14,7 +14,7 @@ resources: domain: x-k8s.io group: inference kind: InferencePool - path: sigs.k8s.io/llm-instance-gateway/api/v1alpha1 + path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1 version: 
v1alpha1 - api: crdVersion: v1 @@ -22,6 +22,6 @@ resources: domain: x-k8s.io group: inference kind: InferenceModel - path: sigs.k8s.io/llm-instance-gateway/api/v1alpha1 + path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1 version: v1alpha1 version: "3" diff --git a/README.md b/README.md index 0cf08769..8c5e47b9 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ make uninstall ``` **Deploying the ext-proc image** -Refer to this [README](https://github.com/kubernetes-sigs/llm-instance-gateway/blob/main/pkg/README.md) on how to deploy the Ext-Proc image. +Refer to this [README](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/README.md) on how to deploy the Ext-Proc image. ## Contributing diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go index dbdeff3d..4659a1fb 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go @@ -18,7 +18,7 @@ limitations under the License. package v1alpha1 import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferenceModelSpecApplyConfiguration represents a declarative configuration of the InferenceModelSpec type for use diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go index 5f8276d0..92a7f6e4 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go @@ -18,7 +18,7 @@ limitations under the License. 
package v1alpha1 import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferencePoolSpecApplyConfiguration represents a declarative configuration of the InferencePoolSpec type for use diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 20109099..eb0264b3 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -18,9 +18,9 @@ limitations under the License. package applyconfiguration import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration/api/v1alpha1" - internal "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration/internal" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + internal "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" testing "k8s.io/client-go/testing" diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index 41f5a207..e91bf8ff 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -21,7 +21,7 @@ import ( "fmt" "net/http" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/typed/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" diff --git 
a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index 42a1defb..dda29ec6 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -18,10 +18,10 @@ limitations under the License. package fake import ( - applyconfiguration "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration" - clientset "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/typed/api/v1alpha1" - fakeapiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/typed/api/v1alpha1/fake" + applyconfiguration "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration" + clientset "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" + fakeapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1/fake" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index 5221f221..f252a096 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -18,7 +18,7 @@ limitations under the License. 
package fake import ( - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index 10d2e02f..6e243827 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -18,7 +18,7 @@ limitations under the License. package scheme import ( - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go index bad3301f..7ab11004 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go @@ -20,8 +20,8 @@ package v1alpha1 import ( "net/http" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/scheme" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" rest "k8s.io/client-go/rest" ) diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go index be7f7d78..3f1ee31b 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go +++ 
b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go @@ -18,7 +18,7 @@ limitations under the License. package fake import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/typed/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" rest "k8s.io/client-go/rest" testing "k8s.io/client-go/testing" ) diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go index 3306d915..1bbf61b4 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go @@ -22,8 +22,8 @@ import ( json "encoding/json" "fmt" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" labels "k8s.io/apimachinery/pkg/labels" types "k8s.io/apimachinery/pkg/types" diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go index 93f15d33..655d5116 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go @@ -22,8 +22,8 @@ import ( json "encoding/json" "fmt" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration/api/v1alpha1" + v1alpha1 
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" labels "k8s.io/apimachinery/pkg/labels" types "k8s.io/apimachinery/pkg/types" diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go index aae9252d..d2edbe95 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go @@ -20,9 +20,9 @@ package v1alpha1 import ( "context" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration/api/v1alpha1" - scheme "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/scheme" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go index 56f37a32..28705873 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go @@ -20,9 +20,9 @@ package v1alpha1 import ( "context" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/applyconfiguration/api/v1alpha1" - scheme 
"inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned/scheme" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index 01f7fa63..6ca4f9da 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -18,8 +18,8 @@ limitations under the License. package api import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/api/v1alpha1" - internalinterfaces "inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/internalinterfaces" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha1" + internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to each of this group's versions. 
diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go index 74b8c0c0..bf5b7670 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go +++ b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go @@ -21,10 +21,10 @@ import ( "context" time "time" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - versioned "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned" - internalinterfaces "inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/internalinterfaces" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/listers/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go index c995f28f..ba163404 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go +++ b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go @@ -21,10 +21,10 @@ import ( "context" time "time" - apiv1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - versioned "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned" - internalinterfaces 
"inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/internalinterfaces" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/client-go/listers/api/v1alpha1" + apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" diff --git a/client-go/informers/externalversions/api/v1alpha1/interface.go b/client-go/informers/externalversions/api/v1alpha1/interface.go index 5e750459..9ba07025 100644 --- a/client-go/informers/externalversions/api/v1alpha1/interface.go +++ b/client-go/informers/externalversions/api/v1alpha1/interface.go @@ -18,7 +18,7 @@ limitations under the License. package v1alpha1 import ( - internalinterfaces "inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/internalinterfaces" + internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to all the informers in this group version. 
diff --git a/client-go/informers/externalversions/factory.go b/client-go/informers/externalversions/factory.go index 9c2ba24a..39c96068 100644 --- a/client-go/informers/externalversions/factory.go +++ b/client-go/informers/externalversions/factory.go @@ -22,9 +22,9 @@ import ( sync "sync" time "time" - versioned "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned" - api "inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/api" - internalinterfaces "inference.networking.x-k8s.io/llm-instance-gateway/client-go/informers/externalversions/internalinterfaces" + versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + api "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api" + internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index fc1ab17b..0e77deb4 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -20,7 +20,7 @@ package externalversions import ( "fmt" - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" ) diff --git a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go index 9ba80db6..488aca6f 100644 --- a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go +++ 
b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -20,7 +20,7 @@ package internalinterfaces import ( time "time" - versioned "inference.networking.x-k8s.io/llm-instance-gateway/client-go/clientset/versioned" + versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" cache "k8s.io/client-go/tools/cache" diff --git a/client-go/listers/api/v1alpha1/inferencemodel.go b/client-go/listers/api/v1alpha1/inferencemodel.go index 9274ca30..dfc7e9c9 100644 --- a/client-go/listers/api/v1alpha1/inferencemodel.go +++ b/client-go/listers/api/v1alpha1/inferencemodel.go @@ -18,7 +18,7 @@ limitations under the License. package v1alpha1 import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/listers" "k8s.io/client-go/tools/cache" diff --git a/client-go/listers/api/v1alpha1/inferencepool.go b/client-go/listers/api/v1alpha1/inferencepool.go index 2b5cffb1..ff5735f7 100644 --- a/client-go/listers/api/v1alpha1/inferencepool.go +++ b/client-go/listers/api/v1alpha1/inferencepool.go @@ -18,7 +18,7 @@ limitations under the License. 
package v1alpha1 import ( - v1alpha1 "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/listers" "k8s.io/client-go/tools/cache" diff --git a/go.mod b/go.mod index 2600b9fc..1fdea8f7 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module inference.networking.x-k8s.io/llm-instance-gateway +module inference.networking.x-k8s.io/gateway-api-inference-extension go 1.22.7 diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index dc19fece..cfe75f81 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -23,7 +23,7 @@ echo "$SCRIPT_ROOT script" CODEGEN_PKG=${2:-bin} echo $CODEGEN_PKG source "${CODEGEN_PKG}/kube_codegen.sh" -THIS_PKG="inference.networking.x-k8s.io/llm-instance-gateway" +THIS_PKG="inference.networking.x-k8s.io/gateway-api-inference-extension" kube::codegen::gen_helpers \ diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index 5914ac04..f1e6379d 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -5,7 +5,7 @@ import ( "math/rand" "sync" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" corev1 "k8s.io/api/core/v1" "k8s.io/klog/v2" ) diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/backend/datastore_test.go index d84206bb..57204eb0 100644 --- a/pkg/ext-proc/backend/datastore_test.go +++ b/pkg/ext-proc/backend/datastore_test.go @@ -3,7 +3,7 @@ package backend import ( "testing" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" ) func TestRandomWeightedDraw(t *testing.T) { diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go index 42251cfb..df19431e 
100644 --- a/pkg/ext-proc/backend/endpointslice_reconciler.go +++ b/pkg/ext-proc/backend/endpointslice_reconciler.go @@ -4,7 +4,7 @@ import ( "context" "strconv" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" discoveryv1 "k8s.io/api/discovery/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" diff --git a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go index 16bcd8c2..e3c927ba 100644 --- a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go +++ b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go @@ -4,7 +4,7 @@ import ( "sync" "testing" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" ) diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index 4c83ae91..c4545497 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -3,7 +3,7 @@ package backend import ( "context" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" klog "k8s.io/klog/v2" ) diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index b685b348..1dbadcb2 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -3,7 +3,7 @@ package backend import ( "context" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go 
b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 9f1ef6ed..c0b40ddc 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -4,7 +4,7 @@ import ( "sync" "testing" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index c85281d3..60e32f9b 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -3,7 +3,7 @@ package backend import ( "context" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 018ba230..5fff4d8e 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -12,7 +12,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" klog "k8s.io/klog/v2" ) @@ -41,7 +41,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( existing *backend.PodMetrics, ) (*backend.PodMetrics, error) { // Currently the metrics endpoint is hard-coded, which works with vLLM. - // TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16): Consume this from InferencePool config. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. 
url := fmt.Sprintf("http://%s/metrics", pod.Address) req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go index 1225c709..e3c1449d 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ b/pkg/ext-proc/backend/vllm/metrics_test.go @@ -7,7 +7,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) func TestPromToPodMetrics(t *testing.T) { diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index 8ddd54d1..76ec0928 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -7,9 +7,9 @@ import ( envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/scheduling" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" klog "k8s.io/klog/v2" ) diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 806c375c..f4e9f444 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -15,6 +15,11 @@ import ( "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/status" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + 
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend/vllm" diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index 6ce31fec..09779d63 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -4,7 +4,7 @@ import ( "errors" "math" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" klog "k8s.io/klog/v2" ) diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index c7ffe45d..d88f437c 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -5,7 +5,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) func TestFilter(t *testing.T) { diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 1c41794c..c6a91541 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -7,16 +7,16 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" klog "k8s.io/klog/v2" ) const ( - // 
TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. kvCacheThreshold = 0.8 - // TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. queueThresholdCritical = 5 - // TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. // the threshold for queued requests to be considered low below which we can prioritize LoRA affinity. // The value of 50 is arrived heuristicically based on experiments. queueingThresholdLoRA = 50 diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index 559f3b7c..22a0988c 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -10,9 +10,9 @@ import ( "github.com/bojand/ghz/runner" "github.com/jhump/protoreflect/desc" "google.golang.org/protobuf/proto" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/test" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" klog "k8s.io/klog/v2" ) diff --git a/pkg/ext-proc/test/hermetic_test.go b/pkg/ext-proc/test/hermetic_test.go index b77d4ae3..d98031ee 100644 --- a/pkg/ext-proc/test/hermetic_test.go +++ b/pkg/ext-proc/test/hermetic_test.go @@ -13,8 +13,8 @@ import ( "google.golang.org/grpc" 
"google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) const ( diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index aabb34a1..63972849 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -9,10 +9,10 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/grpc" "google.golang.org/grpc/reflection" - "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/handlers" - "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/scheduling" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" klog "k8s.io/klog/v2" ) diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 37858afc..327e044b 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -48,7 +48,7 @@ spec: spec: containers: - name: inference-gateway-ext-proc - # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args. + # TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/34) Update the image and args. 
image: us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway/epp:main args: - -serverPoolName diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 413c15c6..5da11477 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -23,7 +23,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "inference.networking.x-k8s.io/llm-instance-gateway/test/utils" + "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" ) const namespace = "api-system" From 774fb62d10031f986b9016395d4b7f3d48f4e3dc Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Mon, 6 Jan 2025 20:38:29 -0800 Subject: [PATCH 16/25] Changes InferencePool EPP Flags (#152) Signed-off-by: Daneyon Hansen --- .../backend/endpointslice_reconciler.go | 14 ++--- .../backend/inferencemodel_reconciler.go | 16 +++--- .../backend/inferencemodel_reconciler_test.go | 4 +- .../backend/inferencepool_reconciler.go | 14 ++--- pkg/ext-proc/main.go | 54 +++++++++---------- pkg/manifests/ext_proc.yaml | 2 +- 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go index df19431e..a4a0f5aa 100644 --- a/pkg/ext-proc/backend/endpointslice_reconciler.go +++ b/pkg/ext-proc/backend/endpointslice_reconciler.go @@ -21,13 +21,13 @@ var ( type EndpointSliceReconciler struct { client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - ServerPoolName string - ServiceName string - Zone string - Namespace string - Datastore *K8sDatastore + Scheme *runtime.Scheme + Record record.EventRecorder + PoolName string + ServiceName string + Zone string + Namespace string + Datastore *K8sDatastore } func (c *EndpointSliceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index 1dbadcb2..d13612de 100644 --- 
a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -13,15 +13,15 @@ import ( type InferenceModelReconciler struct { client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - Datastore *K8sDatastore - ServerPoolName string - Namespace string + Scheme *runtime.Scheme + Record record.EventRecorder + Datastore *K8sDatastore + PoolName string + PoolNamespace string } func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.Namespace != c.Namespace { + if req.Namespace != c.PoolNamespace { return ctrl.Result{}, nil } klog.V(1).Info("reconciling InferenceModel", req.NamespacedName) @@ -43,8 +43,8 @@ func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { } func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) { - if infModel.Spec.PoolRef.Name == c.ServerPoolName { - klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.ServerPoolName) + if infModel.Spec.PoolRef.Name == c.PoolName { + klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolName) klog.V(1).Infof("Adding/Updating inference model: %v", infModel.Spec.ModelName) c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) return diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index c0b40ddc..963630a4 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -125,8 +125,8 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { InferenceModelReconciler := &InferenceModelReconciler{ - Datastore: test.datastore, - ServerPoolName: test.datastore.inferencePool.Name, + Datastore: test.datastore, + PoolName: 
test.datastore.inferencePool.Name, } InferenceModelReconciler.updateDatastore(test.incomingService) diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index 60e32f9b..d45e2dbb 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -16,16 +16,16 @@ import ( // will have the proper controller that will create/manage objects on behalf of the server pool. type InferencePoolReconciler struct { client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - ServerPoolName string - Namespace string - Datastore *K8sDatastore - Zone string + Scheme *runtime.Scheme + Record record.EventRecorder + PoolName string + PoolNamespace string + Datastore *K8sDatastore + Zone string } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.NamespacedName.Name != c.ServerPoolName || req.NamespacedName.Namespace != c.Namespace { + if req.NamespacedName.Name != c.PoolName || req.NamespacedName.Namespace != c.PoolNamespace { return ctrl.Result{}, nil } klog.V(1).Info("reconciling InferencePool", req.NamespacedName) diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index f4e9f444..dda11913 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -44,18 +44,18 @@ var ( "targetPodHeader", "target-pod", "Header key used by Envoy to route to the appropriate pod. 
This must match Envoy configuration.") - serverPoolName = flag.String( - "serverPoolName", + poolName = flag.String( + "poolName", "", - "Name of the serverPool this Endpoint Picker is associated with.") + "Name of the InferencePool this Endpoint Picker is associated with.") + poolNamespace = flag.String( + "poolNamespace", + "default", + "Namespace of the InferencePool this Endpoint Picker is associated with.") serviceName = flag.String( "serviceName", "", - "Name of the service that will be used to read the endpointslices from") - namespace = flag.String( - "namespace", - "default", - "The Namespace that the server pool should exist in.") + "Name of the Service that will be used to read EndpointSlices from") zone = flag.String( "zone", "", @@ -124,35 +124,35 @@ func main() { } if err := (&backend.InferencePoolReconciler{ - Datastore: datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - ServerPoolName: *serverPoolName, - Namespace: *namespace, - Record: mgr.GetEventRecorderFor("InferencePool"), + Datastore: datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + PoolName: *poolName, + PoolNamespace: *poolNamespace, + Record: mgr.GetEventRecorderFor("InferencePool"), }).SetupWithManager(mgr); err != nil { klog.Error(err, "Error setting up InferencePoolReconciler") } if err := (&backend.InferenceModelReconciler{ - Datastore: datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - ServerPoolName: *serverPoolName, - Namespace: *namespace, - Record: mgr.GetEventRecorderFor("InferenceModel"), + Datastore: datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + PoolName: *poolName, + PoolNamespace: *poolNamespace, + Record: mgr.GetEventRecorderFor("InferenceModel"), }).SetupWithManager(mgr); err != nil { klog.Error(err, "Error setting up InferenceModelReconciler") } if err := (&backend.EndpointSliceReconciler{ - Datastore: datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - Record: 
mgr.GetEventRecorderFor("endpointslice"), - ServiceName: *serviceName, - Zone: *zone, - ServerPoolName: *serverPoolName, + Datastore: datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Record: mgr.GetEventRecorderFor("endpointslice"), + ServiceName: *serviceName, + Zone: *zone, + PoolName: *poolName, }).SetupWithManager(mgr); err != nil { klog.Error(err, "Error setting up EndpointSliceReconciler") } diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 327e044b..dfcfdc3e 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -51,7 +51,7 @@ spec: # TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/34) Update the image and args. image: us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway/epp:main args: - - -serverPoolName + - -poolName - "vllm-llama2-7b-pool" - -v - "3" From 3a819012be838b8c8bfdf1a496899cb59c3e7227 Mon Sep 17 00:00:00 2001 From: Madhav Jivrajani <12381034+MadhavJivrajani@users.noreply.github.com> Date: Tue, 7 Jan 2025 19:08:29 -0800 Subject: [PATCH 17/25] ext-proc: remove unused fields from EndpointSliceReconciler (#165) Signed-off-by: Madhav Jivrajani --- pkg/ext-proc/backend/endpointslice_reconciler.go | 2 -- pkg/ext-proc/main.go | 1 - 2 files changed, 3 deletions(-) diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go index a4a0f5aa..80bc2997 100644 --- a/pkg/ext-proc/backend/endpointslice_reconciler.go +++ b/pkg/ext-proc/backend/endpointslice_reconciler.go @@ -23,10 +23,8 @@ type EndpointSliceReconciler struct { client.Client Scheme *runtime.Scheme Record record.EventRecorder - PoolName string ServiceName string Zone string - Namespace string Datastore *K8sDatastore } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index dda11913..ed426b4c 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -152,7 +152,6 @@ func main() { Record: 
mgr.GetEventRecorderFor("endpointslice"), ServiceName: *serviceName, Zone: *zone, - PoolName: *poolName, }).SetupWithManager(mgr); err != nil { klog.Error(err, "Error setting up EndpointSliceReconciler") } From 1ed2d8d6eee2fab89207436e0cccada521f0df54 Mon Sep 17 00:00:00 2001 From: Madhav Jivrajani <12381034+MadhavJivrajani@users.noreply.github.com> Date: Tue, 7 Jan 2025 19:24:29 -0800 Subject: [PATCH 18/25] ext-proc/backend: add unit test for InferencePoolReconciler (#168) Signed-off-by: Madhav Jivrajani --- .../backend/inferencepool_reconciler_test.go | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 pkg/ext-proc/backend/inferencepool_reconciler_test.go diff --git a/pkg/ext-proc/backend/inferencepool_reconciler_test.go b/pkg/ext-proc/backend/inferencepool_reconciler_test.go new file mode 100644 index 00000000..f03c31cb --- /dev/null +++ b/pkg/ext-proc/backend/inferencepool_reconciler_test.go @@ -0,0 +1,85 @@ +package backend + +import ( + "reflect" + "testing" + + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var ( + pool1 = &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + ResourceVersion: "50", + }, + } + // Different name, same RV doesn't really make sense, but helps with testing the + // updateStore impl which relies on the equality of RVs alone. 
+ modPool1SameRV = &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool-mod", + ResourceVersion: "50", + }, + } + modPool1DiffRV = &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool-mod", + ResourceVersion: "51", + }, + } +) + +func TestUpdateDatastore_InferencePoolReconciler(t *testing.T) { + tests := []struct { + name string + datastore *K8sDatastore + incomingPool *v1alpha1.InferencePool + wantPool *v1alpha1.InferencePool + }{ + { + name: "InferencePool not set, should set InferencePool", + datastore: &K8sDatastore{}, + incomingPool: pool1.DeepCopy(), + wantPool: pool1, + }, + { + name: "InferencePool set, matching RVs, do nothing", + datastore: &K8sDatastore{ + inferencePool: pool1.DeepCopy(), + }, + incomingPool: modPool1SameRV.DeepCopy(), + wantPool: pool1, + }, + { + name: "InferencePool set, differing RVs, re-set InferencePool", + datastore: &K8sDatastore{ + inferencePool: pool1.DeepCopy(), + }, + incomingPool: modPool1DiffRV.DeepCopy(), + wantPool: modPool1DiffRV, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + inferencePoolReconciler := &InferencePoolReconciler{Datastore: test.datastore} + inferencePoolReconciler.updateDatastore(test.incomingPool) + + gotPool := inferencePoolReconciler.Datastore.inferencePool + if !reflect.DeepEqual(gotPool, test.wantPool) { + t.Errorf("Unexpected InferencePool: want %#v, got: %#v", test.wantPool, gotPool) + } + }) + } +} From 14ae49f5b1629414eca8f1251d6611bd3ab093d3 Mon Sep 17 00:00:00 2001 From: Madhav Jivrajani <12381034+MadhavJivrajani@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:14:30 -0800 Subject: [PATCH 19/25] ext-proc: change Inference* APIs to use NamespacedName (#172) Signed-off-by: Madhav Jivrajani --- 
.../backend/inferencemodel_reconciler.go | 16 +++++----- .../backend/inferencemodel_reconciler_test.go | 5 ++-- .../backend/inferencepool_reconciler.go | 14 ++++----- pkg/ext-proc/main.go | 29 +++++++++++-------- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index d13612de..d8882e32 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -5,6 +5,7 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -13,15 +14,14 @@ import ( type InferenceModelReconciler struct { client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - Datastore *K8sDatastore - PoolName string - PoolNamespace string + Scheme *runtime.Scheme + Record record.EventRecorder + Datastore *K8sDatastore + PoolNamespacedName types.NamespacedName } func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.Namespace != c.PoolNamespace { + if req.Namespace != c.PoolNamespacedName.Namespace { return ctrl.Result{}, nil } klog.V(1).Info("reconciling InferenceModel", req.NamespacedName) @@ -43,8 +43,8 @@ func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { } func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) { - if infModel.Spec.PoolRef.Name == c.PoolName { - klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolName) + if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { + klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolNamespacedName.Name) klog.V(1).Infof("Adding/Updating inference model: %v", infModel.Spec.ModelName) 
c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) return diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 963630a4..5609ca53 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -6,6 +6,7 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ) var ( @@ -125,8 +126,8 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { InferenceModelReconciler := &InferenceModelReconciler{ - Datastore: test.datastore, - PoolName: test.datastore.inferencePool.Name, + Datastore: test.datastore, + PoolNamespacedName: types.NamespacedName{Name: test.datastore.inferencePool.Name}, } InferenceModelReconciler.updateDatastore(test.incomingService) diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index d45e2dbb..7bb50467 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -5,6 +5,7 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -16,16 +17,15 @@ import ( // will have the proper controller that will create/manage objects on behalf of the server pool. 
type InferencePoolReconciler struct { client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - PoolName string - PoolNamespace string - Datastore *K8sDatastore - Zone string + Scheme *runtime.Scheme + Record record.EventRecorder + PoolNamespacedName types.NamespacedName + Datastore *K8sDatastore + Zone string } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.NamespacedName.Name != c.PoolName || req.NamespacedName.Namespace != c.PoolNamespace { + if req.NamespacedName.Name != c.PoolNamespacedName.Name || req.NamespacedName.Namespace != c.PoolNamespacedName.Namespace { return ctrl.Result{}, nil } klog.V(1).Info("reconciling InferencePool", req.NamespacedName) diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index ed426b4c..68345312 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -27,6 +27,7 @@ import ( "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/scheduling" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" klog "k8s.io/klog/v2" @@ -124,23 +125,27 @@ func main() { } if err := (&backend.InferencePoolReconciler{ - Datastore: datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - PoolName: *poolName, - PoolNamespace: *poolNamespace, - Record: mgr.GetEventRecorderFor("InferencePool"), + Datastore: datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + PoolNamespacedName: types.NamespacedName{ + Name: *poolName, + Namespace: *poolNamespace, + }, + Record: mgr.GetEventRecorderFor("InferencePool"), }).SetupWithManager(mgr); err != nil { klog.Error(err, "Error setting up InferencePoolReconciler") } if err := (&backend.InferenceModelReconciler{ - Datastore: datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - PoolName: 
*poolName, - PoolNamespace: *poolNamespace, - Record: mgr.GetEventRecorderFor("InferenceModel"), + Datastore: datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + PoolNamespacedName: types.NamespacedName{ + Name: *poolName, + Namespace: *poolNamespace, + }, + Record: mgr.GetEventRecorderFor("InferenceModel"), }).SetupWithManager(mgr); err != nil { klog.Error(err, "Error setting up InferenceModelReconciler") } From 44da51e944c74f3253a6e80a42aafba6bf1daba8 Mon Sep 17 00:00:00 2001 From: Jie WU Date: Wed, 8 Jan 2025 23:17:15 +0000 Subject: [PATCH 20/25] Adding metrics handler --- pkg/ext-proc/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 68345312..85b33cee 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -19,6 +19,7 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1" "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend" From c50fed5c749da32cec5a9ae0bb76f54310c6127d Mon Sep 17 00:00:00 2001 From: Jie Wu Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH 21/25] add request metrics --- pkg/ext-proc/metrics/metrics.go | 72 +++++++++++ pkg/ext-proc/metrics/metrics_test.go | 105 ++++++++++++++++ .../testdata/request_duration_seconds_metric | 116 ++++++++++++++++++ .../metrics/testdata/request_sizes_metric | 86 +++++++++++++ .../metrics/testdata/request_total_metric | 5 + 5 files changed, 384 insertions(+) create mode 100644 pkg/ext-proc/metrics/metrics.go create mode 100644 
pkg/ext-proc/metrics/metrics_test.go create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_sizes_metric create mode 100644 pkg/ext-proc/metrics/testdata/request_total_metric diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go new file mode 100644 index 00000000..fe879724 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics.go @@ -0,0 +1,72 @@ +package metrics + +import ( + "sync" + "time" + + compbasemetrics "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +const ( + LLMServiceModelComponent = "llmservice_model" +) + +var ( + requestCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_total", + Help: "Counter of LLM service requests broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestLatencies = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_duration_seconds", + Help: "LLM service response latency distribution in seconds for each model and target model.", + Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) + + requestSizes = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: LLMServiceModelComponent, + Name: "request_sizes", + Help: "LLM service requests size distribution in bytes for each model and target model.", + // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). 
+ Buckets: []float64{ + 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, // Exponential up to 8MB + 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, // Exponential up to 1GB + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"llmservice_name", "model_name", "target_model_name"}, + ) +) + +var registerMetrics sync.Once + +// Register all metrics. +func Register() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestLatencies) + legacyregistry.MustRegister(requestSizes) + }) +} + +// MonitorRequest handles monitoring requests. +func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) { + elapsedSeconds := elapsed.Seconds() + requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc() + requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds) + requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize)) +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go new file mode 100644 index 00000000..4d33c6f0 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -0,0 +1,105 @@ +package metrics + +import ( + "os" + "testing" + "time" + + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/component-base/metrics/testutil" +) + +const RequestTotalMetric = LLMServiceModelComponent + "_request_total" +const RequestLatenciesMetric = LLMServiceModelComponent + "_request_duration_seconds" +const RequestSizesMetric = LLMServiceModelComponent + "_request_sizes" + +func TestMonitorRequest(t *testing.T) { + type requests struct { + llmserviceName string + modelName string + targetModelName string + reqSize int + elapsed time.Duration + } + scenarios := []struct { + name string + reqs 
[]requests + }{{ + name: "multiple requests", + reqs: []requests{ + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 1200, + elapsed: time.Millisecond * 10, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t10", + reqSize: 500, + elapsed: time.Millisecond * 1600, + }, + { + llmserviceName: "s10", + modelName: "m10", + targetModelName: "t11", + reqSize: 2480, + elapsed: time.Millisecond * 60, + }, + { + llmserviceName: "s20", + modelName: "m20", + targetModelName: "t20", + reqSize: 80, + elapsed: time.Millisecond * 120, + }, + }, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed) + } + wantRequestTotal, err := os.Open("testdata/request_total_metric") + defer func() { + if err := wantRequestTotal.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil { + t.Error(err) + } + wantRequestLatencies, err := os.Open("testdata/request_duration_seconds_metric") + defer func() { + if err := wantRequestLatencies.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestLatencies, RequestLatenciesMetric); err != nil { + t.Error(err) + } + wantRequestSizes, err := os.Open("testdata/request_sizes_metric") + defer func() { + if err := wantRequestSizes.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil { + t.Error(err) + } + + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric 
b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric new file mode 100644 index 00000000..921a03df --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric @@ -0,0 +1,116 @@ +# HELP llmservice_model_request_duration_seconds [ALPHA] LLM service response latency distribution in seconds for each model and target model. +# TYPE llmservice_model_request_duration_seconds histogram +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.025"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.05"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.0"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2"} 2 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="4"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="5"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="6"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="8"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="10"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="15"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="20"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="30"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="45"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="60"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="120"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="180"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="240"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="300"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", 
target_model_name="t10", le="360"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="480"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="600"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="900"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1200"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1800"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2700"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3600"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="Inf"} 2 +llmservice_model_request_duration_seconds_sum{llmservice_name="s10", model_name="m10", target_model_name="t10"} 1.61 +llmservice_model_request_duration_seconds_count{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.025"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.05"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.2"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="10"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="15"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="20"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="30"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="45"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="60"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="120"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="180"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="240"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="300"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="360"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="480"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="900"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1200"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1800"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2700"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 +llmservice_model_request_duration_seconds_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 0.06 
+llmservice_model_request_duration_seconds_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.005"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.025"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.05"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.1"} 0 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.25"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.5"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="10"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="15"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="20"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="30"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="45"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="60"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="120"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="180"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="240"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="300"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="360"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="480"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="900"} 1 
+llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1200"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1800"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2700"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3600"} 1 +llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 +llmservice_model_request_duration_seconds_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 0.12 +llmservice_model_request_duration_seconds_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/ext-proc/metrics/testdata/request_sizes_metric new file mode 100644 index 00000000..54f92c99 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_sizes_metric @@ -0,0 +1,86 @@ +# HELP llmservice_model_request_sizes [ALPHA] LLM service requests size distribution in bytes for each model and target model. 
+# TYPE llmservice_model_request_sizes histogram +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="128"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="256"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="512"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1024"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2048"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4096"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8192"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="16384"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="32768"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="65536"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="131072"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="262144"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="524288"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="+Inf"} 2 +llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t10"} 1700 +llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t10"} 2 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="128"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="256"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="512"} 0 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1024"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2048"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4096"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8192"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="16384"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="32768"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="65536"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="131072"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="262144"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="524288"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 +llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 2480 +llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="64"} 0 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="128"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="256"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="512"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1024"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2048"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4096"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8192"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="16384"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="32768"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="65536"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="131072"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="262144"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="524288"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 
+llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 +llmservice_model_request_sizes_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 80 +llmservice_model_request_sizes_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric new file mode 100644 index 00000000..f31feb65 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -0,0 +1,5 @@ +# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model. +# TYPE llmservice_model_request_total counter +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 +llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1 +llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1 From ac04b7cd7b66bab01e21c1f28c88640ca8c09542 Mon Sep 17 00:00:00 2001 From: Jie Wu Date: Thu, 12 Dec 2024 03:16:11 +0000 Subject: [PATCH 22/25] add request metrics --- pkg/ext-proc/metrics/testdata/request_duration_seconds_metic | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metic diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic new file mode 100644 index 00000000..e69de29b From 2b3df7b88423c38e70507dfb37353bef22cbf5af Mon Sep 17 00:00:00 2001 From: Jie WU Date: Mon, 6 Jan 2025 16:39:38 +0000 Subject: [PATCH 23/25] rename api and metrics --- pkg/ext-proc/metrics/metrics.go | 28 +-- pkg/ext-proc/metrics/metrics_test.go | 13 +- .../testdata/request_duration_seconds_metic | 0 .../testdata/request_duration_seconds_metric | 232 +++++++++--------- 
.../metrics/testdata/request_sizes_metric | 172 ++++++------- .../metrics/testdata/request_total_metric | 10 +- 6 files changed, 225 insertions(+), 230 deletions(-) delete mode 100644 pkg/ext-proc/metrics/testdata/request_duration_seconds_metic diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index fe879724..4ed823a6 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -9,37 +9,37 @@ import ( ) const ( - LLMServiceModelComponent = "llmservice_model" + InferenceModelComponent = "inference_model" ) var ( requestCounter = compbasemetrics.NewCounterVec( &compbasemetrics.CounterOpts{ - Subsystem: LLMServiceModelComponent, + Subsystem: InferenceModelComponent, Name: "request_total", - Help: "Counter of LLM service requests broken out for each model and target model.", + Help: "Counter of inference model requests broken out for each model and target model.", StabilityLevel: compbasemetrics.ALPHA, }, - []string{"llmservice_name", "model_name", "target_model_name"}, + []string{"model_name", "target_model_name"}, ) requestLatencies = compbasemetrics.NewHistogramVec( &compbasemetrics.HistogramOpts{ - Subsystem: LLMServiceModelComponent, + Subsystem: InferenceModelComponent, Name: "request_duration_seconds", - Help: "LLM service response latency distribution in seconds for each model and target model.", + Help: "Inference model response latency distribution in seconds for each model and target model.", Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, StabilityLevel: compbasemetrics.ALPHA, }, - []string{"llmservice_name", "model_name", "target_model_name"}, + []string{"model_name", "target_model_name"}, ) requestSizes = compbasemetrics.NewHistogramVec( &compbasemetrics.HistogramOpts{ - Subsystem: LLMServiceModelComponent, + Subsystem: InferenceModelComponent, Name: 
"request_sizes", - Help: "LLM service requests size distribution in bytes for each model and target model.", + Help: "Inference model requests size distribution in bytes for each model and target model.", // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). Buckets: []float64{ 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB @@ -48,7 +48,7 @@ var ( }, StabilityLevel: compbasemetrics.ALPHA, }, - []string{"llmservice_name", "model_name", "target_model_name"}, + []string{"model_name", "target_model_name"}, ) ) @@ -64,9 +64,9 @@ func Register() { } // MonitorRequest handles monitoring requests. -func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) { +func MonitorRequest(modelName, targetModelName string, reqSize int, elapsed time.Duration) { elapsedSeconds := elapsed.Seconds() - requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc() - requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds) - requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize)) + requestCounter.WithLabelValues(modelName, targetModelName).Inc() + requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds) + requestSizes.WithLabelValues(modelName, targetModelName).Observe(float64(reqSize)) } diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index 4d33c6f0..df83a5ed 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -9,13 +9,12 @@ import ( "k8s.io/component-base/metrics/testutil" ) -const RequestTotalMetric = LLMServiceModelComponent + "_request_total" -const RequestLatenciesMetric = LLMServiceModelComponent + "_request_duration_seconds" -const RequestSizesMetric = LLMServiceModelComponent + "_request_sizes" +const RequestTotalMetric = InferenceModelComponent + 
"_request_total" +const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" +const RequestSizesMetric = InferenceModelComponent + "_request_sizes" func TestMonitorRequest(t *testing.T) { type requests struct { - llmserviceName string modelName string targetModelName string reqSize int @@ -28,28 +27,24 @@ func TestMonitorRequest(t *testing.T) { name: "multiple requests", reqs: []requests{ { - llmserviceName: "s10", modelName: "m10", targetModelName: "t10", reqSize: 1200, elapsed: time.Millisecond * 10, }, { - llmserviceName: "s10", modelName: "m10", targetModelName: "t10", reqSize: 500, elapsed: time.Millisecond * 1600, }, { - llmserviceName: "s10", modelName: "m10", targetModelName: "t11", reqSize: 2480, elapsed: time.Millisecond * 60, }, { - llmserviceName: "s20", modelName: "m20", targetModelName: "t20", reqSize: 80, @@ -61,7 +56,7 @@ func TestMonitorRequest(t *testing.T) { for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { - MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed) + MonitorRequest(req.modelName, req.targetModelName, req.reqSize, req.elapsed) } wantRequestTotal, err := os.Open("testdata/request_total_metric") defer func() { diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metic deleted file mode 100644 index e69de29b..00000000 diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric index 921a03df..6c70b4ba 100644 --- a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric +++ b/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric @@ -1,116 +1,116 @@ -# HELP llmservice_model_request_duration_seconds [ALPHA] LLM service response latency distribution in seconds for each model and target model. 
-# TYPE llmservice_model_request_duration_seconds histogram -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.005"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.025"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.05"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="0.8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.0"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.25"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1.5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="4"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="5"} 2 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="6"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="8"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="10"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="15"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="20"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="30"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="45"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="60"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="120"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="180"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="240"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="300"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="360"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="480"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="600"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", 
model_name="m10", target_model_name="t10", le="900"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1200"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="1800"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="2700"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="3600"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10", model_name="m10", target_model_name="t10", le="Inf"} 2 -llmservice_model_request_duration_seconds_sum{llmservice_name="s10", model_name="m10", target_model_name="t10"} 1.61 -llmservice_model_request_duration_seconds_count{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.005"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.025"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.05"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="0.8"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.25"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="10"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="15"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="20"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="30"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="45"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="60"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="120"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="180"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="240"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="300"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="360"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="480"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="900"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1200"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1800"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2700"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 -llmservice_model_request_duration_seconds_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 0.06 -llmservice_model_request_duration_seconds_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.005"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.025"} 0 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.05"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.1"} 0 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="0.8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.25"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="10"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="15"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="20"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="30"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="45"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="60"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="120"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="180"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="240"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="300"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="360"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="480"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="900"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1200"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1800"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2700"} 1 
-llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3600"} 1 -llmservice_model_request_duration_seconds_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 -llmservice_model_request_duration_seconds_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 0.12 -llmservice_model_request_duration_seconds_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 +# HELP inference_model_request_duration_seconds [ALPHA] Inference model response latency distribution in seconds for each model and target model. +# TYPE inference_model_request_duration_seconds histogram +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.025"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2"} 2 
+inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="4"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="5"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="6"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="8"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="10"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="15"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="20"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="30"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="45"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="60"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="120"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="180"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="240"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="300"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="360"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="480"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="600"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="900"} 2 
+inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1200"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="1800"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="2700"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="3600"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 +inference_model_request_duration_seconds_sum{model_name="m10", target_model_name="t10"} 1.61 +inference_model_request_duration_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.025"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.05"} 0 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2"} 1
+inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="10"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="15"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="20"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="30"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="45"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="60"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="120"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="180"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="240"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="300"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="360"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="480"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="900"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1200"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="1800"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="2700"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="3600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_duration_seconds_sum{model_name="m10",target_model_name="t11"} 0.06 +inference_model_request_duration_seconds_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.005"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.025"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.05"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.1"} 0 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.2"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="0.8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.25"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1.5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="4"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="5"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="6"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="8"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="10"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="15"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="20"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="30"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="45"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="60"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="120"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="180"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="240"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="300"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="360"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="480"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="900"} 1 
+inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1200"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="1800"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="2700"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="3600"} 1 +inference_model_request_duration_seconds_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_duration_seconds_sum{model_name="m20",target_model_name="t20"} 0.12 +inference_model_request_duration_seconds_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/ext-proc/metrics/testdata/request_sizes_metric index 54f92c99..ceca532e 100644 --- a/pkg/ext-proc/metrics/testdata/request_sizes_metric +++ b/pkg/ext-proc/metrics/testdata/request_sizes_metric @@ -1,86 +1,86 @@ -# HELP llmservice_model_request_sizes [ALPHA] LLM service requests size distribution in bytes for each model and target model. 
-# TYPE llmservice_model_request_sizes histogram -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="64"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="128"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="256"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="512"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1024"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2048"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4096"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8192"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="16384"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="32768"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="65536"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="131072"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="262144"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="524288"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 
-llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t10",le="+Inf"} 2 -llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t10"} 1700 -llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t10"} 2 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="64"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="128"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="256"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="512"} 0 
-llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1024"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2048"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4096"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8192"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="16384"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="32768"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="65536"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="131072"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="262144"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="524288"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 
-llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s10",model_name="m10",target_model_name="t11",le="+Inf"} 1 -llmservice_model_request_sizes_sum{llmservice_name="s10",model_name="m10",target_model_name="t11"} 2480 -llmservice_model_request_sizes_count{llmservice_name="s10",model_name="m10",target_model_name="t11"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="64"} 0 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="128"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="256"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="512"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1024"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2048"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4096"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8192"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="16384"} 1 
-llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="32768"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="65536"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="131072"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="262144"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="524288"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 -llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 
-llmservice_model_request_sizes_bucket{llmservice_name="s20",model_name="m20",target_model_name="t20",le="+Inf"} 1 -llmservice_model_request_sizes_sum{llmservice_name="s20",model_name="m20",target_model_name="t20"} 80 -llmservice_model_request_sizes_count{llmservice_name="s20",model_name="m20",target_model_name="t20"} 1 +# HELP inference_model_request_sizes [ALPHA] Inference model requests size distribution in bytes for each model and target model. +# TYPE inference_model_request_sizes histogram +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="32768"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.097152e+06"} 2 
+inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="4.194304e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="8.388608e+06"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.6777216e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="3.3554432e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="6.7108864e+07"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.34217728e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="2.68435456e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="5.36870912e+08"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="1.073741824e+09"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_model_request_sizes_sum{model_name="m10",target_model_name="t10"} 1700 +inference_model_request_sizes_count{model_name="m10",target_model_name="t10"} 2 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 
+inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="32768"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.097152e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="4.194304e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="8.388608e+06"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.6777216e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="3.3554432e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="6.7108864e+07"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.34217728e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="2.68435456e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="5.36870912e+08"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="1.073741824e+09"} 1 +inference_model_request_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_request_sizes_sum{model_name="m10",target_model_name="t11"} 2480 +inference_model_request_sizes_count{model_name="m10",target_model_name="t11"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 
+inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="32768"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.097152e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="4.194304e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="8.388608e+06"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.6777216e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="3.3554432e+07"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="6.7108864e+07"} 1 
+inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.34217728e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="2.68435456e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="5.36870912e+08"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="1.073741824e+09"} 1 +inference_model_request_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_request_sizes_sum{model_name="m20",target_model_name="t20"} 80 +inference_model_request_sizes_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/ext-proc/metrics/testdata/request_total_metric index f31feb65..9c6f48a3 100644 --- a/pkg/ext-proc/metrics/testdata/request_total_metric +++ b/pkg/ext-proc/metrics/testdata/request_total_metric @@ -1,5 +1,5 @@ -# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model. -# TYPE llmservice_model_request_total counter -llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2 -llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1 -llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1 +# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+# TYPE inference_model_request_total counter +inference_model_request_total{model_name="m10", target_model_name="t10"} 2 +inference_model_request_total{model_name="m10", target_model_name="t11"} 1 +inference_model_request_total{model_name="m20", target_model_name="t20"} 1 From 4b26bb374a50f26a1d7e773daa8066b65261315a Mon Sep 17 00:00:00 2001 From: Jie WU Date: Wed, 8 Jan 2025 23:16:44 +0000 Subject: [PATCH 24/25] Adding metrics handler --- pkg/manifests/ext_proc.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index baa04d60..dfcfdc3e 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -59,7 +59,8 @@ spec: - "vllm-llama2-7b-pool" ports: - containerPort: 9002 - + - name: metrics + containerPort: 9090 - name: curl image: curlimages/curl command: ["sleep", "3600"] From cb45488ae75eada3c47bf5f21dd6346a9ffa3ee6 Mon Sep 17 00:00:00 2001 From: Jie WU Date: Wed, 8 Jan 2025 23:17:15 +0000 Subject: [PATCH 25/25] Adding metrics handler --- pkg/ext-proc/handlers/request.go | 4 ++++ pkg/ext-proc/main.go | 5 +++++ pkg/ext-proc/metrics/metrics_handler.go | 29 +++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 pkg/ext-proc/metrics/metrics_handler.go diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 16c3f4f0..abb03c72 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -5,10 +5,12 @@ import ( "errors" "fmt" "strconv" + "time" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" klog "k8s.io/klog/v2" ) @@ -18,6 +20,7 @@ import ( 
// Envoy sends the request body to ext proc before sending the request to the backend server. func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { klog.V(3).Infof("Handling request body") + requestReceivedTimestamp := time.Now() // Unmarshal request body (must be JSON). v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) @@ -116,6 +119,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces }, }, } + metrics.MonitorRequest(llmReq.Model, llmReq.ResolvedTargetModel, len(v.RequestBody.Body), time.Since(requestReceivedTimestamp)) return resp, nil } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index e8a41667..d47a6748 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -19,6 +19,7 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -33,6 +34,8 @@ var ( "port", 9002, "gRPC port") + metricsPort = flag.Int( + "metricsPort", 9090, "metrics port") targetPodHeader = flag.String( "targetPodHeader", "target-pod", @@ -104,6 +107,8 @@ func main() { klog.Fatalf("failed to listen: %v", err) } + metrics.Register() + go metrics.StartMetricsHandler(*metricsPort) datastore := backend.NewK8sDataStore() mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ diff --git a/pkg/ext-proc/metrics/metrics_handler.go b/pkg/ext-proc/metrics/metrics_handler.go new file mode 100644 index 00000000..7cc7b5f4 --- /dev/null +++ b/pkg/ext-proc/metrics/metrics_handler.go @@ -0,0 +1,29 @@ +package 
metrics + +import ( + "net" + "net/http" + "strconv" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/klog/v2" +) + +func StartMetricsHandler(port int) { + klog.Info("Starting metrics HTTP handler ...") + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor( + legacyregistry.DefaultGatherer, + promhttp.HandlerOpts{}, + )) + + server := &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } + if err := server.ListenAndServe(); err != http.ErrServerClosed { + klog.Fatalf("failed to start metrics HTTP handler: %v", err) + } +}