Skip to content

Adding metrics for request total, latency and size #176

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ff259d8
add request metrics
courageJ Dec 12, 2024
bb98cdc
add request metrics
courageJ Dec 12, 2024
d84926b
Merge remote-tracking branch 'origin/requests' into requests
courageJ Dec 12, 2024
7791550
add request metrics
courageJ Dec 12, 2024
129a756
add request metrics
courageJ Dec 12, 2024
6642dfa
Merge remote-tracking branch 'origin/main'
courageJ Jan 6, 2025
e8d45f0
rename api and metrics
courageJ Jan 6, 2025
28f3999
fix go mod
courageJ Jan 6, 2025
59272c1
Adding metrics handler
courageJ Jan 8, 2025
c04fc29
Adding metrics handler
courageJ Jan 8, 2025
48124e4
typo
courageJ Jan 8, 2025
80039ef
updating the boilerplate template (#156)
kfswain Jan 6, 2025
5001fb2
Bump github.com/envoyproxy/go-control-plane from 0.13.1 to 0.13.3 (#155)
dependabot[bot] Jan 6, 2025
c1ac053
Updating non-generated docs/ minor formatting (#160)
kfswain Jan 6, 2025
1252b8f
Bump github.com/onsi/ginkgo/v2 from 2.22.0 to 2.22.2 (#138)
dependabot[bot] Jan 6, 2025
6abf2e3
Bump google.golang.org/grpc from 1.69.0 to 1.69.2 (#133)
dependabot[bot] Jan 6, 2025
111864d
.*: change llm-instance-gateway -> gateway-api-inference-extension (#…
MadhavJivrajani Jan 7, 2025
774fb62
Changes InferencePool EPP Flags (#152)
danehans Jan 7, 2025
3a81901
ext-proc: remove unused fields from EndpointSliceReconciler (#165)
MadhavJivrajani Jan 8, 2025
1ed2d8d
ext-proc/backend: add unit test for InferencePoolReconciler (#168)
MadhavJivrajani Jan 8, 2025
14ae49f
ext-proc: change Inference* APIs to use NamespacedName (#172)
MadhavJivrajani Jan 8, 2025
44da51e
Adding metrics handler
courageJ Jan 8, 2025
c50fed5
add request metrics
courageJ Dec 12, 2024
ac04b7c
add request metrics
courageJ Dec 12, 2024
2b3df7b
rename api and metrics
courageJ Jan 6, 2025
4b26bb3
Adding metrics handler
courageJ Jan 8, 2025
cb45488
Adding metrics handler
courageJ Jan 8, 2025
4f32179
Merge remote-tracking branch 'origin/main'
courageJ Jan 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ require (
k8s.io/apimachinery v0.31.4
k8s.io/client-go v0.31.4
k8s.io/code-generator v0.31.4
k8s.io/component-base v0.31.4
k8s.io/klog/v2 v2.130.1
sigs.k8s.io/controller-runtime v0.19.3
sigs.k8s.io/structured-merge-diff/v4 v4.5.0
Expand All @@ -35,6 +36,7 @@ require (
github.com/Masterminds/sprig/v3 v3.2.3 // indirect
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/bufbuild/protocompile v0.14.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect
Expand Down Expand Up @@ -63,6 +65,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mitchellh/copystructure v1.0.0 // indirect
github.com/mitchellh/reflectwalk v1.0.1 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafo
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo=
github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE=
github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw=
Expand Down Expand Up @@ -278,6 +280,8 @@ k8s.io/client-go v0.31.4 h1:t4QEXt4jgHIkKKlx06+W3+1JOwAFU/2OPiOo7H92eRQ=
k8s.io/client-go v0.31.4/go.mod h1:kvuMro4sFYIa8sulL5Gi5GFqUPvfH2O/dXuKstbaaeg=
k8s.io/code-generator v0.31.4 h1:Vu+8fKz+239rKiVDHFVHgjQ162cg5iUQPtTyQbwXeQw=
k8s.io/code-generator v0.31.4/go.mod h1:yMDt13Kn7m4MMZ4LxB1KBzdZjEyxzdT4b4qXq+lnI90=
k8s.io/component-base v0.31.4 h1:wCquJh4ul9O8nNBSB8N/o8+gbfu3BVQkVw9jAUY/Qtw=
k8s.io/component-base v0.31.4/go.mod h1:G4dgtf5BccwiDT9DdejK0qM6zTK0jwDGEKnCmb9+u/s=
k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo=
k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
Expand Down
4 changes: 4 additions & 0 deletions pkg/ext-proc/handlers/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ import (
"errors"
"fmt"
"strconv"
"time"

configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling"
klog "k8s.io/klog/v2"
)
Expand All @@ -18,6 +20,7 @@ import (
// Envoy sends the request body to ext proc before sending the request to the backend server.
func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
klog.V(3).Infof("Handling request body")
requestReceivedTimestamp := time.Now()

// Unmarshal request body (must be JSON).
v := req.Request.(*extProcPb.ProcessingRequest_RequestBody)
Expand Down Expand Up @@ -116,6 +119,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
},
},
}
metrics.MonitorRequest(llmReq.Model, llmReq.ResolvedTargetModel, len(v.RequestBody.Body), time.Since(requestReceivedTimestamp))
return resp, nil
}

Expand Down
5 changes: 5 additions & 0 deletions pkg/ext-proc/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics"
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -33,6 +34,8 @@ var (
"port",
9002,
"gRPC port")
metricsPort = flag.Int(
"metricsPort", 9090, "metrics port")
targetPodHeader = flag.String(
"targetPodHeader",
"target-pod",
Expand Down Expand Up @@ -104,6 +107,8 @@ func main() {
klog.Fatalf("failed to listen: %v", err)
}

metrics.Register()
go metrics.StartMetricsHandler(*metricsPort)
datastore := backend.NewK8sDataStore()

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Expand Down
72 changes: 72 additions & 0 deletions pkg/ext-proc/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package metrics

import (
"sync"
"time"

compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)

const (
// InferenceModelComponent is the subsystem set on every metric declared in
// this file.
InferenceModelComponent = "inference_model"
)

// Request metrics. Every vector is labelled by "model_name" and
// "target_model_name" and is declared at ALPHA stability.
var (
// requestCounter counts requests per (model, target model) pair.
requestCounter = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Subsystem: InferenceModelComponent,
Name: "request_total",
Help: "Counter of inference model requests broken out for each model and target model.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"model_name", "target_model_name"},
)

// requestLatencies is a histogram of request durations in seconds, with
// bucket upper bounds from 5ms up to 3600s (one hour).
requestLatencies = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Subsystem: InferenceModelComponent,
Name: "request_duration_seconds",
Help: "Inference model response latency distribution in seconds for each model and target model.",
Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"model_name", "target_model_name"},
)

// requestSizes is a histogram of request sizes in bytes.
requestSizes = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Subsystem: InferenceModelComponent,
Name: "request_sizes",
Help: "Inference model requests size distribution in bytes for each model and target model.",
// Bucket upper bounds double at every step, from 64 bytes to 2^30 bytes (1GiB).
Buckets: []float64{
64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // 64B to 64KiB
131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, // 128KiB to 8MiB
16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, // 16MiB to 1GiB
},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"model_name", "target_model_name"},
)
)

// registerMetrics ensures the metrics below are registered only once, no
// matter how many times Register is called.
var registerMetrics sync.Once

// registerAllMetrics adds each request metric of this package to the legacy
// registry, one call per metric.
func registerAllMetrics() {
	legacyregistry.MustRegister(requestCounter)
	legacyregistry.MustRegister(requestLatencies)
	legacyregistry.MustRegister(requestSizes)
}

// Register registers all metrics of this package with the legacy registry.
// Only the first call performs the registration; later calls are no-ops.
func Register() {
	registerMetrics.Do(registerAllMetrics)
}

// MonitorRequest records a single request under the (modelName,
// targetModelName) label pair: it increments the request counter, observes
// elapsed (converted to seconds) in the latency histogram, and observes
// reqSize in the size histogram.
func MonitorRequest(modelName, targetModelName string, reqSize int, elapsed time.Duration) {
	sizeBytes := float64(reqSize)

	requestCounter.WithLabelValues(modelName, targetModelName).Inc()
	requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsed.Seconds())
	requestSizes.WithLabelValues(modelName, targetModelName).Observe(sizeBytes)
}
29 changes: 29 additions & 0 deletions pkg/ext-proc/metrics/metrics_handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package metrics

import (
	"errors"
	"net"
	"net/http"
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/klog/v2"
)

// StartMetricsHandler serves the metrics gathered by the legacy registry at
// /metrics on the given port. The listen address has an empty host, so the
// server is not bound to one specific interface.
//
// The call blocks for as long as the server is running. If ListenAndServe
// returns any error other than http.ErrServerClosed, the process is terminated
// via klog.Fatalf.
func StartMetricsHandler(port int) {
	klog.Info("Starting metrics HTTP handler ...")

	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.HandlerFor(
		legacyregistry.DefaultGatherer,
		promhttp.HandlerOpts{},
	))

	server := &http.Server{
		Addr:    net.JoinHostPort("", strconv.Itoa(port)),
		Handler: mux,
		// Bound the time a client may take to send its request headers so that
		// stalled connections cannot be held open indefinitely.
		ReadHeaderTimeout: 10 * time.Second,
	}
	// errors.Is also matches ErrServerClosed when it has been wrapped.
	if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
		klog.Fatalf("failed to start metrics HTTP handler: %v", err)
	}
}
100 changes: 100 additions & 0 deletions pkg/ext-proc/metrics/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package metrics

import (
"os"
"testing"
"time"

"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/component-base/metrics/testutil"
)

// RequestTotalMetric is the metric name used to select the request counter
// when comparing gathered metrics against the expected output.
const RequestTotalMetric = InferenceModelComponent + "_request_total"
// RequestLatenciesMetric is the metric name used to select the request
// duration histogram when comparing gathered metrics.
const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
// RequestSizesMetric is the metric name used to select the request size
// histogram when comparing gathered metrics.
const RequestSizesMetric = InferenceModelComponent + "_request_sizes"

func TestMonitorRequest(t *testing.T) {
type requests struct {
modelName string
targetModelName string
reqSize int
elapsed time.Duration
}
scenarios := []struct {
name string
reqs []requests
}{{
name: "multiple requests",
reqs: []requests{
{
modelName: "m10",
targetModelName: "t10",
reqSize: 1200,
elapsed: time.Millisecond * 10,
},
{
modelName: "m10",
targetModelName: "t10",
reqSize: 500,
elapsed: time.Millisecond * 1600,
},
{
modelName: "m10",
targetModelName: "t11",
reqSize: 2480,
elapsed: time.Millisecond * 60,
},
{
modelName: "m20",
targetModelName: "t20",
reqSize: 80,
elapsed: time.Millisecond * 120,
},
},
}}
Register()
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
for _, req := range scenario.reqs {
MonitorRequest(req.modelName, req.targetModelName, req.reqSize, req.elapsed)
}
wantRequestTotal, err := os.Open("testdata/request_total_metric")
defer func() {
if err := wantRequestTotal.Close(); err != nil {
t.Error(err)
}
}()
if err != nil {
t.Fatal(err)
}
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil {
t.Error(err)
}
wantRequestLatencies, err := os.Open("testdata/request_duration_seconds_metric")
defer func() {
if err := wantRequestLatencies.Close(); err != nil {
t.Error(err)
}
}()
if err != nil {
t.Fatal(err)
}
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestLatencies, RequestLatenciesMetric); err != nil {
t.Error(err)
}
wantRequestSizes, err := os.Open("testdata/request_sizes_metric")
defer func() {
if err := wantRequestSizes.Close(); err != nil {
t.Error(err)
}
}()
if err != nil {
t.Fatal(err)
}
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil {
t.Error(err)
}

})
}
}
Loading