diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go
index 8c73d488..d71dc3fa 100644
--- a/pkg/epp/backend/metrics/logger.go
+++ b/pkg/epp/backend/metrics/logger.go
@@ -110,4 +110,5 @@ func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) {
 	podTotalCount := len(podMetrics)
 	metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount))
 	metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal/podTotalCount))
+	metrics.RecordInferencePoolReadyPods(pool.Name, float64(podTotalCount))
 }
diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index 9ff2bb79..434b8381 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -151,6 +151,16 @@ var (
 		},
 		[]string{"name"},
 	)
+
+	inferencePoolReadyPods = compbasemetrics.NewGaugeVec(
+		&compbasemetrics.GaugeOpts{
+			Subsystem:      InferencePoolComponent,
+			Name:           "ready_pods",
+			Help:           "The number of ready pods in the inference server pool.",
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"name"},
+	)
 )
 
 var registerMetrics sync.Once
@@ -169,6 +179,7 @@ func Register() {
 		legacyregistry.MustRegister(inferencePoolAvgKVCache)
 		legacyregistry.MustRegister(inferencePoolAvgQueueSize)
+		legacyregistry.MustRegister(inferencePoolReadyPods)
 	})
 }
@@ -241,3 +252,7 @@ func RecordInferencePoolAvgKVCache(name string, utilization float64) {
 func RecordInferencePoolAvgQueueSize(name string, queueSize float64) {
 	inferencePoolAvgQueueSize.WithLabelValues(name).Set(queueSize)
 }
+
+func RecordInferencePoolReadyPods(name string, readyPods float64) {
+	inferencePoolReadyPods.WithLabelValues(name).Set(readyPods)
+}
diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
index d0747307..a781f721 100644
--- a/site-src/guides/metrics.md
+++ b/site-src/guides/metrics.md
@@ -33,6 +33,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
 | inference_model_running_requests | Gauge | Number of running requests for each model. | `model_name`=<model-name> | ALPHA |
 | inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
 | inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA |
+| inference_pool_ready_pods | Gauge | The number of ready pods for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
 
 ## Scrape Metrics
diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
index 8e02aca4..2acdacf8 100644
--- a/test/integration/epp/hermetic_test.go
+++ b/test/integration/epp/hermetic_test.go
@@ -430,7 +430,13 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
-				`},
+				`,
+				`inference_pool_ready_pods`: `
+				# HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool.
+				# TYPE inference_pool_ready_pods gauge
+				inference_pool_ready_pods{name="vllm-llama3-8b-instruct-pool"} 3
+				`,
+			},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -1465,6 +1471,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
+			wantMetrics: map[string]string{`inference_pool_ready_pods`: `
+				# HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool.
+				# TYPE inference_pool_ready_pods gauge
+				inference_pool_ready_pods{name="vllm-llama3-8b-instruct-pool"} 1
+				`},
 		},
 	}
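The hermetic test above exercises the new metric only end to end. For reference, here is a minimal standalone sketch of how RecordInferencePoolReadyPods could be verified in isolation. The import path (sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics) and the pool name "test-pool" are assumptions for illustration; legacyregistry.DefaultGatherer and testutil.GatherAndCompare come from k8s.io/component-base, which the patched code already depends on.

// Minimal sketch, not part of the patch: a unit test for the new recorder.
package metrics_test

import (
	"strings"
	"testing"

	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/component-base/metrics/testutil"

	// Assumed import path; adjust to the repository's actual module path.
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
)

func TestRecordInferencePoolReadyPods(t *testing.T) {
	metrics.Register() // safe to call more than once; guarded by sync.Once
	metrics.RecordInferencePoolReadyPods("test-pool", 3)

	// Expected exposition text mirrors the Help string registered in metrics.go.
	want := `# HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool.
# TYPE inference_pool_ready_pods gauge
inference_pool_ready_pods{name="test-pool"} 3
`
	if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer,
		strings.NewReader(want), "inference_pool_ready_pods"); err != nil {
		t.Fatal(err)
	}
}

Using Set rather than Add fits the gauge semantics here: each flushPrometheusMetricsOnce pass overwrites the previous value, so a pool that scales down is reflected on the next flush instead of accumulating.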