Skip to content

Commit 352f3a2

Browse files
committed
[Metrics] Add running requests gauge metric
1 parent 16ded66 commit 352f3a2

File tree

5 files changed

+93
-0
lines changed

5 files changed

+93
-0
lines changed

Diff for: pkg/epp/handlers/streamingserver.go

+2
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
137137
case *extProcPb.ProcessingRequest_RequestTrailers:
138138
// This is currently unused.
139139
case *extProcPb.ProcessingRequest_ResponseHeaders:
140+
metrics.DecRunningRequests(reqCtx.Model)
140141
for _, header := range v.ResponseHeaders.Headers.GetHeaders() {
141142
value := string(header.RawValue)
142143

@@ -322,6 +323,7 @@ func (s *StreamingServer) HandleRequestBody(
322323
if !ok {
323324
return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"}
324325
}
326+
metrics.IncRunningRequests(model)
325327

326328
modelName := model
327329

Diff for: pkg/epp/metrics/metrics.go

+25
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ var (
121121
[]string{"model_name", "target_model_name"},
122122
)
123123

124+
runningRequests = compbasemetrics.NewGaugeVec(
125+
&compbasemetrics.GaugeOpts{
126+
Subsystem: InferenceModelComponent,
127+
Name: "running_requests",
128+
Help: "Inference model number of running requests in each model.",
129+
StabilityLevel: compbasemetrics.ALPHA,
130+
},
131+
[]string{"model_name"},
132+
)
133+
124134
// Inference Pool Metrics
125135
inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
126136
&compbasemetrics.GaugeOpts{
@@ -155,6 +165,7 @@ func Register() {
155165
legacyregistry.MustRegister(responseSizes)
156166
legacyregistry.MustRegister(inputTokens)
157167
legacyregistry.MustRegister(outputTokens)
168+
legacyregistry.MustRegister(runningRequests)
158169

159170
legacyregistry.MustRegister(inferencePoolAvgKVCache)
160171
legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -209,6 +220,20 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
209220
}
210221
}
211222

223+
// IncRunningRequests increases the current running requests.
224+
func IncRunningRequests(modelName string) {
225+
if modelName != "" {
226+
runningRequests.WithLabelValues(modelName).Inc()
227+
}
228+
}
229+
230+
// DecRunningRequests decreases the current running requests.
231+
func DecRunningRequests(modelName string) {
232+
if modelName != "" {
233+
runningRequests.WithLabelValues(modelName).Dec()
234+
}
235+
}
236+
212237
func RecordInferencePoolAvgKVCache(name string, utilization float64) {
213238
inferencePoolAvgKVCache.WithLabelValues(name).Set(utilization)
214239
}

Diff for: pkg/epp/metrics/metrics_test.go

+61
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ const (
3636
ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
3737
InputTokensMetric = InferenceModelComponent + "_input_tokens"
3838
OutputTokensMetric = InferenceModelComponent + "_output_tokens"
39+
RunningRequestsMetric = InferenceModelComponent + "_running_requests"
3940
KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
4041
QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
4142
)
@@ -345,6 +346,66 @@ func TestRecordResponseMetrics(t *testing.T) {
345346
}
346347
}
347348

349+
func TestRunningRequestsMetrics(t *testing.T) {
350+
type request struct {
351+
modelName string
352+
complete bool // true -> request is completed, false -> running request
353+
}
354+
355+
scenarios := []struct {
356+
name string
357+
requests []request
358+
}{
359+
{
360+
name: "basic test",
361+
requests: []request{
362+
{
363+
modelName: "m1",
364+
complete: false,
365+
},
366+
{
367+
modelName: "m1",
368+
complete: false,
369+
},
370+
{
371+
modelName: "m1",
372+
complete: true,
373+
},
374+
{
375+
modelName: "m2",
376+
complete: false,
377+
},
378+
},
379+
},
380+
}
381+
382+
Register()
383+
for _, scenario := range scenarios {
384+
t.Run(scenario.name, func(t *testing.T) {
385+
for _, req := range scenario.requests {
386+
if req.complete {
387+
DecRunningRequests(req.modelName)
388+
} else {
389+
IncRunningRequests(req.modelName)
390+
}
391+
}
392+
393+
wantRunningRequests, err := os.Open("testdata/running_requests_metrics")
394+
defer func() {
395+
if err := wantRunningRequests.Close(); err != nil {
396+
t.Error(err)
397+
}
398+
}()
399+
if err != nil {
400+
t.Fatal(err)
401+
}
402+
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRunningRequests, RunningRequestsMetric); err != nil {
403+
t.Error(err)
404+
}
405+
})
406+
}
407+
}
408+
348409
func TestInferencePoolMetrics(t *testing.T) {
349410
scenarios := []struct {
350411
name string

Diff for: pkg/epp/metrics/testdata/running_requests_metrics

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HELP inference_model_running_requests [ALPHA] Inference model number of running requests in each model.
2+
# TYPE inference_model_running_requests gauge
3+
inference_model_running_requests{model_name="m1"} 1
4+
inference_model_running_requests{model_name="m2"} 1

Diff for: site-src/guides/metrics.md

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
4949
| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
5050
| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
5151
| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
52+
| inference_model_running_requests | Gauge | Number of running requests for each model. | `model_name`=&lt;model-name&gt; | ALPHA |
5253
| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=&lt;inference-pool-name&gt; | ALPHA |
5354
| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=&lt;inference-pool-name&gt; | ALPHA |
5455

0 commit comments

Comments
 (0)