
Commit 319587c

[Metrics] Add running requests gauge metric
1 parent 1ce8273 commit 319587c
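
Adds a `running_requests` gauge to the inference model metrics, labeled by `model_name`. The gauge is incremented once a request's body has been forwarded to the model server and decremented when the request's stream finishes, with a `RequestRunning` flag on the request context ensuring each request is decremented at most once.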

6 files changed: +99 −2 lines changed

pkg/epp/handlers/server.go (+1)

@@ -228,6 +228,7 @@ type RequestContext struct {
 	ResponseSize       int
 	ResponseComplete   bool
 	ResponseStatusCode string
+	RequestRunning     bool
 
 	RequestState         StreamRequestState
 	modelServerStreaming bool
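
The new `RequestRunning` field records whether the gauge has been incremented for this request, so the deferred cleanup in `streamingserver.go` (next diff) knows whether a matching decrement is owed.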

pkg/epp/handlers/streamingserver.go (+7 −2)

@@ -81,13 +81,16 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 	// error metrics. This doesn't cover the error "Cannot receive stream request" because
 	// such errors might happen even though response is processed.
 	var err error
-	defer func(error) {
+	defer func(error, *RequestContext) {
 		if reqCtx.ResponseStatusCode != "" {
 			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode)
 		} else if err != nil {
 			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err))
 		}
-	}(err)
+		if reqCtx.RequestRunning {
+			metrics.DecRunningRequests(reqCtx.Model)
+		}
+	}(err, reqCtx)
 
 	for {
 		select {
@@ -269,6 +272,8 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces
 			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
 		}
 		r.RequestState = BodyRequestResponsesComplete
+		metrics.IncRunningRequests(r.Model)
+		r.RequestRunning = true
 		// Dump the response so a new stream message can begin
 		r.reqBodyResp = nil
 	}
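
The pairing above is a standard gauge-lifecycle pattern: increment exactly once when the request transitions to running, and decrement from a deferred cleanup guarded by the flag, so requests that error out before the body is forwarded never decrement a gauge they never incremented. A minimal, self-contained sketch of that pattern — `gauge`, `requestContext`, and `process` here are illustrative stand-ins, not the actual EPP types:

```go
package main

import "fmt"

// gauge is an illustrative stand-in for the Prometheus gauge vector.
type gauge struct{ running map[string]int }

func (g *gauge) inc(model string) { g.running[model]++ }
func (g *gauge) dec(model string) { g.running[model]-- }

// requestContext mirrors the commit's RequestRunning flag.
type requestContext struct {
	model          string
	requestRunning bool // set only after the gauge is incremented
}

// process increments once when the request becomes "running" and relies on
// the deferred, flag-guarded cleanup to decrement exactly once.
func process(g *gauge, reqCtx *requestContext, failEarly bool) {
	defer func() {
		if reqCtx.requestRunning {
			g.dec(reqCtx.model)
		}
	}()

	if failEarly {
		return // gauge untouched: requestRunning is still false
	}

	g.inc(reqCtx.model)
	reqCtx.requestRunning = true
	// ... stream the response back ...
}

func main() {
	g := &gauge{running: map[string]int{}}
	process(g, &requestContext{model: "m1"}, false) // inc + dec
	process(g, &requestContext{model: "m1"}, true)  // neither
	fmt.Println(g.running["m1"]) // 0
}
```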

pkg/epp/metrics/metrics.go (+25)

@@ -121,6 +121,16 @@ var (
 		[]string{"model_name", "target_model_name"},
 	)
 
+	runningRequests = compbasemetrics.NewGaugeVec(
+		&compbasemetrics.GaugeOpts{
+			Subsystem:      InferenceModelComponent,
+			Name:           "running_requests",
+			Help:           "Inference model number of running requests in each model.",
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name"},
+	)
+
 	// Inference Pool Metrics
 	inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
 		&compbasemetrics.GaugeOpts{
@@ -155,6 +165,7 @@ func Register() {
 	legacyregistry.MustRegister(responseSizes)
 	legacyregistry.MustRegister(inputTokens)
 	legacyregistry.MustRegister(outputTokens)
+	legacyregistry.MustRegister(runningRequests)
 
 	legacyregistry.MustRegister(inferencePoolAvgKVCache)
 	legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -209,6 +220,20 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
 	}
 }
 
+// IncRunningRequests increases the current running requests.
+func IncRunningRequests(modelName string) {
+	if modelName != "" {
+		runningRequests.WithLabelValues(modelName).Inc()
+	}
+}
+
+// DecRunningRequests decreases the current running requests.
+func DecRunningRequests(modelName string) {
+	if modelName != "" {
+		runningRequests.WithLabelValues(modelName).Dec()
+	}
+}
+
 func RecordInferencePoolAvgKVCache(name string, utilization float64) {
 	inferencePoolAvgKVCache.WithLabelValues(name).Set(utilization)
 }
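
The helpers guard against an empty `model_name` so the gauge never accumulates an unlabeled series. As a rough equivalent for readers who don't use `k8s.io/component-base/metrics`, the sketch below reproduces the same gauge and guards with the plain Prometheus client; the `inference_model` subsystem string is carried over from the diff, but the stability-level annotation and legacy-registry wiring are specific to component-base and are omitted here:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// runningRequests mirrors the commit's gauge using the plain Prometheus
// client: one gauge per model_name label value.
var runningRequests = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Subsystem: "inference_model",
		Name:      "running_requests",
		Help:      "Inference model number of running requests in each model.",
	},
	[]string{"model_name"},
)

// IncRunningRequests and DecRunningRequests skip empty model names,
// matching the commit's helpers.
func IncRunningRequests(modelName string) {
	if modelName != "" {
		runningRequests.WithLabelValues(modelName).Inc()
	}
}

func DecRunningRequests(modelName string) {
	if modelName != "" {
		runningRequests.WithLabelValues(modelName).Dec()
	}
}

func main() {
	prometheus.MustRegister(runningRequests)
	IncRunningRequests("m1")
	IncRunningRequests("m1")
	DecRunningRequests("m1")
	// testutil.ToFloat64 reads the current value of a single-series collector.
	fmt.Println(testutil.ToFloat64(runningRequests.WithLabelValues("m1"))) // 1
}
```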

pkg/epp/metrics/metrics_test.go (+61)

@@ -36,6 +36,7 @@ const (
 	ResponseSizesMetric   = InferenceModelComponent + "_response_sizes"
 	InputTokensMetric     = InferenceModelComponent + "_input_tokens"
 	OutputTokensMetric    = InferenceModelComponent + "_output_tokens"
+	RunningRequestsMetric = InferenceModelComponent + "_running_requests"
 	KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
 	QueueAvgSizeMetric    = InferencePoolComponent + "_average_queue_size"
 )
@@ -345,6 +346,66 @@ func TestRecordResponseMetrics(t *testing.T) {
 	}
 }
 
+func TestRunningRequestsMetrics(t *testing.T) {
+	type request struct {
+		modelName string
+		complete  bool // true -> request is completed, false -> running request
+	}
+
+	scenarios := []struct {
+		name     string
+		requests []request
+	}{
+		{
+			name: "basic test",
+			requests: []request{
+				{
+					modelName: "m1",
+					complete:  false,
+				},
+				{
+					modelName: "m1",
+					complete:  false,
+				},
+				{
+					modelName: "m1",
+					complete:  true,
+				},
+				{
+					modelName: "m2",
+					complete:  false,
+				},
+			},
+		},
+	}
+
+	Register()
+	for _, scenario := range scenarios {
+		t.Run(scenario.name, func(t *testing.T) {
+			for _, req := range scenario.requests {
+				if req.complete {
+					DecRunningRequests(req.modelName)
+				} else {
+					IncRunningRequests(req.modelName)
+				}
+			}
+
+			wantRunningRequests, err := os.Open("testdata/running_requests_metrics")
+			defer func() {
+				if err := wantRunningRequests.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err != nil {
+				t.Fatal(err)
+			}
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRunningRequests, RunningRequestsMetric); err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
+
 func TestInferencePoolMetrics(t *testing.T) {
 	scenarios := []struct {
 		name string
pkg/epp/metrics/testdata/running_requests_metrics (+4)

@@ -0,0 +1,4 @@
+# HELP inference_model_running_requests [ALPHA] Inference model number of running requests in each model.
+# TYPE inference_model_running_requests gauge
+inference_model_running_requests{model_name="m1"} 1
+inference_model_running_requests{model_name="m2"} 1
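
The expected values follow from the test scenario: `m1` is incremented twice and decremented once, leaving 1, while `m2` is incremented once and never decremented, also leaving 1.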

site-src/guides/metrics.md (+1)

@@ -30,6 +30,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
 | inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
 | inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
+| inference_model_running_requests | Gauge | Number of running requests for each model. | `model_name`=&lt;model-name&gt; | ALPHA |
 | inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=&lt;inference-pool-name&gt; | ALPHA |
 | inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=&lt;inference-pool-name&gt; | ALPHA |
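
Once deployed, the gauge appears on the EPP metrics endpoint in the exposition format shown in the testdata above; a value such as `inference_model_running_requests{model_name="m1"} 2` would indicate two in-flight requests for model `m1`.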
