Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 3aa0f12

Browse files
Committed Apr 3, 2025
change metric name from ntpot to normalized time per output token
1 parent 9857871 commit 3aa0f12

File tree

6 files changed

+64
-64
lines changed

6 files changed

+64
-64
lines changed
 

pkg/epp/handlers/server.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
130130
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
131131
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
132132
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
133-
metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
133+
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
134134
}
135135
if reqCtx.modelServerStreaming {
136136
logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)

pkg/epp/handlers/streamingserver.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
184184
reqCtx.ResponseCompleteTimestamp = time.Now()
185185
metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
186186
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
187-
metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
187+
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
188188
}
189189

190190
reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
@@ -227,7 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
227227
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
228228
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
229229
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
230-
metrics.RecordLatencyPerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
230+
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
231231
}
232232
}
233233
}

pkg/epp/metrics/metrics.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,10 @@ var (
132132
)
133133

134134
// NTPOT - Normalized Time Per Output Token
135-
latencyPerOutputToken = compbasemetrics.NewHistogramVec(
135+
NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec(
136136
&compbasemetrics.HistogramOpts{
137137
Subsystem: InferenceModelComponent,
138-
Name: "ntpot_seconds",
138+
Name: "normalized_time_per_output_token_seconds",
139139
Help: "Inference model latency divided by number of output tokens in seconds for each model and target model.",
140140
// From few milliseconds per token to multiple seconds per token
141141
Buckets: []float64{
@@ -191,7 +191,7 @@ func Register() {
191191
legacyregistry.MustRegister(inputTokens)
192192
legacyregistry.MustRegister(outputTokens)
193193
legacyregistry.MustRegister(runningRequests)
194-
legacyregistry.MustRegister(latencyPerOutputToken)
194+
legacyregistry.MustRegister(NormalizedTimePerOutputToken)
195195

196196
legacyregistry.MustRegister(inferencePoolAvgKVCache)
197197
legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -247,8 +247,8 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
247247
}
248248
}
249249

250-
// RecordLatencyPerOutputToken (NTPOT) records the normalized time per output token.
251-
func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
250+
// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
251+
func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
252252
if !complete.After(received) {
253253
log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
254254
"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
@@ -264,7 +264,7 @@ func RecordLatencyPerOutputToken(ctx context.Context, modelName, targetModelName
264264
elapsedSeconds := complete.Sub(received).Seconds()
265265
secondsPerToken := elapsedSeconds / float64(outputTokenCount)
266266

267-
latencyPerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
267+
NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
268268
return true
269269
}
270270

‎pkg/epp/metrics/metrics_test.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ const (
3636
ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
3737
InputTokensMetric = InferenceModelComponent + "_input_tokens"
3838
OutputTokensMetric = InferenceModelComponent + "_output_tokens"
39-
LatencyPerOutputTokenMetric = InferenceModelComponent + "_ntpot_seconds"
39+
NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds"
4040
RunningRequestsMetric = InferenceModelComponent + "_running_requests"
4141
KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
4242
QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
@@ -253,7 +253,7 @@ func TestRecordRequestLatencies(t *testing.T) {
253253
}
254254
}
255255

256-
func TestRecordLatencyPerOutputToken(t *testing.T) {
256+
func TestRecordNormalizedTimePerOutputToken(t *testing.T) {
257257
ctx := logutil.NewTestLoggerIntoContext(context.Background())
258258
timeBaseline := time.Now()
259259
type tokenRequests struct {
@@ -332,13 +332,13 @@ func TestRecordLatencyPerOutputToken(t *testing.T) {
332332
for _, scenario := range scenarios {
333333
t.Run(scenario.name, func(t *testing.T) {
334334
for _, req := range scenario.reqs {
335-
success := RecordLatencyPerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
335+
success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
336336
if success == scenario.invalid {
337337
t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid)
338338
}
339339
}
340340

341-
wantLatencyPerToken, err := os.Open("testdata/ntpot_seconds_metric")
341+
wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric")
342342
defer func() {
343343
if err := wantLatencyPerToken.Close(); err != nil {
344344
t.Error(err)
@@ -347,7 +347,7 @@ func TestRecordLatencyPerOutputToken(t *testing.T) {
347347
if err != nil {
348348
t.Fatal(err)
349349
}
350-
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, LatencyPerOutputTokenMetric); err != nil {
350+
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil {
351351
t.Error(err)
352352
}
353353
})
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
2+
# TYPE inference_model_normalized_time_per_output_token_seconds histogram
3+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
4+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
5+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
6+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
7+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
8+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
9+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
10+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
11+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
12+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
13+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
14+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
15+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
16+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
17+
inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
18+
inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2
19+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
20+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
21+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
22+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
23+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
24+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
25+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
26+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
27+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
28+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
29+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
30+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
31+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
32+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
33+
inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
34+
inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1
35+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
36+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
37+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
38+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
39+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
40+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
41+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
42+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
43+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
44+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
45+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
46+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
47+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
48+
inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
49+
inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
50+
inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1

pkg/epp/metrics/testdata/ntpot_seconds_metric

-50
This file was deleted.

0 commit comments

Comments (0)
Please sign in to comment.