diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index a92f091c..8aca4b20 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -130,6 +130,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+ metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}
if reqCtx.modelServerStreaming {
logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)
diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
index 874dd734..37392167 100644
--- a/pkg/epp/handlers/streamingserver.go
+++ b/pkg/epp/handlers/streamingserver.go
@@ -184,6 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
reqCtx.ResponseCompleteTimestamp = time.Now()
metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+ metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}
reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
@@ -226,6 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+ metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
}
}
}
diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index 434b8381..b474df36 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -131,6 +131,21 @@ var (
[]string{"model_name"},
)
+ // NTPOT - Normalized Time Per Output Token
+ NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec(
+ &compbasemetrics.HistogramOpts{
+ Subsystem: InferenceModelComponent,
+ Name: "normalized_time_per_output_token_seconds",
+ Help: "Inference model latency divided by number of output tokens in seconds for each model and target model.",
+ // From a few milliseconds per token to multiple seconds per token
+ Buckets: []float64{
+ 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
+ },
+ StabilityLevel: compbasemetrics.ALPHA,
+ },
+ []string{"model_name", "target_model_name"},
+ )
+
// Inference Pool Metrics
inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
@@ -176,6 +191,7 @@ func Register() {
legacyregistry.MustRegister(inputTokens)
legacyregistry.MustRegister(outputTokens)
legacyregistry.MustRegister(runningRequests)
+ legacyregistry.MustRegister(NormalizedTimePerOutputToken)
legacyregistry.MustRegister(inferencePoolAvgKVCache)
legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -231,6 +247,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
}
}
+// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
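+// It returns false and records nothing when the timestamps or the output token count are invalid.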
+func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
+ if !complete.After(received) {
+ log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
+ "modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
+ return false
+ }
+
+ if outputTokenCount <= 0 {
+ log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation",
+ "modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
+ return false
+ }
+
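+ // NTPOT = end-to-end request latency / output token count, e.g. 1.6s over 80 tokens yields 0.02s per token.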
+ elapsedSeconds := complete.Sub(received).Seconds()
+ secondsPerToken := elapsedSeconds / float64(outputTokenCount)
+
+ NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
+ return true
+}
+
// IncRunningRequests increases the current running requests.
func IncRunningRequests(modelName string) {
if modelName != "" {
diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go
index dc4c7044..b5f19e6d 100644
--- a/pkg/epp/metrics/metrics_test.go
+++ b/pkg/epp/metrics/metrics_test.go
@@ -29,16 +29,17 @@ import (
)
const (
- RequestTotalMetric = InferenceModelComponent + "_request_total"
- RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
- RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
- RequestSizesMetric = InferenceModelComponent + "_request_sizes"
- ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
- InputTokensMetric = InferenceModelComponent + "_input_tokens"
- OutputTokensMetric = InferenceModelComponent + "_output_tokens"
- RunningRequestsMetric = InferenceModelComponent + "_running_requests"
- KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
- QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
+ RequestTotalMetric = InferenceModelComponent + "_request_total"
+ RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
+ RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
+ RequestSizesMetric = InferenceModelComponent + "_request_sizes"
+ ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
+ InputTokensMetric = InferenceModelComponent + "_input_tokens"
+ OutputTokensMetric = InferenceModelComponent + "_output_tokens"
+ NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds"
+ RunningRequestsMetric = InferenceModelComponent + "_running_requests"
+ KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization"
+ QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size"
)
func TestRecordRequestCounterandSizes(t *testing.T) {
@@ -252,6 +253,107 @@ func TestRecordRequestLatencies(t *testing.T) {
}
}
+func TestRecordNormalizedTimePerOutputToken(t *testing.T) {
+ ctx := logutil.NewTestLoggerIntoContext(context.Background())
+ timeBaseline := time.Now()
+ type tokenRequests struct {
+ modelName string
+ targetModelName string
+ receivedTime time.Time
+ completeTime time.Time
+ outputTokens int
+ }
+ scenarios := []struct {
+ name string
+ reqs []tokenRequests
+ invalid bool
+ }{
+ {
+ name: "multiple requests",
+ reqs: []tokenRequests{
+ {
+ modelName: "m10",
+ targetModelName: "t10",
+ receivedTime: timeBaseline,
+ completeTime: timeBaseline.Add(time.Millisecond * 1000),
+ outputTokens: 100, // 10ms per token
+ },
+ {
+ modelName: "m10",
+ targetModelName: "t10",
+ receivedTime: timeBaseline,
+ completeTime: timeBaseline.Add(time.Millisecond * 1600),
+ outputTokens: 80, // 20ms per token
+ },
+ {
+ modelName: "m10",
+ targetModelName: "t11",
+ receivedTime: timeBaseline,
+ completeTime: timeBaseline.Add(time.Millisecond * 6000),
+ outputTokens: 300, // 20ms per token
+ },
+ {
+ modelName: "m20",
+ targetModelName: "t20",
+ receivedTime: timeBaseline,
+ completeTime: timeBaseline.Add(time.Millisecond * 2400),
+ outputTokens: 400, // 6ms per token
+ },
+ },
+ },
+ {
+ name: "invalid elapsed time",
+ reqs: []tokenRequests{
+ {
+ modelName: "m10",
+ targetModelName: "t10",
+ receivedTime: timeBaseline.Add(time.Millisecond * 10),
+ completeTime: timeBaseline,
+ outputTokens: 100,
+ },
+ },
+ invalid: true,
+ },
+ {
+ name: "invalid token count",
+ reqs: []tokenRequests{
+ {
+ modelName: "m10",
+ targetModelName: "t10",
+ receivedTime: timeBaseline,
+ completeTime: timeBaseline.Add(time.Millisecond * 1000),
+ outputTokens: 0, // Invalid: zero tokens
+ },
+ },
+ invalid: true,
+ },
+ }
+ Register()
+ for _, scenario := range scenarios {
+ t.Run(scenario.name, func(t *testing.T) {
+ for _, req := range scenario.reqs {
+ success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
+ if success == scenario.invalid {
+ t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid)
+ }
+ }
+
+ wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer func() {
+ if err := wantLatencyPerToken.Close(); err != nil {
+ t.Error(err)
+ }
+ }()
+ if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil {
+ t.Error(err)
+ }
+ })
+ }
+}
+
func TestRecordResponseMetrics(t *testing.T) {
type responses struct {
modelName string
diff --git a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
new file mode 100644
index 00000000..bb6e9373
--- /dev/null
+++ b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
@@ -0,0 +1,50 @@
+# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
+# TYPE inference_model_normalized_time_per_output_token_seconds histogram
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1
diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
index a781f721..d16c7d47 100644
--- a/site-src/guides/metrics.md
+++ b/site-src/guides/metrics.md
@@ -26,6 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
| inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_normalized_time_per_output_token_seconds | Distribution | Distribution of normalized time per output token, i.e. response latency divided by the number of output tokens. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |