diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index a92f091c..8aca4b20 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -130,6 +130,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 			metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 			metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 			metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+			metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 		}
 		if reqCtx.modelServerStreaming {
 			logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)
diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
index 874dd734..37392167 100644
--- a/pkg/epp/handlers/streamingserver.go
+++ b/pkg/epp/handlers/streamingserver.go
@@ -184,6 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 				reqCtx.ResponseCompleteTimestamp = time.Now()
 				metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
 				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+				metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 			}
 
 			reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
@@ -226,6 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 					metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
 					metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
 					metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+					metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
 				}
 			}
 		}
diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go
index 434b8381..b474df36 100644
--- a/pkg/epp/metrics/metrics.go
+++ b/pkg/epp/metrics/metrics.go
@@ -131,6 +131,21 @@ var (
 		[]string{"model_name"},
 	)
 
+	// NTPOT - Normalized Time Per Output Token
+	NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "normalized_time_per_output_token_seconds",
+			Help:      "Inference model latency divided by number of output tokens in seconds for each model and target model.",
+			// From a few milliseconds per token to multiple seconds per token
+			Buckets: []float64{
+				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
+			},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
 	// Inference Pool Metrics
 	inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec(
 		&compbasemetrics.GaugeOpts{
@@ -176,6 +191,7 @@ func Register() {
 	legacyregistry.MustRegister(inputTokens)
 	legacyregistry.MustRegister(outputTokens)
 	legacyregistry.MustRegister(runningRequests)
+	legacyregistry.MustRegister(NormalizedTimePerOutputToken)
 
 	legacyregistry.MustRegister(inferencePoolAvgKVCache)
 	legacyregistry.MustRegister(inferencePoolAvgQueueSize)
@@ -231,6 +247,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
 	}
 }
 
+// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
+func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
+	if !complete.After(received) {
+		log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation",
+			"modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received)
+		return false
+	}
+
+	if outputTokenCount <= 0 {
+		log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation",
+			"modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount)
+		return false
+	}
+
+	elapsedSeconds := complete.Sub(received).Seconds()
+	secondsPerToken := elapsedSeconds / float64(outputTokenCount)
+
+	NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken)
+	return true
+}
+
 // IncRunningRequests increases the current running requests.
 func IncRunningRequests(modelName string) {
 	if modelName != "" {
diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go
index dc4c7044..b5f19e6d 100644
--- a/pkg/epp/metrics/metrics_test.go
+++ b/pkg/epp/metrics/metrics_test.go
@@ -29,16 +29,17 @@ import (
 )
 
 const (
-	RequestTotalMetric      = InferenceModelComponent + "_request_total"
-	RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total"
-	RequestLatenciesMetric  = InferenceModelComponent + "_request_duration_seconds"
-	RequestSizesMetric      = InferenceModelComponent + "_request_sizes"
-	ResponseSizesMetric     = InferenceModelComponent + "_response_sizes"
-	InputTokensMetric       = InferenceModelComponent + "_input_tokens"
-	OutputTokensMetric      = InferenceModelComponent + "_output_tokens"
-	RunningRequestsMetric   = InferenceModelComponent + "_running_requests"
-	KVCacheAvgUsageMetric   = InferencePoolComponent + "_average_kv_cache_utilization"
-	QueueAvgSizeMetric      = InferencePoolComponent + "_average_queue_size"
+	RequestTotalMetric                 = InferenceModelComponent + "_request_total"
+	RequestErrorTotalMetric            = InferenceModelComponent + "_request_error_total"
+	RequestLatenciesMetric             = InferenceModelComponent + "_request_duration_seconds"
+	RequestSizesMetric                 = InferenceModelComponent + "_request_sizes"
+	ResponseSizesMetric                = InferenceModelComponent + "_response_sizes"
+	InputTokensMetric                  = InferenceModelComponent + "_input_tokens"
+	OutputTokensMetric                 = InferenceModelComponent + "_output_tokens"
+	NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds"
+	RunningRequestsMetric              = InferenceModelComponent + "_running_requests"
+	KVCacheAvgUsageMetric              = InferencePoolComponent + "_average_kv_cache_utilization"
+	QueueAvgSizeMetric                 = InferencePoolComponent + "_average_queue_size"
 )
 
 func TestRecordRequestCounterandSizes(t *testing.T) {
@@ -252,6 +253,107 @@ func TestRecordRequestLatencies(t *testing.T) {
 	}
 }
 
+func TestRecordNormalizedTimePerOutputToken(t *testing.T) {
+	ctx := logutil.NewTestLoggerIntoContext(context.Background())
+	timeBaseline := time.Now()
+	type tokenRequests struct {
+		modelName       string
+		targetModelName string
+		receivedTime    time.Time
+		completeTime    time.Time
+		outputTokens    int
+	}
+	scenarios := []struct {
+		name    string
+		reqs    []tokenRequests
+		invalid bool
+	}{
+		{
+			name: "multiple requests",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1000),
+					outputTokens:    100, // 10ms per token
+				},
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1600),
+					outputTokens:    80, // 20ms per token
+				},
+				{
+					modelName:       "m10",
+					targetModelName: "t11",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 6000),
+					outputTokens:    300, // 20ms per token
+				},
+				{
+					modelName:       "m20",
+					targetModelName: "t20",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 2400),
+					outputTokens:    400, // 6ms per token
+				},
+			},
+		},
+		{
+			name: "invalid elapsed time",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline.Add(time.Millisecond * 10),
+					completeTime:    timeBaseline,
+					outputTokens:    100,
+				},
+			},
+			invalid: true,
+		},
+		{
+			name: "invalid token count",
+			reqs: []tokenRequests{
+				{
+					modelName:       "m10",
+					targetModelName: "t10",
+					receivedTime:    timeBaseline,
+					completeTime:    timeBaseline.Add(time.Millisecond * 1000),
+					outputTokens:    0, // Invalid: zero tokens
+				},
+			},
+			invalid: true,
+		},
+	}
+	Register()
+	for _, scenario := range scenarios {
+		t.Run(scenario.name, func(t *testing.T) {
+			for _, req := range scenario.reqs {
+				success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens)
+				if success == scenario.invalid {
+					t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid)
+				}
+			}
+
+			wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer func() {
+				if err := wantLatencyPerToken.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
+
 func TestRecordResponseMetrics(t *testing.T) {
 	type responses struct {
 		modelName       string
diff --git a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
new file mode 100644
index 00000000..bb6e9373
--- /dev/null
+++ b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric
@@ -0,0 +1,50 @@
+# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
+# TYPE inference_model_normalized_time_per_output_token_seconds histogram
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1
+inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006
+inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1
diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
index a781f721..d16c7d47 100644
--- a/site-src/guides/metrics.md
+++ b/site-src/guides/metrics.md
@@ -26,6 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
 | inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_normalized_time_per_output_token_seconds | Distribution | Distribution of normalized time per output token (NTPOT), i.e. response latency divided by output token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
 | inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
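
Note (illustrative, not part of the diff): the new histogram records nothing more than request latency divided by completion-token count. The sketch below is a minimal standalone Go program showing that arithmetic; `ntpotSeconds` is a hypothetical helper that mirrors the validation and division performed by the `RecordNormalizedTimePerOutputToken` function added above, not repository code.

```go
package main

import (
	"fmt"
	"time"
)

// ntpotSeconds is a hypothetical helper mirroring the calculation in
// RecordNormalizedTimePerOutputToken: total request latency divided by the
// number of output (completion) tokens. It rejects the same invalid inputs
// (non-positive elapsed time or token count) by returning false.
func ntpotSeconds(received, complete time.Time, outputTokens int) (float64, bool) {
	if !complete.After(received) || outputTokens <= 0 {
		return 0, false
	}
	return complete.Sub(received).Seconds() / float64(outputTokens), true
}

func main() {
	// Same numbers as the second request in the test: 1600ms for 80 tokens => 0.02 s/token.
	received := time.Now()
	complete := received.Add(1600 * time.Millisecond)
	if v, ok := ntpotSeconds(received, complete, 80); ok {
		fmt.Printf("NTPOT: %.3f seconds per output token\n", v) // prints 0.020
	}
}
```

Once the EPP metrics endpoint is scraped, the histogram can be consumed with standard Prometheus queries, e.g. a per-model p90 via `histogram_quantile(0.9, sum by (le, model_name, target_model_name) (rate(inference_model_normalized_time_per_output_token_seconds_bucket[5m])))`.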