
Commit 55c5ea3

[Metrics] Add input/output token and response size metrics
Data will only populate if buffered mode is used.
1 parent 5d000fa commit 55c5ea3

8 files changed (+364 −3 lines)

Diff for: pkg/ext-proc/handlers/response.go (+1)
@@ -73,6 +73,7 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce
         return nil, fmt.Errorf("unmarshaling response body: %v", err)
     }
     reqCtx.Response = res
+    reqCtx.ResponseSize = len(body.ResponseBody.Body)
     // ResponseComplete is to indicate the response is complete. In non-streaming
     // case, it will be set to be true once the response is processed; in
     // streaming case, it will be set to be true once the last chunk is processed.

Diff for: pkg/ext-proc/handlers/server.go (+4)
@@ -95,6 +95,9 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
        if err == nil && reqCtx.ResponseComplete {
            reqCtx.ResponseCompleteTimestamp = time.Now()
            metrics.RecordRequestLatencies(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+           metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+           metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens)
+           metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens)
        }
        klog.V(3).Infof("Request context after HandleResponseBody: %+v", reqCtx)
    default:
@@ -138,5 +141,6 @@ type RequestContext struct {
    ResponseCompleteTimestamp time.Time
    RequestSize               int
    Response                  Response
+   ResponseSize              int
    ResponseComplete          bool
 }
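
The new recording calls read token counts from `reqCtx.Response.Usage`, which is populated when the response body is unmarshaled in `HandleResponseBody` (see the response.go diff above). The `Response` type itself is not part of this diff; below is a minimal sketch of what it plausibly looks like. Only the names used above (`Usage`, `PromptTokens`, `CompletionTokens`) come from the diff; the JSON tags are assumptions based on an OpenAI-compatible completion response.

```go
// Hypothetical sketch, not part of this commit: the shape of the parsed
// response stored on RequestContext. The JSON tags assume an
// OpenAI-compatible "usage" block and are illustrative only.
type Usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

type Response struct {
	Usage Usage `json:"usage"`
}
```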

Diff for: pkg/ext-proc/metrics/README.md (+32 −3)
@@ -6,13 +6,42 @@ This documentation is the current state of exposed metrics.
 * [Exposed Metrics](#exposed-metrics)
 * [Scrape Metrics](#scrape-metrics)

+## Requirements
+
+> There is ongoing work to support both streaming and buffered modes: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178. Until then, each chunk of a streamed response is processed as a single token, resulting in many small increments to the response metrics.
+
+In order to process response metrics, you will need to enable streaming, or enable `Buffered` mode for the response in the `EnvoyExtensionPolicy`:
+
+```
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+        - group: ""
+          kind: Service
+          name: inference-gateway-ext-proc
+          port: 9002
+      processingMode:
+        request:
+          body: Buffered
+        response:
+          body: Buffered
+```
+
 ## Exposed metrics

 | Metric name | Metric Type | Description | Labels | Status |
 | ------------|--------------| ----------- | ------ | ------ |
-| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> ` | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> ` | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> ` | ALPHA |
+| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |

 ## Scrape Metrics

Diff for: pkg/ext-proc/metrics/metrics.go (+59)
@@ -51,6 +51,43 @@ var (
        },
        []string{"model_name", "target_model_name"},
    )
+
+   responseSizes = compbasemetrics.NewHistogramVec(
+       &compbasemetrics.HistogramOpts{
+           Subsystem: InferenceModelComponent,
+           Name:      "response_sizes",
+           Help:      "Inference model responses size distribution in bytes for each model and target model.",
+           // Most models have a response of fewer than 8192 tokens. Each token has, on average, 4 characters.
+           // 8192 * 4 = 32768.
+           Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536},
+           StabilityLevel: compbasemetrics.ALPHA,
+       },
+       []string{"model_name", "target_model_name"},
+   )
+
+   inputTokens = compbasemetrics.NewHistogramVec(
+       &compbasemetrics.HistogramOpts{
+           Subsystem: InferenceModelComponent,
+           Name:      "input_tokens",
+           Help:      "Inference model input token count distribution for requests in each model.",
+           // Most models have an input context window of less than 1 million tokens.
+           Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576},
+           StabilityLevel: compbasemetrics.ALPHA,
+       },
+       []string{"model_name", "target_model_name"},
+   )
+
+   outputTokens = compbasemetrics.NewHistogramVec(
+       &compbasemetrics.HistogramOpts{
+           Subsystem: InferenceModelComponent,
+           Name:      "output_tokens",
+           Help:      "Inference model output token count distribution for requests in each model.",
+           // Most models generate fewer than 8192 output tokens.
+           Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192},
+           StabilityLevel: compbasemetrics.ALPHA,
+       },
+       []string{"model_name", "target_model_name"},
+   )
 )

 var registerMetrics sync.Once
@@ -61,6 +98,9 @@ func Register() {
        legacyregistry.MustRegister(requestCounter)
        legacyregistry.MustRegister(requestLatencies)
        legacyregistry.MustRegister(requestSizes)
+       legacyregistry.MustRegister(responseSizes)
+       legacyregistry.MustRegister(inputTokens)
+       legacyregistry.MustRegister(outputTokens)
    })
 }

@@ -84,3 +124,22 @@ func RecordRequestLatencies(modelName, targetModelName string, received time.Tim
    requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds)
    return true
 }
+
+// RecordResponseSizes records the response sizes.
+func RecordResponseSizes(modelName, targetModelName string, size int) {
+   responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
+
+// RecordInputTokens records the input token count.
+func RecordInputTokens(modelName, targetModelName string, size int) {
+   if size > 0 {
+       inputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+   }
+}
+
+// RecordOutputTokens records the output token count.
+func RecordOutputTokens(modelName, targetModelName string, size int) {
+   if size > 0 {
+       outputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+   }
+}
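
As a quick usage sketch (not part of this commit): `Register()` is called once at startup and the recorders are called once per completed response, mirroring the server.go hook above. The module import path and the literal values below are illustrative assumptions.

```go
package main

// Illustrative only: the module path is an assumption based on the repository
// name; the values mirror those used in the test below.
import "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics"

func main() {
	metrics.Register() // safe to call repeatedly; guarded by sync.Once

	// On response completion for request model "m10" resolved to target "t10":
	metrics.RecordResponseSizes("m10", "t10", 1200) // response body size in bytes
	metrics.RecordInputTokens("m10", "t10", 10)     // usage prompt tokens
	metrics.RecordOutputTokens("m10", "t10", 100)   // usage completion tokens
}
```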

Diff for: pkg/ext-proc/metrics/metrics_test.go (+97)
@@ -12,6 +12,9 @@ import (
 const RequestTotalMetric = InferenceModelComponent + "_request_total"
 const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
 const RequestSizesMetric = InferenceModelComponent + "_request_sizes"
+const ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
+const InputTokensMetric = InferenceModelComponent + "_input_tokens"
+const OutputTokensMetric = InferenceModelComponent + "_output_tokens"

 func TestRecordRequestCounterandSizes(t *testing.T) {
    type requests struct {
@@ -160,3 +163,97 @@ func TestRecordRequestLatencies(t *testing.T) {
        })
    }
 }
+
+func TestRecordResponseMetrics(t *testing.T) {
+   type responses struct {
+       modelName       string
+       targetModelName string
+       inputToken      int
+       outputToken     int
+       respSize        int
+   }
+   scenarios := []struct {
+       name string
+       resp []responses
+   }{{
+       name: "multiple requests",
+       resp: []responses{
+           {
+               modelName:       "m10",
+               targetModelName: "t10",
+               respSize:        1200,
+               inputToken:      10,
+               outputToken:     100,
+           },
+           {
+               modelName:       "m10",
+               targetModelName: "t10",
+               respSize:        500,
+               inputToken:      20,
+               outputToken:     200,
+           },
+           {
+               modelName:       "m10",
+               targetModelName: "t11",
+               respSize:        2480,
+               inputToken:      30,
+               outputToken:     300,
+           },
+           {
+               modelName:       "m20",
+               targetModelName: "t20",
+               respSize:        80,
+               inputToken:      40,
+               outputToken:     400,
+           },
+       },
+   }}
+   Register()
+   for _, scenario := range scenarios {
+       t.Run(scenario.name, func(t *testing.T) {
+           for _, resp := range scenario.resp {
+               RecordInputTokens(resp.modelName, resp.targetModelName, resp.inputToken)
+               RecordOutputTokens(resp.modelName, resp.targetModelName, resp.outputToken)
+               RecordResponseSizes(resp.modelName, resp.targetModelName, resp.respSize)
+           }
+           wantResponseSize, err := os.Open("testdata/response_sizes_metric")
+           defer func() {
+               if err := wantResponseSize.Close(); err != nil {
+                   t.Error(err)
+               }
+           }()
+           if err != nil {
+               t.Fatal(err)
+           }
+           if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantResponseSize, ResponseSizesMetric); err != nil {
+               t.Error(err)
+           }
+
+           wantInputToken, err := os.Open("testdata/input_tokens_metric")
+           defer func() {
+               if err := wantInputToken.Close(); err != nil {
+                   t.Error(err)
+               }
+           }()
+           if err != nil {
+               t.Fatal(err)
+           }
+           if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantInputToken, InputTokensMetric); err != nil {
+               t.Error(err)
+           }
+
+           wantOutputToken, err := os.Open("testdata/output_tokens_metric")
+           defer func() {
+               if err := wantOutputToken.Close(); err != nil {
+                   t.Error(err)
+               }
+           }()
+           if err != nil {
+               t.Fatal(err)
+           }
+           if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantOutputToken, OutputTokensMetric); err != nil {
+               t.Error(err)
+           }
+       })
+   }
+}
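
The golden file below is in the Prometheus text exposition format, where histogram buckets are cumulative: every bucket whose `le` bound is greater than or equal to an observed value is incremented. The short, self-contained sketch below (the helper function is hypothetical, not part of the commit; the bounds are copied from the `inputTokens` histogram above) reproduces the m10/t10 counts from the two input-token observations of 10 and 20 recorded by the test.

```go
package main

import "fmt"

// cumulativeBucketCounts is a hypothetical helper (not part of the commit)
// illustrating how a Prometheus histogram counts observations: every bucket
// whose upper bound (le) is >= the observed value is incremented.
func cumulativeBucketCounts(bounds []float64, observations []float64) []int {
	counts := make([]int, len(bounds))
	for _, v := range observations {
		for i, le := range bounds {
			if v <= le {
				counts[i]++
			}
		}
	}
	return counts
}

func main() {
	// Bucket bounds copied from the inputTokens histogram defined above.
	bounds := []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
		16384, 32778, 65536, 131072, 262144, 524288, 1048576}
	// The two m10/t10 input-token observations recorded by the test: 10 and 20.
	for i, c := range cumulativeBucketCounts(bounds, []float64{10, 20}) {
		fmt.Printf("le=%v -> %d\n", bounds[i], c)
	}
	// Matches the golden file: le=8 -> 0, le=16 -> 1, le=32 -> 2, and 2 for
	// every larger bound, with _sum 30 and _count 2.
}
```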

Diff for: pkg/ext-proc/metrics/testdata/input_tokens_metric (+68, new file)
@@ -0,0 +1,68 @@
+# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model.
+# TYPE inference_model_input_tokens histogram
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16384"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32778"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="65536"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="131072"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="262144"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="524288"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2
+inference_model_input_tokens_sum{model_name="m10",target_model_name="t10"} 30
+inference_model_input_tokens_count{model_name="m10",target_model_name="t10"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16384"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32778"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="65536"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="131072"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="262144"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="524288"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1
+inference_model_input_tokens_sum{model_name="m10",target_model_name="t11"} 30
+inference_model_input_tokens_count{model_name="m10",target_model_name="t11"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16384"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32778"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="65536"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="131072"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="262144"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="524288"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1
+inference_model_input_tokens_sum{model_name="m20",target_model_name="t20"} 40
+inference_model_input_tokens_count{model_name="m20",target_model_name="t20"} 1
