
Commit 27a923d

[Metrics] Add input/output token and request size metrics

Data will only populate when running in buffered mode.

1 parent 5d000fa commit 27a923d
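A note on the commit message: "buffered mode" refers to Envoy's ext_proc body send mode. Token usage is only parseable when Envoy hands the extension the complete response body in one message. As a rough illustration only (this wiring is not part of this commit, and the project's actual configuration may differ), the relevant mode expressed with the stock go-control-plane types:

package example

import (
	extProcFilterPb "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/ext_proc/v3"
)

// bufferedResponseMode asks Envoy to deliver the complete response body to
// the extension in a single message, which is what lets HandleResponseBody
// (below) see the full usage block. Illustrative sketch, not project code.
func bufferedResponseMode() *extProcFilterPb.ProcessingMode {
	return &extProcFilterPb.ProcessingMode{
		ResponseBodyMode: extProcFilterPb.ProcessingMode_BUFFERED,
	}
}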

File tree

6 files changed: +329 -0 lines changed


Diff for: pkg/ext-proc/handlers/response.go (+6)

@@ -6,6 +6,7 @@ import (
 	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics"
 	klog "k8s.io/klog/v2"
 )

@@ -88,6 +89,11 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce
 			},
 		},
 	}
+
+	metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, len(body.ResponseBody.Body))
+	metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, res.Usage.PromptTokens)
+	metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, res.Usage.CompletionTokens)
+
 	return resp, nil
 }
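For context, res.Usage.PromptTokens and res.Usage.CompletionTokens are read from the usage block of the model server's OpenAI-style completion response, which is only complete once the whole body has been buffered. A minimal decoding sketch; the Response and Usage types here are illustrative assumptions, not necessarily the structs response.go uses:

package handlers

import "encoding/json"

// Usage mirrors the OpenAI-style usage block; the JSON field names follow
// the standard completion API. This struct is a hypothetical stand-in for
// whatever response.go actually decodes into.
type Usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

type Response struct {
	Usage Usage `json:"usage"`
}

// parseUsage extracts token counts from a fully buffered response body;
// with a streamed body the usage block would be absent or partial, which
// is why these metrics only populate in buffered mode.
func parseUsage(body []byte) (Usage, error) {
	var res Response
	err := json.Unmarshal(body, &res)
	return res.Usage, err
}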

Diff for: pkg/ext-proc/metrics/metrics.go (+55)

@@ -51,6 +51,43 @@ var (
 		},
 		[]string{"model_name", "target_model_name"},
 	)
+
+	responseSizes = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "response_sizes",
+			Help:      "Inference model responses size distribution in bytes for each model and target model.",
+			// Most responses are under 8192 tokens, and a token averages about 4 characters:
+			// 8192 * 4 = 32768.
+			Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
+	inputTokens = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "input_tokens",
+			Help:      "Inference model input token count for requests in each model.",
+			// Most models have an input context window of less than 1 million tokens.
+			Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
+	outputTokens = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "output_tokens",
+			Help:      "Inference model output token count for requests in each model.",
+			// Most models generate fewer than 8192 output tokens.
+			Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
 )

 var registerMetrics sync.Once

@@ -61,6 +98,9 @@ func Register() {
 		legacyregistry.MustRegister(requestCounter)
 		legacyregistry.MustRegister(requestLatencies)
 		legacyregistry.MustRegister(requestSizes)
+		legacyregistry.MustRegister(responseSizes)
+		legacyregistry.MustRegister(inputTokens)
+		legacyregistry.MustRegister(outputTokens)
 	})
 }

@@ -84,3 +124,18 @@ func RecordRequestLatencies(modelName, targetModelName string, received time.Tim
 	requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds)
 	return true
 }
+
+// RecordResponseSizes records a response size in bytes.
+func RecordResponseSizes(modelName, targetModelName string, size int) {
+	responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
+
+// RecordInputTokens records the input (prompt) token count of a request.
+func RecordInputTokens(modelName, targetModelName string, size int) {
+	inputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
+
+// RecordOutputTokens records the output (completion) token count of a request.
+func RecordOutputTokens(modelName, targetModelName string, size int) {
+	outputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
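Register is guarded by sync.Once, so repeated calls are safe, and all histograms land in component-base's legacy registry. Here is a self-contained usage sketch of the new recorders; the /metrics wiring below is illustrative only, not how the ext-proc binary actually exposes its endpoint:

package main

import (
	"log"
	"net/http"

	"k8s.io/component-base/metrics/legacyregistry"

	"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics"
)

func main() {
	metrics.Register() // idempotent: safe to call more than once

	// Record one synthetic observation per new histogram.
	metrics.RecordInputTokens("my-model", "my-model-v1", 42)
	metrics.RecordOutputTokens("my-model", "my-model-v1", 512)
	metrics.RecordResponseSizes("my-model", "my-model-v1", 2048)

	// Serve the legacy registry so the histograms above are scrapeable.
	http.Handle("/metrics", legacyregistry.Handler())
	log.Fatal(http.ListenAndServe(":9090", nil))
}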

Diff for: pkg/ext-proc/metrics/metrics_test.go (+97)

@@ -12,6 +12,9 @@ import (
 const RequestTotalMetric = InferenceModelComponent + "_request_total"
 const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
 const RequestSizesMetric = InferenceModelComponent + "_request_sizes"
+const ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
+const InputTokensMetric = InferenceModelComponent + "_input_tokens"
+const OutputTokensMetric = InferenceModelComponent + "_output_tokens"

 func TestRecordRequestCounterandSizes(t *testing.T) {
 	type requests struct {

@@ -160,3 +163,97 @@ func TestRecordRequestLatencies(t *testing.T) {
 		})
 	}
 }
+
+func TestResponse(t *testing.T) {
+	type responses struct {
+		modelName       string
+		targetModelName string
+		inputToken      int
+		outputToken     int
+		respSize        int
+	}
+	scenarios := []struct {
+		name string
+		resp []responses
+	}{{
+		name: "multiple requests",
+		resp: []responses{
+			{
+				modelName:       "m10",
+				targetModelName: "t10",
+				respSize:        1200,
+				inputToken:      10,
+				outputToken:     100,
+			},
+			{
+				modelName:       "m10",
+				targetModelName: "t10",
+				respSize:        500,
+				inputToken:      20,
+				outputToken:     200,
+			},
+			{
+				modelName:       "m10",
+				targetModelName: "t11",
+				respSize:        2480,
+				inputToken:      30,
+				outputToken:     300,
+			},
+			{
+				modelName:       "m20",
+				targetModelName: "t20",
+				respSize:        80,
+				inputToken:      40,
+				outputToken:     400,
+			},
+		},
+	}}
+	Register()
+	for _, scenario := range scenarios {
+		t.Run(scenario.name, func(t *testing.T) {
+			for _, resp := range scenario.resp {
+				RecordInputTokens(resp.modelName, resp.targetModelName, resp.inputToken)
+				RecordOutputTokens(resp.modelName, resp.targetModelName, resp.outputToken)
+				RecordResponseSizes(resp.modelName, resp.targetModelName, resp.respSize)
+			}
+			// Check the error from os.Open before deferring Close, so a
+			// missing golden file cannot trigger a Close on a nil file.
+			wantResponseSize, err := os.Open("testdata/response_sizes_metric")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer func() {
+				if err := wantResponseSize.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantResponseSize, ResponseSizesMetric); err != nil {
+				t.Error(err)
+			}
+
+			wantInputToken, err := os.Open("testdata/input_tokens_metric")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer func() {
+				if err := wantInputToken.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantInputToken, InputTokensMetric); err != nil {
+				t.Error(err)
+			}

+			wantOutputToken, err := os.Open("testdata/output_tokens_metric")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer func() {
+				if err := wantOutputToken.Close(); err != nil {
+					t.Error(err)
+				}
+			}()
+			if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantOutputToken, OutputTokensMetric); err != nil {
+				t.Error(err)
+			}
+		})
+	}
+}
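The three open/compare/close stanzas in TestResponse differ only in the golden file and the metric name. If more metrics are added, a helper along these lines would collapse each stanza to a one-liner (a sketch, not part of this commit):

func gatherAndCompareFile(t *testing.T, goldenPath, metricName string) {
	t.Helper()
	want, err := os.Open(goldenPath)
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		if err := want.Close(); err != nil {
			t.Error(err)
		}
	}()
	if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, want, metricName); err != nil {
		t.Error(err)
	}
}

Each stanza then becomes, e.g., gatherAndCompareFile(t, "testdata/input_tokens_metric", InputTokensMetric).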

Diff for: pkg/ext-proc/metrics/testdata/input_tokens_metric (+68, new file)

@@ -0,0 +1,68 @@
# HELP inference_model_input_tokens [ALPHA] Inference model input token count for requests in each model.
# TYPE inference_model_input_tokens histogram
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16384"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32768"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="65536"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="131072"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="262144"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="524288"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2
inference_model_input_tokens_sum{model_name="m10",target_model_name="t10"} 30
inference_model_input_tokens_count{model_name="m10",target_model_name="t10"} 2
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16384"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32768"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="65536"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="131072"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="262144"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="524288"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1
inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1
inference_model_input_tokens_sum{model_name="m10",target_model_name="t11"} 30
inference_model_input_tokens_count{model_name="m10",target_model_name="t11"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16384"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32768"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="65536"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="131072"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="262144"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="524288"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1
inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1
inference_model_input_tokens_sum{model_name="m20",target_model_name="t20"} 40
inference_model_input_tokens_count{model_name="m20",target_model_name="t20"} 1

Diff for: pkg/ext-proc/metrics/testdata/output_tokens_metric (+47, new file)

@@ -0,0 +1,47 @@
# HELP inference_model_output_tokens [ALPHA] Inference model output token count for requests in each model.
# TYPE inference_model_output_tokens histogram
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 1
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2
inference_model_output_tokens_sum{model_name="m10",target_model_name="t10"} 300
inference_model_output_tokens_count{model_name="m10",target_model_name="t10"} 2
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 0
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1
inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1
inference_model_output_tokens_sum{model_name="m10",target_model_name="t11"} 300
inference_model_output_tokens_count{model_name="m10",target_model_name="t11"} 1
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 0
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1
inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1
inference_model_output_tokens_sum{model_name="m20",target_model_name="t20"} 400
inference_model_output_tokens_count{model_name="m20",target_model_name="t20"} 1
