@@ -403,7 +403,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 		requests          []*extProcPb.ProcessingRequest
 		pods              map[backendmetrics.Pod]*backendmetrics.Metrics
 		wantResponses     []*extProcPb.ProcessingResponse
-		wantMetrics       string
+		wantMetrics       map[string]string
 		wantErr           bool
 		immediateResponse *extProcPb.ImmediateResponse
 	}{
@@ -426,11 +426,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					KVCacheUsagePercent: 0.2,
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 			# TYPE inference_model_request_total counter
 			inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -507,11 +507,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 			# TYPE inference_model_request_total counter
 			inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -588,11 +588,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 			# TYPE inference_model_request_total counter
 			inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -671,7 +671,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 			wantErr: false,
-			wantMetrics: "",
+			wantMetrics: map[string]string{},
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
 					Response: &extProcPb.ProcessingResponse_ImmediateResponse{
@@ -715,11 +715,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 			# TYPE inference_model_request_total counter
 			inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -823,11 +823,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 			# TYPE inference_model_request_total counter
 			inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -931,11 +931,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 			# TYPE inference_model_request_total counter
 			inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -1233,19 +1233,47 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`),
+							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+data: [DONE]`,
+							),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte("data: [DONE]"),
+							Body:        []byte(""),
 							EndOfStream: true},
 					},
 				},
 			},
 			wantErr: false,
+			wantMetrics: map[string]string{`inference_model_input_tokens`: `
+			# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model.
+			# TYPE inference_model_input_tokens histogram
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1"} 0
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="64"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="128"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="256"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="512"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1024"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="2048"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="4096"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8192"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16384"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32778"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="65536"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="131072"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="262144"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="524288"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1.048576e+06"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1
+			inference_model_input_tokens_sum{model_name="",target_model_name=""} 7
+			inference_model_input_tokens_count{model_name="",target_model_name=""} 1
+			`},
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
 					Response: &extProcPb.ProcessingResponse_ResponseHeaders{
@@ -1352,7 +1380,9 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`),
+											Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+data: [DONE]`,
+											),
 											EndOfStream: false,
 										},
 									},
@@ -1368,7 +1398,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte("data: [DONE]"),
+											Body:        []byte(""),
 											EndOfStream: true,
 										},
 									},
@@ -1394,9 +1424,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			t.Errorf("Unexpected response, (-want +got): %v", diff)
 		}
 
-		if test.wantMetrics != "" {
-			if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil {
-				t.Error(err)
+		if len(test.wantMetrics) != 0 {
+			for metricName, value := range test.wantMetrics {
+				if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(value), metricName); err != nil {
+					t.Error(err)
+				}
 			}
 		}
 
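The heart of this change: `wantMetrics` moves from a single expected-exposition string, hard-wired to `inference_model_request_total`, to a `map[string]string` keyed by metric family name, so one test case can assert on several families at once (here, the new `inference_model_input_tokens` histogram). Below is a minimal sketch of that comparison pattern as a standalone helper, assuming the same `k8s.io/component-base` packages the test already imports (`metricsutils` is assumed to alias `metrics/testutil`; `compareMetrics` is a hypothetical name, not part of this PR):

```go
package example

import (
	"strings"
	"testing"

	"k8s.io/component-base/metrics/legacyregistry"
	metricsutils "k8s.io/component-base/metrics/testutil"
)

// compareMetrics diffs each expected metric family independently against the
// process-global legacy registry. The map key selects which family
// GatherAndCompare filters on; the value is its expected Prometheus text
// exposition (HELP, TYPE, and sample lines).
func compareMetrics(t *testing.T, wantMetrics map[string]string) {
	t.Helper()
	for metricName, value := range wantMetrics {
		if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(value), metricName); err != nil {
			t.Error(err)
		}
	}
}
```

Because each family is compared independently, a case that only checks the request counter changes only by the map wrapping, while the streaming case can assert on the histogram alone.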
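The streaming fixtures also change shape: the `data: [DONE]` sentinel now arrives in the same chunk as the final usage message, and the `EndOfStream: true` chunk carries an empty body. Chunk-wise SSE handling therefore has to cope with multiple `data:` events in one chunk. A small illustrative parser (an assumption for exposition, not the project's actual response handling):

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// usage mirrors the usage object embedded in the test payloads above.
type usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	TotalTokens      int `json:"total_tokens"`
	CompletionTokens int `json:"completion_tokens"`
}

// parseChunk scans one streamed body chunk for SSE "data:" events. A single
// chunk may carry both the final usage event and the [DONE] sentinel, as in
// the updated fixtures.
func parseChunk(chunk string) (u *usage, done bool, err error) {
	for _, line := range strings.Split(chunk, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "data: ") {
			continue
		}
		payload := strings.TrimPrefix(line, "data: ")
		if payload == "[DONE]" {
			done = true
			continue
		}
		var event struct {
			Usage *usage `json:"usage"`
		}
		if jsonErr := json.Unmarshal([]byte(payload), &event); jsonErr != nil {
			return u, done, jsonErr
		}
		if event.Usage != nil {
			u = event.Usage
		}
	}
	return u, done, nil
}

func main() {
	chunk := "data: {\"usage\":{\"prompt_tokens\":7,\"total_tokens\":17,\"completion_tokens\":10}}\ndata: [DONE]"
	u, done, _ := parseChunk(chunk)
	fmt.Println(u.PromptTokens, done) // prints: 7 true
}
```

This is presumably also why the expected `inference_model_input_tokens_sum` is 7: it matches `prompt_tokens` in the usage event of the final chunk.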