@@ -873,6 +873,114 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
873
873
},
874
874
},
875
875
},
876
+ {
877
+ name : "inferencemodel's modelName is not translated, passthrough" ,
878
+ requests : []* extProcPb.ProcessingRequest {
879
+ {
880
+ Request : & extProcPb.ProcessingRequest_RequestHeaders {
881
+ RequestHeaders : & extProcPb.HttpHeaders {
882
+ Headers : & configPb.HeaderMap {
883
+ Headers : []* configPb.HeaderValue {
884
+ {
885
+ Key : "hi" ,
886
+ Value : "mom" ,
887
+ },
888
+ },
889
+ },
890
+ },
891
+ },
892
+ },
893
+ {
894
+ Request : & extProcPb.ProcessingRequest_RequestBody {
895
+ RequestBody : & extProcPb.HttpBody {Body : []byte ("{\" max_tokens\" :100,\" model\" :\" direct-" ), EndOfStream : false },
896
+ },
897
+ },
898
+ {
899
+ Request : & extProcPb.ProcessingRequest_RequestBody {
900
+ RequestBody : & extProcPb.HttpBody {Body : []byte ("model\" ,\" prompt\" :\" test6\" ,\" temperature\" :0}" ), EndOfStream : true },
901
+ },
902
+ },
903
+ },
904
+
905
+ //
906
+ // pod 0 is expected to be picked, as all other pods are above threshold (NOTE(review): confirm the expected endpoint 192.168.1.2:8000 below is pod 0)
907
+ pods : map [backendmetrics.Pod ]* backendmetrics.Metrics {
908
+ fakePod (0 ): {
909
+ WaitingQueueSize : 4 ,
910
+ KVCacheUsagePercent : 0.2 ,
911
+ ActiveModels : map [string ]int {
912
+ "foo" : 1 ,
913
+ "bar" : 1 ,
914
+ "sql-lora-1fdg3" : 1 ,
915
+ },
916
+ },
917
+ fakePod (1 ): {
918
+ WaitingQueueSize : 0 ,
919
+ KVCacheUsagePercent : 0.85 ,
920
+ ActiveModels : map [string ]int {
921
+ "foo" : 1 ,
922
+ "sql-lora-1fdg3" : 1 ,
923
+ },
924
+ },
925
+ fakePod (2 ): {
926
+ WaitingQueueSize : 10 ,
927
+ KVCacheUsagePercent : 0.9 ,
928
+ ActiveModels : map [string ]int {
929
+ "foo" : 1 ,
930
+ "sql-lora-1fdg3" : 1 ,
931
+ },
932
+ },
933
+ },
934
+ wantMetrics : `
935
+ # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
936
+ # TYPE inference_model_request_total counter
937
+ inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1
938
+ ` ,
939
+ wantErr : false ,
940
+ wantResponses : []* extProcPb.ProcessingResponse {
941
+ {
942
+ Response : & extProcPb.ProcessingResponse_RequestHeaders {
943
+ RequestHeaders : & extProcPb.HeadersResponse {
944
+ Response : & extProcPb.CommonResponse {
945
+ ClearRouteCache : true ,
946
+ HeaderMutation : & extProcPb.HeaderMutation {
947
+ SetHeaders : []* configPb.HeaderValueOption {
948
+ {
949
+ Header : & configPb.HeaderValue {
950
+ Key : "x-gateway-destination-endpoint" ,
951
+ RawValue : []byte ("192.168.1.2:8000" ),
952
+ },
953
+ },
954
+ {
955
+ Header : & configPb.HeaderValue {
956
+ Key : "Content-Length" ,
957
+ RawValue : []byte (strconv .Itoa (74 )),
958
+ },
959
+ },
960
+ }},
961
+ },
962
+ },
963
+ },
964
+ DynamicMetadata : makeMetadata ("192.168.1.2:8000" ),
965
+ },
966
+ {
967
+ Response : & extProcPb.ProcessingResponse_RequestBody {
968
+ RequestBody : & extProcPb.BodyResponse {
969
+ Response : & extProcPb.CommonResponse {
970
+ BodyMutation : & extProcPb.BodyMutation {
971
+ Mutation : & extProcPb.BodyMutation_StreamedResponse {
972
+ StreamedResponse : & extProcPb.StreamedBodyResponse {
973
+ Body : []byte ("{\" max_tokens\" :100,\" model\" :\" direct-model\" ,\" prompt\" :\" test6\" ,\" temperature\" :0}" ),
974
+ EndOfStream : true ,
975
+ },
976
+ },
977
+ },
978
+ },
979
+ },
980
+ },
981
+ },
982
+ },
983
+ },
876
984
// Response flow tests
877
985
{
878
986
name : "responsebody sent over multiple requests, content-type is json, buffer" ,
0 commit comments