8
8
"testing"
9
9
"time"
10
10
11
+ "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1"
11
12
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
12
13
13
14
configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
@@ -28,13 +29,25 @@ func TestHandleRequestBody(t *testing.T) {
28
29
name string
29
30
req * extProcPb.ProcessingRequest
30
31
pods []* backend.PodMetrics
32
+ models map [string ]* v1alpha1.Model
31
33
wantHeaders []* configPb.HeaderValueOption
32
34
wantBody []byte
33
35
wantErr bool
34
36
}{
35
37
{
36
38
name : "success" ,
37
39
req : GenerateRequest ("my-model" ),
40
+ models : map [string ]* v1alpha1.Model {
41
+ "my-model" : {
42
+ Name : "my-model" ,
43
+ TargetModels : []v1alpha1.TargetModel {
44
+ {
45
+ Name : "my-model-v1" ,
46
+ Weight : 100 ,
47
+ },
48
+ },
49
+ },
50
+ },
38
51
// pod-1 will be picked because it has relatively low queue size, with the requested
39
52
// model being active, and has low KV cache.
40
53
pods : []* backend.PodMetrics {
@@ -52,11 +65,11 @@ func TestHandleRequestBody(t *testing.T) {
52
65
{
53
66
Pod : FakePod (1 ),
54
67
Metrics : backend.Metrics {
55
- WaitingQueueSize : 3 ,
68
+ WaitingQueueSize : 0 ,
56
69
KVCacheUsagePercent : 0.1 ,
57
70
ActiveModels : map [string ]int {
58
- "foo" : 1 ,
59
- "my-model" : 1 ,
71
+ "foo" : 1 ,
72
+ "my-model-v1 " : 1 ,
60
73
},
61
74
},
62
75
},
@@ -81,17 +94,17 @@ func TestHandleRequestBody(t *testing.T) {
81
94
{
82
95
Header : & configPb.HeaderValue {
83
96
Key : "Content-Length" ,
84
- RawValue : []byte ("70 " ),
97
+ RawValue : []byte ("73 " ),
85
98
},
86
99
},
87
100
},
88
- wantBody : []byte ("{\" max_tokens\" :100,\" model\" :\" my-model\" ,\" prompt\" :\" hello\" ,\" temperature\" :0}" ),
101
+ wantBody : []byte ("{\" max_tokens\" :100,\" model\" :\" my-model-v1 \" ,\" prompt\" :\" hello\" ,\" temperature\" :0}" ),
89
102
},
90
103
}
91
104
92
105
for _ , test := range tests {
93
106
t .Run (test .name , func (t * testing.T ) {
94
- client , cleanup := setUpServer (t , test .pods )
107
+ client , cleanup := setUpServer (t , test .pods , test . models )
95
108
t .Cleanup (cleanup )
96
109
want := & extProcPb.ProcessingResponse {
97
110
Response : & extProcPb.ProcessingResponse_RequestBody {
@@ -123,8 +136,8 @@ func TestHandleRequestBody(t *testing.T) {
123
136
124
137
}
125
138
126
- func setUpServer (t * testing.T , pods []* backend.PodMetrics ) (client extProcPb.ExternalProcessor_ProcessClient , cleanup func ()) {
127
- server := StartExtProc (port , time .Second , time .Second , pods )
139
+ func setUpServer (t * testing.T , pods []* backend.PodMetrics , models map [ string ] * v1alpha1. Model ) (client extProcPb.ExternalProcessor_ProcessClient , cleanup func ()) {
140
+ server := StartExtProc (port , time .Second , time .Second , pods , models )
128
141
129
142
address := fmt .Sprintf ("localhost:%v" , port )
130
143
// Create a grpc connection
0 commit comments