@@ -61,7 +61,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 	}{
 		{
 			name: "select lower queue and kv cache, no active lora",
-			req:  extprocutils.GenerateRequest("sql-lora"),
+			req:  extprocutils.GenerateRequest("my-model"),
 			// pod-1 will be picked because it has relatively low queue size and low KV cache.
 			pods: []*backend.PodMetrics{
 				{
@@ -109,7 +109,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 		},
-		wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"),
+		wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"hello\",\"temperature\":0}"),
		wantErr: false,
 	},
 	{
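For context, the body these cases exchange is an OpenAI-style completion payload; `wantBody` shows it after the ext-proc has rewritten the client-facing model name (`my-model`) to the resolved target model (`my-model-12345`). Below is a minimal sketch of how a helper like `extprocutils.GenerateRequest` could build that JSON — a hypothetical stand-in, not the helper's real implementation (which wraps the body in an Envoy ext_proc request); the field values are taken from `wantBody`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// generateRequestBody is a hypothetical stand-in for the JSON-building part of
// extprocutils.GenerateRequest. json.Marshal emits map keys in sorted order,
// which matches the key order asserted in wantBody.
func generateRequestBody(model string) []byte {
	body, err := json.Marshal(map[string]interface{}{
		"max_tokens":  100,
		"model":       model, // rewritten to the target model by the ext-proc
		"prompt":      "hello",
		"temperature": 0,
	})
	if err != nil {
		panic(err) // cannot happen for this fixed payload
	}
	return body
}

func main() {
	// Prints: {"max_tokens":100,"model":"my-model-12345","prompt":"hello","temperature":0}
	fmt.Println(string(generateRequestBody("my-model-12345")))
}
```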
@@ -180,7 +180,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 		{
 			name: "select no lora despite active model, avoid excessive queue size",
 			req:  extprocutils.GenerateRequest("sql-lora"),
-			// pod-2 will be picked despite it having the requested model being active
+			// pod-2 will be picked despite it NOT having the requested model being active
 			// as it's above the affinity for queue size. Also is critical, so we should
 			// still honor request despite all queues > 5
 			pods: []*backend.PodMetrics{
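The comments above carry the scheduling reasoning this case asserts: LoRA affinity only counts among pods under the queue-size threshold, and critical models are still served even when every queue is over it. An illustrative sketch of that priority order follows — the names, types, and thresholds are hypothetical, not the project's actual scheduler API:

```go
package scheduling

// podMetrics is a minimal stand-in for the test's backend.Metrics fields.
type podMetrics struct {
	WaitingQueueSize    int
	KVCacheUsagePercent float64
	ActiveModels        map[string]int
}

// pickPod encodes the priority the test comments describe: pods over the
// queue-size threshold lose LoRA affinity; among the rest, prefer a pod with
// the requested model active, then the lowest KV-cache usage. Critical
// requests fall back to the shortest queue rather than being shed.
func pickPod(pods []podMetrics, model string, queueThreshold int, critical bool) *podMetrics {
	var best *podMetrics
	for i := range pods {
		p := &pods[i]
		if p.WaitingQueueSize > queueThreshold {
			continue // excessive queue size overrides LoRA affinity
		}
		if best == nil || betterThan(p, best, model) {
			best = p
		}
	}
	if best == nil && critical {
		// All queues exceed the threshold, but critical requests are still
		// honored: pick the shortest queue instead of shedding.
		for i := range pods {
			if best == nil || pods[i].WaitingQueueSize < best.WaitingQueueSize {
				best = &pods[i]
			}
		}
	}
	return best
}

func betterThan(p, best *podMetrics, model string) bool {
	_, pHas := p.ActiveModels[model]
	_, bHas := best.ActiveModels[model]
	if pHas != bHas {
		return pHas // LoRA affinity wins among pods under the queue threshold
	}
	return p.KVCacheUsagePercent < best.KVCacheUsagePercent
}
```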
@@ -294,6 +294,72 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "noncritical, but one model has capacity, do not shed",
+			req:  extprocutils.GenerateRequest("sql-lora-sheddable"),
+			// pod 0 will be picked as all other pods are above threshold
+			pods: []*backend.PodMetrics{
+				{
+					Pod: extprocutils.FakePod(0),
+					Metrics: backend.Metrics{
+						WaitingQueueSize:    4,
+						KVCacheUsagePercent: 0.2,
+						ActiveModels: map[string]int{
+							"foo":            1,
+							"bar":            1,
+							"sql-lora-1fdg3": 1,
+						},
+					},
+				},
+				{
+					Pod: extprocutils.FakePod(1),
+					Metrics: backend.Metrics{
+						WaitingQueueSize:    0,
+						KVCacheUsagePercent: 0.85,
+						ActiveModels: map[string]int{
+							"foo":            1,
+							"sql-lora-1fdg3": 1,
+						},
+					},
+				},
+				{
+					Pod: extprocutils.FakePod(2),
+					Metrics: backend.Metrics{
+						WaitingQueueSize:    10,
+						KVCacheUsagePercent: 0.9,
+						ActiveModels: map[string]int{
+							"foo":            1,
+							"sql-lora-1fdg3": 1,
+						},
+					},
+				},
+			},
+			wantHeaders: []*configPb.HeaderValueOption{
+				{
+					Header: &configPb.HeaderValue{
+						Key:      runserver.DefaultTargetEndpointKey,
+						RawValue: []byte("address-0"),
+					},
+				},
+				{
+					Header: &configPb.HeaderValue{
+						Key:      "Content-Length",
+						RawValue: []byte("76"),
+					},
+				},
+			},
+			wantMetadata: &structpb.Struct{
+				Fields: map[string]*structpb.Value{
+					runserver.DefaultTargetEndpointKey: {
+						Kind: &structpb.Value_StringValue{
+							StringValue: "address-0",
+						},
+					},
+				},
+			},
+			wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"hello\",\"temperature\":0}"),
+			wantErr:  false,
+		},
 	}
 
 	// Set up global k8sclient and extproc server runner with test environment config
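The new case exercises the shed decision for a non-critical ("sheddable") model: the request survives because pod 0 still has capacity. A hedged sketch of that gate, reusing the `podMetrics` stand-in from the sketch above (the thresholds — queue 5, KV cache 0.8 — are assumptions chosen to match the fixture, not values from the source):

```go
// shouldShed sketches the drop decision for non-critical requests: shed only
// when no pod has spare capacity. With the fixture above, pod 0 (queue 4,
// KV cache 0.2) is under both thresholds, so the request is routed, not shed.
func shouldShed(pods []podMetrics, queueThreshold int, kvThreshold float64) bool {
	for _, p := range pods {
		if p.WaitingQueueSize <= queueThreshold && p.KVCacheUsagePercent <= kvThreshold {
			return false // at least one pod has capacity; do not shed
		}
	}
	return true
}
```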
@@ -487,6 +553,3 @@ func readDocuments(fp string) ([][]byte, error) {
 	}
 	return docs, nil
 }
-func pointer(v int32) *int32 {
-	return &v
-}