Skip to content

Commit cc89e72

Browse files
Add test-case for sheddable that is not shed, fix nits and rename the non-lora test case to use a different model name.
1 parent a89b080 commit cc89e72

File tree

2 files changed

+83
-6
lines changed

2 files changed

+83
-6
lines changed

test/integration/hermetic_test.go

+69-6
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
6161
}{
6262
{
6363
name: "select lower queue and kv cache, no active lora",
64-
req: extprocutils.GenerateRequest("sql-lora"),
64+
req: extprocutils.GenerateRequest("my-model"),
6565
// pod-1 will be picked because it has relatively low queue size and low KV cache.
6666
pods: []*backend.PodMetrics{
6767
{
@@ -109,7 +109,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
109109
},
110110
},
111111
},
112-
wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"),
112+
wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"hello\",\"temperature\":0}"),
113113
wantErr: false,
114114
},
115115
{
@@ -180,7 +180,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
180180
{
181181
name: "select no lora despite active model, avoid excessive queue size",
182182
req: extprocutils.GenerateRequest("sql-lora"),
183-
// pod-2 will be picked despite it having the requested model being active
183+
// pod-2 will be picked despite it NOT having the requested model being active
184184
// as it's above the affinity for queue size. Also is critical, so we should
185185
// still honor request despite all queues > 5
186186
pods: []*backend.PodMetrics{
@@ -294,6 +294,72 @@ func TestKubeInferenceModelRequest(t *testing.T) {
294294
},
295295
},
296296
},
297+
{
298+
name: "noncritical, but one model has capacity, so not shed",
299+
req: extprocutils.GenerateRequest("sql-lora-sheddable"),
300+
// pod 0 will be picked as all other pods are above the threshold
301+
pods: []*backend.PodMetrics{
302+
{
303+
Pod: extprocutils.FakePod(0),
304+
Metrics: backend.Metrics{
305+
WaitingQueueSize: 4,
306+
KVCacheUsagePercent: 0.2,
307+
ActiveModels: map[string]int{
308+
"foo": 1,
309+
"bar": 1,
310+
"sql-lora-1fdg3": 1,
311+
},
312+
},
313+
},
314+
{
315+
Pod: extprocutils.FakePod(1),
316+
Metrics: backend.Metrics{
317+
WaitingQueueSize: 0,
318+
KVCacheUsagePercent: 0.85,
319+
ActiveModels: map[string]int{
320+
"foo": 1,
321+
"sql-lora-1fdg3": 1,
322+
},
323+
},
324+
},
325+
{
326+
Pod: extprocutils.FakePod(2),
327+
Metrics: backend.Metrics{
328+
WaitingQueueSize: 10,
329+
KVCacheUsagePercent: 0.9,
330+
ActiveModels: map[string]int{
331+
"foo": 1,
332+
"sql-lora-1fdg3": 1,
333+
},
334+
},
335+
},
336+
},
337+
wantHeaders: []*configPb.HeaderValueOption{
338+
{
339+
Header: &configPb.HeaderValue{
340+
Key: runserver.DefaultTargetEndpointKey,
341+
RawValue: []byte("address-0"),
342+
},
343+
},
344+
{
345+
Header: &configPb.HeaderValue{
346+
Key: "Content-Length",
347+
RawValue: []byte("76"),
348+
},
349+
},
350+
},
351+
wantMetadata: &structpb.Struct{
352+
Fields: map[string]*structpb.Value{
353+
runserver.DefaultTargetEndpointKey: {
354+
Kind: &structpb.Value_StringValue{
355+
StringValue: "address-0",
356+
},
357+
},
358+
},
359+
},
360+
wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"hello\",\"temperature\":0}"),
361+
wantErr: false,
362+
},
297363
}
298364

299365
// Set up global k8sclient and extproc server runner with test environment config
@@ -487,6 +553,3 @@ func readDocuments(fp string) ([][]byte, error) {
487553
}
488554
return docs, nil
489555
}
490-
func pointer(v int32) *int32 {
491-
return &v
492-
}

test/testdata/inferencepool-with-model-hermetic.yaml

+14
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,17 @@ spec:
3636
targetModels:
3737
- name: sql-lora-1fdg3
3838
weight: 100
39+
---
40+
apiVersion: inference.networking.x-k8s.io/v1alpha1
41+
kind: InferenceModel
42+
metadata:
43+
name: inferencemodel-generic
44+
namespace: default
45+
spec:
46+
modelName: my-model
47+
criticality: Critical
48+
poolRef:
49+
name: vllm-llama2-7b-pool
50+
targetModels:
51+
- name: my-model-12345
52+
weight: 100

0 commit comments

Comments
 (0)