@@ -61,7 +61,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 	}{
 		{
 			name: "select lower queue and kv cache, no active lora",
-			req:  extprocutils.GenerateRequest("sql-lora"),
+			req:  extprocutils.GenerateRequest("my-model"),
 			// pod-1 will be picked because it has relatively low queue size and low KV cache.
 			pods: []*backend.PodMetrics{
 				{
@@ -109,7 +109,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 		},
-		wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"),
+		wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"hello\",\"temperature\":0}"),
		wantErr: false,
 	},
 	{
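For context, the body these cases exchange is an OpenAI-style completion payload; `wantBody` shows it after the ext-proc has rewritten the client-facing model name (`my-model`) to the resolved target model (`my-model-12345`). Below is a minimal sketch of how a helper like `extprocutils.GenerateRequest` could build that JSON — a hypothetical stand-in, not the helper's real implementation (which wraps the body in an Envoy ext_proc request); the field values are taken from `wantBody`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// generateRequestBody is a hypothetical stand-in for the JSON-building part of
// extprocutils.GenerateRequest. json.Marshal emits map keys in sorted order,
// which matches the key order asserted in wantBody.
func generateRequestBody(model string) []byte {
	body, err := json.Marshal(map[string]interface{}{
		"max_tokens":  100,
		"model":       model, // rewritten to the target model by the ext-proc
		"prompt":      "hello",
		"temperature": 0,
	})
	if err != nil {
		panic(err) // cannot happen for this fixed payload
	}
	return body
}

func main() {
	// Prints: {"max_tokens":100,"model":"my-model-12345","prompt":"hello","temperature":0}
	fmt.Println(string(generateRequestBody("my-model-12345")))
}
```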
@@ -180,7 +180,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 		{
 			name: "select no lora despite active model, avoid excessive queue size",
 			req:  extprocutils.GenerateRequest("sql-lora"),
-			// pod-2 will be picked despite it having the requested model being active
+			// pod-2 will be picked despite it NOT having the requested model being active
 			// as it's above the affinity for queue size. Also is critical, so we should
 			// still honor request despite all queues > 5
 			pods: []*backend.PodMetrics{
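The comments above carry the scheduling reasoning this case asserts: LoRA affinity only counts among pods under the queue-size threshold, and critical models are still served even when every queue is over it. An illustrative sketch of that priority order follows — the names, types, and thresholds are hypothetical, not the project's actual scheduler API:

```go
package scheduling

// podMetrics is a minimal stand-in for the test's backend.Metrics fields.
type podMetrics struct {
	WaitingQueueSize    int
	KVCacheUsagePercent float64
	ActiveModels        map[string]int
}

// pickPod encodes the priority the test comments describe: pods over the
// queue-size threshold lose LoRA affinity; among the rest, prefer a pod with
// the requested model active, then the lowest KV-cache usage. Critical
// requests fall back to the shortest queue rather than being shed.
func pickPod(pods []podMetrics, model string, queueThreshold int, critical bool) *podMetrics {
	var best *podMetrics
	for i := range pods {
		p := &pods[i]
		if p.WaitingQueueSize > queueThreshold {
			continue // excessive queue size overrides LoRA affinity
		}
		if best == nil || betterThan(p, best, model) {
			best = p
		}
	}
	if best == nil && critical {
		// All queues exceed the threshold, but critical requests are still
		// honored: pick the shortest queue instead of shedding.
		for i := range pods {
			if best == nil || pods[i].WaitingQueueSize < best.WaitingQueueSize {
				best = &pods[i]
			}
		}
	}
	return best
}

func betterThan(p, best *podMetrics, model string) bool {
	_, pHas := p.ActiveModels[model]
	_, bHas := best.ActiveModels[model]
	if pHas != bHas {
		return pHas // LoRA affinity wins among pods under the queue threshold
	}
	return p.KVCacheUsagePercent < best.KVCacheUsagePercent
}
```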
@@ -294,6 +294,72 @@ func TestKubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "noncritical, but one model has capacity, do not shed",
+			req:  extprocutils.GenerateRequest("sql-lora-sheddable"),
+			// pod 0 will be picked as all other pods are above threshold
+			pods: []*backend.PodMetrics{
+				{
+					Pod: extprocutils.FakePod(0),
+					Metrics: backend.Metrics{
+						WaitingQueueSize:    4,
+						KVCacheUsagePercent: 0.2,
+						ActiveModels: map[string]int{
+							"foo":            1,
+							"bar":            1,
+							"sql-lora-1fdg3": 1,
+						},
+					},
+				},
+				{
+					Pod: extprocutils.FakePod(1),
+					Metrics: backend.Metrics{
+						WaitingQueueSize:    0,
+						KVCacheUsagePercent: 0.85,
+						ActiveModels: map[string]int{
+							"foo":            1,
+							"sql-lora-1fdg3": 1,
+						},
+					},
+				},
+				{
+					Pod: extprocutils.FakePod(2),
+					Metrics: backend.Metrics{
+						WaitingQueueSize:    10,
+						KVCacheUsagePercent: 0.9,
+						ActiveModels: map[string]int{
+							"foo":            1,
+							"sql-lora-1fdg3": 1,
+						},
+					},
+				},
+			},
+			wantHeaders: []*configPb.HeaderValueOption{
+				{
+					Header: &configPb.HeaderValue{
+						Key:      runserver.DefaultTargetEndpointKey,
+						RawValue: []byte("address-0"),
+					},
+				},
+				{
+					Header: &configPb.HeaderValue{
+						Key:      "Content-Length",
+						RawValue: []byte("76"),
+					},
+				},
+			},
+			wantMetadata: &structpb.Struct{
+				Fields: map[string]*structpb.Value{
+					runserver.DefaultTargetEndpointKey: {
+						Kind: &structpb.Value_StringValue{
+							StringValue: "address-0",
+						},
+					},
+				},
+			},
+			wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"hello\",\"temperature\":0}"),
+			wantErr:  false,
+		},
 	}
 
 	// Set up global k8sclient and extproc server runner with test environment config
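The new case exercises the shed decision for a non-critical ("sheddable") model: the request survives because pod 0 still has capacity. A hedged sketch of that gate, reusing the `podMetrics` stand-in from the sketch above (the thresholds — queue 5, KV cache 0.8 — are assumptions chosen to match the fixture, not values from the source):

```go
// shouldShed sketches the drop decision for non-critical requests: shed only
// when no pod has spare capacity. With the fixture above, pod 0 (queue 4,
// KV cache 0.2) is under both thresholds, so the request is routed, not shed.
func shouldShed(pods []podMetrics, queueThreshold int, kvThreshold float64) bool {
	for _, p := range pods {
		if p.WaitingQueueSize <= queueThreshold && p.KVCacheUsagePercent <= kvThreshold {
			return false // at least one pod has capacity; do not shed
		}
	}
	return true
}
```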
@@ -487,6 +553,3 @@ func readDocuments(fp string) ([][]byte, error) {
 	}
 	return docs, nil
 }
-func pointer(v int32) *int32 {
-	return &v
-}