Skip to content

Commit c493a18

Browse files
committed
Use the least kv cache for sheddable requests when there is capacity
1 parent 08dbd5a commit c493a18

File tree

5 files changed

+17
-93
lines changed

5 files changed

+17
-93
lines changed

pkg/ext-proc/backend/vllm/metrics.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *bac
8585
}
8686
*/
8787

88-
// TODO: Read from vLLM metrics once the metric is available.
88+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/22): Read from vLLM metrics once the metric is available.
8989
updated.MaxActiveModels = 4
9090

9191
// Update active loras

pkg/ext-proc/handlers/server.go

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
8585
if err != nil {
8686
klog.Errorf("failed to process request: %v", err)
8787
switch status.Code(err) {
88+
// This code can be returned by the scheduler when there is no capacity for sheddable
89+
// requests.
8890
case codes.ResourceExhausted:
8991
resp = &extProcPb.ProcessingResponse{
9092
Response: &extProcPb.ProcessingResponse_ImmediateResponse{

pkg/ext-proc/scheduling/filter.go

+2-32
Original file line numberDiff line numberDiff line change
@@ -149,32 +149,6 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac
149149
return filtered, nil
150150
}
151151

152-
// mostKVCacheFilterFunc is similar to leastKVCacheFilterFunc but prefers pods with higher KV cache.
153-
func mostKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
154-
min := math.MaxFloat64
155-
var max float64 = 0
156-
filtered := []*backend.PodMetrics{}
157-
158-
for _, pod := range pods {
159-
if pod.KVCacheUsagePercent <= min {
160-
min = pod.KVCacheUsagePercent
161-
}
162-
if pod.KVCacheUsagePercent >= max {
163-
max = pod.KVCacheUsagePercent
164-
}
165-
}
166-
167-
klog.V(3).Infof("mostKVCacheFilterFunc, max=%v, min=%v", max, min)
168-
for _, pod := range pods {
169-
klog.V(3).Infof("Evaluating pod %v", pod.KVCacheUsagePercent)
170-
if pod.KVCacheUsagePercent <= max && pod.KVCacheUsagePercent >= max-(max-min)/float64(len(pods)) {
171-
klog.V(3).Infof("Selected pod %v", pod.KVCacheUsagePercent)
172-
filtered = append(filtered, pod)
173-
}
174-
}
175-
return filtered, nil
176-
}
177-
178152
// podPredicate is a filter function to check whether a pod is desired.
179153
type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool
180154

@@ -189,12 +163,8 @@ func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
189163
return req.Critical
190164
}
191165

192-
func noQueueAndLessThanKVCacheThresholdPredicate(threshold float64) podPredicate {
166+
func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate {
193167
return func(req *LLMRequest, pod *backend.PodMetrics) bool {
194-
return pod.WaitingQueueSize <= 0 && pod.KVCacheUsagePercent <= threshold
168+
return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold
195169
}
196170
}
197-
198-
func allowAllPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
199-
return true
200-
}

pkg/ext-proc/scheduling/filter_test.go

+1-35
Original file line numberDiff line numberDiff line change
@@ -301,43 +301,9 @@ func TestFilterFunc(t *testing.T) {
301301
},
302302
},
303303
},
304-
{
305-
name: "most kv cache empty input",
306-
f: mostKVCacheFilterFunc,
307-
input: []*backend.PodMetrics{},
308-
output: []*backend.PodMetrics{},
309-
},
310-
{
311-
name: "most kv cache",
312-
f: mostKVCacheFilterFunc,
313-
input: []*backend.PodMetrics{
314-
{
315-
Metrics: backend.Metrics{
316-
KVCacheUsagePercent: 0,
317-
},
318-
},
319-
{
320-
Metrics: backend.Metrics{
321-
KVCacheUsagePercent: 0.3,
322-
},
323-
},
324-
{
325-
Metrics: backend.Metrics{
326-
KVCacheUsagePercent: 1.0,
327-
},
328-
},
329-
},
330-
output: []*backend.PodMetrics{
331-
{
332-
Metrics: backend.Metrics{
333-
KVCacheUsagePercent: 1.0,
334-
},
335-
},
336-
},
337-
},
338304
{
339305
name: "noQueueAndLessThanKVCacheThresholdPredicate",
340-
f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0.8)),
306+
f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)),
341307
input: []*backend.PodMetrics{
342308
{
343309
// This pod should be returned.

pkg/ext-proc/scheduling/scheduler.go

+11-25
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,23 @@ import (
1313
)
1414

1515
const (
16-
// TODO Consider making this configurable.
17-
kvCacheThreshold = 80
16+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
17+
kvCacheThreshold = 0.8
18+
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
19+
queueThreshold = 5
1820
)
1921

2022
var (
21-
allowAllFilter = &filter{
22-
name: "noop",
23-
filter: toFilterFunc(allowAllPredicate),
24-
}
25-
2623
defaultFilter = &filter{
2724
name: "critical request",
2825
filter: toFilterFunc(criticalRequestPredicate),
29-
nextOnSuccess: criticalRequestFilter,
26+
nextOnSuccess: lowLatencyFilter,
3027
nextOnFailure: sheddableRequestFilter,
3128
}
3229

33-
// The goal for scheduling critical requests is to minimize the latency. The heuristic is to
34-
// pick a server with least "load" (KV Cache), which typically yields lower latency.
35-
// Heuristics for scheduling critical requests:
36-
criticalRequestFilter = &filter{
30+
// lowLatencyFilter tries to minimize the latency. The heuristic is to pick a server with lower
31+
// cost to load an adapter and has low KV cache, which typically yields lower latency.
32+
lowLatencyFilter = &filter{
3733
name: "least queuing",
3834
filter: leastQueuingFilterFunc,
3935
nextOnSuccessOrFailure: &filter{
@@ -46,23 +42,13 @@ var (
4642
},
4743
}
4844

49-
// The goal for scheduling sheddable requests is to optimize for throughput while reducing
50-
// queuing, and leave low load (KV cache) servers to serve critical requests.
5145
sheddableRequestFilter = &filter{
5246
// When there is at least one model server that's not queuing requests, and still has KV
5347
// cache below a certain threshold, we consider this model server has capacity to handle
5448
// a sheddable request without impacting critical requests.
55-
name: "has capacity for sheddable requests",
56-
filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(kvCacheThreshold)),
57-
nextOnSuccess: &filter{
58-
name: "most KV cache percent",
59-
filter: mostKVCacheFilterFunc,
60-
nextOnSuccessOrFailure: &filter{
61-
name: "low cost LoRA",
62-
filter: toFilterFunc(lowLoRACostPredicate),
63-
nextOnFailure: allowAllFilter,
64-
},
65-
},
49+
name: "has capacity for sheddable requests",
50+
filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold, kvCacheThreshold)),
51+
nextOnSuccess: lowLatencyFilter,
6652
// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
6753
// request to make room for critical requests.
6854
nextOnFailure: &filter{

0 commit comments

Comments
 (0)