
Commit f272cc2

Add priority based scheduling

1 parent fcad109 · commit f272cc2

8 files changed (+150 -32)

pkg/ext-proc/backend/metrics.go (+5 -2)

@@ -89,18 +89,21 @@ func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *Pod
 	}
 	*/
 
+	// TODO: Read from vLLM metrics once it is available.
+	updated.MaxActiveModels = 4
+
 	// Update active loras
 	mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
 	if ok {
 		// IMPORTANT: replace the map entries instead of appending to it.
-		updated.CachedModels = make(map[string]int)
+		updated.ActiveModels = make(map[string]int)
 		for _, metric := range mf.GetMetric() {
 			for _, label := range metric.GetLabel() {
 				if label.GetName() == "active_adapters" {
 					if label.GetValue() != "" {
 						adapterList := strings.Split(label.GetValue(), ",")
 						for _, adapter := range adapterList {
-							updated.CachedModels[adapter] = 0
+							updated.ActiveModels[adapter] = 0
 						}
 					}
 				}
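As context for the hunk above: the adapter set arrives as a single comma-separated label value rather than as separate metric series, so the handler rebuilds the ActiveModels set from scratch on every scrape. A minimal, self-contained sketch of just that parsing step (the label value here is hypothetical):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical value of the "active_adapters" label as scraped from
	// the model server's metrics endpoint.
	labelValue := "lora-a,lora-b"

	// Replace, not append: a fresh map so stale adapters drop out.
	activeModels := make(map[string]int)
	if labelValue != "" {
		for _, adapter := range strings.Split(labelValue, ",") {
			activeModels[adapter] = 0
		}
	}
	fmt.Println(activeModels) // map[lora-a:0 lora-b:0]
}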

pkg/ext-proc/backend/provider.go (+1 -1)

@@ -113,7 +113,7 @@ func (p *Provider) refreshPodsOnce() error {
 		new := &PodMetrics{
 			Pod: pod,
 			Metrics: Metrics{
-				CachedModels: make(map[string]int),
+				ActiveModels: make(map[string]int),
 			},
 		}
 		p.podMetrics.Store(pod, new)

pkg/ext-proc/backend/types.go (+8 -6)

@@ -12,12 +12,14 @@ type Pod struct {
 }
 
 func (p Pod) String() string {
-	return p.Namespace + "." + p.Name
+	return p.Namespace + "/" + p.Name
 }
 
 type Metrics struct {
-	// CachedModels is a set of models(including LoRA adapters) that are currently cached to GPU.
-	CachedModels map[string]int
+	// ActiveModels is a set of models (including LoRA adapters) that are currently cached to GPU.
+	ActiveModels map[string]int
+	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
+	MaxActiveModels int
 	RunningQueueSize    int
 	WaitingQueueSize    int
 	KVCacheUsagePercent float64
@@ -34,14 +36,14 @@ func (pm *PodMetrics) String() string {
 }
 
 func (pm *PodMetrics) Clone() *PodMetrics {
-	cm := make(map[string]int, len(pm.CachedModels))
-	for k, v := range pm.CachedModels {
+	cm := make(map[string]int, len(pm.ActiveModels))
+	for k, v := range pm.ActiveModels {
 		cm[k] = v
 	}
 	clone := &PodMetrics{
 		Pod: pm.Pod,
 		Metrics: Metrics{
-			CachedModels: cm,
+			ActiveModels:        cm,
 			RunningQueueSize:    pm.RunningQueueSize,
 			WaitingQueueSize:    pm.WaitingQueueSize,
 			KVCacheUsagePercent: pm.KVCacheUsagePercent,
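Why Clone copies the map entry by entry: Go maps are reference types, so a plain struct copy would leave the clone and the original sharing one ActiveModels map. A small illustration with types simplified from the diff (not part of the commit):

package main

import "fmt"

type Metrics struct {
	ActiveModels    map[string]int
	MaxActiveModels int
}

func main() {
	orig := Metrics{ActiveModels: map[string]int{"lora-a": 0}}
	shallow := orig                     // copies the struct, but the map header is shared
	shallow.ActiveModels["lora-b"] = 0  // mutates orig.ActiveModels too
	fmt.Println(len(orig.ActiveModels)) // 2 — which is why Clone() copies key by key
}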

pkg/ext-proc/handlers/request.go (+3 -1)

@@ -38,6 +38,8 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
 		// TODO: Once the API is approved, read the "LLMUseCase" configuration and apply traffic split.
 		TargetModels:        map[string]int{model: 100},
 		ResolvedTargetModel: model,
+		// TODO: Read from LLMService CRD.
+		Critical: true,
 	}
 
 	// Update target models in the body.
@@ -51,7 +53,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
 
 	targetPod, err := s.scheduler.Schedule(llmReq)
 	if err != nil {
-		return nil, fmt.Errorf("failed to find target pod: %v", err)
+		return nil, fmt.Errorf("failed to find target pod: %w", err)
 	}
 	klog.V(3).Infof("Selected target model %v in target pod: %v\n", llmReq.ResolvedTargetModel, targetPod)
 
pkg/ext-proc/handlers/server.go (+16 -2)

@@ -4,6 +4,7 @@ import (
 	"io"
 
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
 	klog "k8s.io/klog/v2"
@@ -83,13 +84,26 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 
 		if err != nil {
 			klog.Errorf("failed to process request: %v", err)
-			return status.Errorf(codes.Unknown, "failed to handle request: %v", err)
+			switch status.Code(err) {
+			case codes.ResourceExhausted:
+				resp = &extProcPb.ProcessingResponse{
+					Response: &extProcPb.ProcessingResponse_ImmediateResponse{
+						ImmediateResponse: &extProcPb.ImmediateResponse{
+							Status: &envoyTypePb.HttpStatus{
+								Code: envoyTypePb.StatusCode_TooManyRequests,
+							},
+						},
+					},
+				}
+			default:
+				return status.Errorf(status.Code(err), "failed to handle request: %w", err)
+			}
 		}
 
 		klog.V(3).Infof("response: %v", resp)
 		if err := srv.Send(resp); err != nil {
 			klog.Errorf("send error %v", err)
-			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %w", err)
 		}
 	}
 }
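One note on the control flow above: returning a gRPC error from Process tears down the ext_proc stream, whereas an ImmediateResponse asks Envoy to answer the client directly with the given HTTP status. So a ResourceExhausted error from the scheduler (see scheduler.go below) surfaces to the client as 429 Too Many Requests. A sketch of that mapping factored into a standalone helper — the helper name is hypothetical, not part of this commit:

package handlers

import (
	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// errorToResponse mirrors the switch above: resource exhaustion becomes an
// HTTP 429 that Envoy sends to the client; anything else remains a terminal
// gRPC error for the stream.
func errorToResponse(err error) (*extProcPb.ProcessingResponse, error) {
	if status.Code(err) == codes.ResourceExhausted {
		return &extProcPb.ProcessingResponse{
			Response: &extProcPb.ProcessingResponse_ImmediateResponse{
				ImmediateResponse: &extProcPb.ImmediateResponse{
					Status: &envoyTypePb.HttpStatus{
						Code: envoyTypePb.StatusCode_TooManyRequests,
					},
				},
			},
		}, nil
	}
	return nil, status.Errorf(status.Code(err), "failed to handle request: %v", err)
}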

pkg/ext-proc/scheduling/filter.go (+62 -17)

@@ -42,30 +42,34 @@ func (f *filter) Name() string {
 }
 
 func (f *filter) Filter(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-	if f == nil {
-		klog.V(3).Infof("Running nil filter, returning all input pods by default")
-		return pods, nil
-	}
 	klog.V(3).Infof("Running filter %q on request %v with %v pods", f.name, b, len(pods))
 
 	filtered, err := f.filter(b, pods)
 
 	next := f.nextOnSuccessOrFailure
 	if err == nil {
-		klog.V(3).Infof("onSuccess %v -> %v, filtered: %v", f.name, next.Name(), len(filtered))
+		if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
 		if f.nextOnSuccess != nil {
 			next = f.nextOnSuccess
 		}
+		klog.V(3).Infof("onSuccess %q -> %q, filtered: %v", f.name, next.Name(), len(filtered))
 		// On success, pass the filtered result to the next filter.
 		return next.Filter(b, filtered)
+	} else {
+		if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
+		if f.nextOnFailure != nil {
+			next = f.nextOnFailure
+		}
+		klog.V(3).Infof("onFailure %q -> %q", f.name, next.Name())
+		// On failure, pass the initial set of pods to the next filter.
+		return next.Filter(b, pods)
 	}
-
-	klog.V(3).Infof("onFailure %v -> %v", f.name, next.Name())
-	if f.nextOnFailure != nil {
-		next = f.nextOnFailure
-	}
-	// On failure, pass the initial set of pods to the next filter.
-	return next.Filter(b, pods)
 }
 
 // filterFunc filters a set of input pods to a subset.
@@ -125,7 +129,7 @@ func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
 // TODO: Compare this strategy with other strategies such as top K.
 func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	min := math.MaxFloat64
-	max := math.SmallestNonzeroFloat64
+	var max float64 = 0
 	filtered := []*backend.PodMetrics{}
 
 	for _, pod := range pods {
@@ -145,11 +149,52 @@ func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
 	return filtered, nil
 }
 
+// mostKVCacheFilterFunc is similar to leastKVCacheFilterFunc but prefers pods with higher KV cache.
+func mostKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+	min := math.MaxFloat64
+	var max float64 = 0
+	filtered := []*backend.PodMetrics{}
+
+	for _, pod := range pods {
+		if pod.KVCacheUsagePercent <= min {
+			min = pod.KVCacheUsagePercent
+		}
+		if pod.KVCacheUsagePercent >= max {
+			max = pod.KVCacheUsagePercent
+		}
+	}
+
+	klog.V(3).Infof("mostKVCacheFilterFunc, max=%v, min=%v", max, min)
+	for _, pod := range pods {
+		klog.V(3).Infof("Evaluating pod %v", pod.KVCacheUsagePercent)
+		if pod.KVCacheUsagePercent <= max && pod.KVCacheUsagePercent >= max-(max-min)/float64(len(pods)) {
+			klog.V(3).Infof("Selected pod %v", pod.KVCacheUsagePercent)
+			filtered = append(filtered, pod)
+		}
+	}
+	return filtered, nil
+}
+
 // podPredicate is a filter function to check whether a pod is desired.
 type podPredicate func(b *LLMRequest, pod *backend.PodMetrics) bool
 
-// loraAffinityPredicate return true if the pod have the requested LoRA adapter loaded.
-func loraAffinityPredicate(b *LLMRequest, pod *backend.PodMetrics) bool {
-	_, ok := pod.CachedModels[b.ResolvedTargetModel]
-	return ok
+// We consider serving an adapter low cost if the adapter is active in the model server, or the
+// model server has room to load the adapter.
+func lowLoRACostPredicate(b *LLMRequest, pod *backend.PodMetrics) bool {
+	_, ok := pod.ActiveModels[b.ResolvedTargetModel]
+	return ok || len(pod.ActiveModels) < pod.MaxActiveModels
+}
+
+func criticalRequestPredicate(b *LLMRequest, pod *backend.PodMetrics) bool {
+	return b.Critical
+}
+
+func noQueueAndLessThanKVCacheThresholdPredicate(threshold float64) podPredicate {
+	return func(b *LLMRequest, pod *backend.PodMetrics) bool {
+		return pod.WaitingQueueSize <= 0 && pod.KVCacheUsagePercent <= threshold
+	}
+}
+
+func allowAllPredicate(b *LLMRequest, pod *backend.PodMetrics) bool {
+	return true
 }
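The selection band in mostKVCacheFilterFunc keeps every pod within (max-min)/len(pods) of the maximum, so the band tightens as the candidate set grows. A worked example with hypothetical usage numbers:

package main

import "fmt"

func main() {
	// Hypothetical KV cache usage (percent) for three candidate pods.
	usage := []float64{10, 50, 80}
	min, max := 10.0, 80.0 // as computed by the first loop in mostKVCacheFilterFunc

	// Band used by the second loop: [max - (max-min)/len(pods), max].
	lower := max - (max-min)/float64(len(usage)) // 80 - 70/3 ≈ 56.67
	for _, u := range usage {
		if u <= max && u >= lower {
			fmt.Println("selected:", u) // only the 80% pod falls in the band
		}
	}
}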

pkg/ext-proc/scheduling/scheduler.go (+54 -3)

@@ -5,24 +5,74 @@ import (
 	"fmt"
 	"math/rand"
 
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 	klog "k8s.io/klog/v2"
 
 	"ext-proc/backend"
 )
 
+const (
+	// TODO: Consider making this configurable.
+	kvCacheThreshold = 80
+)
+
 var (
+	allowAllFilter = &filter{
+		name:   "noop",
+		filter: toFilterFunc(allowAllPredicate),
+	}
+
 	defaultFilter = &filter{
+		name:          "critical request",
+		filter:        toFilterFunc(criticalRequestPredicate),
+		nextOnSuccess: criticalRequestFilter,
+		nextOnFailure: sheddableRequestFilter,
+	}
+
+	// The goal for scheduling critical requests is to minimize latency. The heuristic is to
+	// pick a server with the least "load" (KV cache), which typically yields lower latency.
+	criticalRequestFilter = &filter{
 		name:   "least queuing",
 		filter: leastQueuingFilterFunc,
 		nextOnSuccessOrFailure: &filter{
-			name:   "lora affinity",
-			filter: toFilterFunc(loraAffinityPredicate),
+			name:   "low cost LoRA",
+			filter: toFilterFunc(lowLoRACostPredicate),
 			nextOnSuccessOrFailure: &filter{
 				name:   "least KV cache percent",
 				filter: leastKVCacheFilterFunc,
 			},
 		},
 	}
+
+	// The goal for scheduling sheddable requests is to optimize for throughput while reducing
+	// queuing, and leave low load (KV cache) servers to serve critical requests.
+	sheddableRequestFilter = &filter{
+		// When there is at least one model server that's not queuing requests, and still has KV
+		// cache below a certain threshold, we consider this model server to have capacity to
+		// handle a sheddable request without impacting critical requests.
+		name:   "has capacity for sheddable requests",
+		filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(kvCacheThreshold)),
+		nextOnSuccess: &filter{
+			name:   "most KV cache percent",
+			filter: mostKVCacheFilterFunc,
+			nextOnSuccessOrFailure: &filter{
+				name:          "low cost LoRA",
+				filter:        toFilterFunc(lowLoRACostPredicate),
+				nextOnFailure: allowAllFilter,
+			},
+		},
+		// If all pods are queuing or running above the KV cache threshold, we drop the sheddable
+		// request to make room for critical requests.
+		nextOnFailure: &filter{
+			name: "drop request",
+			filter: func(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+				klog.Infof("Dropping request %v", b)
+				return []*backend.PodMetrics{}, status.Errorf(codes.ResourceExhausted, "dropping request due to limited backend resources")
+			},
+		},
+	}
 )
 
 func NewScheduler(pmp PodMetricsProvider) *Scheduler {
@@ -48,8 +98,9 @@ func (s *Scheduler) Schedule(b *LLMRequest) (targetPod *backend.Pod, err error)
 	klog.V(3).Infof("request: %v; metrics: %+v", b, s.podMetricsProvider.AllPodMetrics())
 	pods, err := s.filter.Filter(b, s.podMetricsProvider.AllPodMetrics())
 	if err != nil || len(pods) == 0 {
-		return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %v", len(pods), err)
+		return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err)
 	}
+	klog.V(3).Infof("Going to randomly select a pod from the candidates: %+v", pods)
 	i := rand.Intn(len(pods))
 	return &pods[i].Pod, nil
 }
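Reading the filter wiring above as a decision tree (a restatement of the var block, with the threshold from kvCacheThreshold):

defaultFilter: is the request critical?
├─ yes → least queuing → low cost LoRA → least KV cache percent
└─ no  → any pod with no queue and KV cache ≤ threshold?
         ├─ yes → most KV cache percent → low cost LoRA (noop fallback on failure)
         └─ no  → drop request (ResourceExhausted → HTTP 429 via the ext-proc server)

Note the asymmetry: critical requests are spread toward the least-loaded servers, while sheddable requests are packed onto the busiest servers that still have headroom, leaving low-load servers free for critical traffic. The scheduler then picks uniformly at random among whatever pods survive the chain.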

pkg/ext-proc/scheduling/types.go (+1 -0)

@@ -7,4 +7,5 @@ type LLMRequest struct {
 	TargetModels map[string]int
 	// Resolved target model is the final target model after traffic split.
 	ResolvedTargetModel string
+	Critical            bool
 }
