Commit 62f54a5

liu-cong authored and kfswain committed
Add priority based scheduling (kubernetes-sigs#25)
* Add priority based scheduling

* Use the least kv cache for sheddable requests when there is capacity
1 parent 2280463 commit 62f54a5

File tree

10 files changed: +661 −45 lines changed


pkg/ext-proc/backend/provider.go

+1 −1

@@ -95,7 +95,7 @@ func (p *Provider) refreshPodsOnce() error {
 		new := &PodMetrics{
 			Pod: pod,
 			Metrics: Metrics{
-				CachedModels: make(map[string]int),
+				ActiveModels: make(map[string]int),
 			},
 		}
 		p.podMetrics.Store(pod, new)

pkg/ext-proc/backend/types.go

+8 −6

@@ -13,12 +13,14 @@ type Pod struct {
 }
 
 func (p Pod) String() string {
-	return p.Namespace + "." + p.Name
+	return p.Namespace + "/" + p.Name
 }
 
 type Metrics struct {
-	// CachedModels is a set of models(including LoRA adapters) that are currently cached to GPU.
-	CachedModels map[string]int
+	// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
+	ActiveModels map[string]int
+	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
+	MaxActiveModels int
 	RunningQueueSize int
 	WaitingQueueSize int
 	KVCacheUsagePercent float64
@@ -35,14 +37,14 @@ func (pm *PodMetrics) String() string {
 }
 
 func (pm *PodMetrics) Clone() *PodMetrics {
-	cm := make(map[string]int, len(pm.CachedModels))
-	for k, v := range pm.CachedModels {
+	cm := make(map[string]int, len(pm.ActiveModels))
+	for k, v := range pm.ActiveModels {
 		cm[k] = v
 	}
 	clone := &PodMetrics{
 		Pod: pm.Pod,
 		Metrics: Metrics{
-			CachedModels: cm,
+			ActiveModels: cm,
 			RunningQueueSize: pm.RunningQueueSize,
 			WaitingQueueSize: pm.WaitingQueueSize,
 			KVCacheUsagePercent: pm.KVCacheUsagePercent,
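Aside (not part of the commit): a minimal, self-contained sketch of why Clone deep-copies ActiveModels — the metrics refresher mutates the copy and then swaps the whole *PodMetrics in, so readers of the previous snapshot never observe a half-updated map. The trimmed types below are illustrative stand-ins for backend.PodMetrics, not the repository's definitions.

package main

import "fmt"

// Metrics and PodMetrics are trimmed stand-ins for the backend types above.
type Metrics struct {
	ActiveModels    map[string]int
	MaxActiveModels int
}

type PodMetrics struct {
	Name string
	Metrics
}

// Clone deep-copies ActiveModels so the original snapshot stays untouched.
func (pm *PodMetrics) Clone() *PodMetrics {
	cm := make(map[string]int, len(pm.ActiveModels))
	for k, v := range pm.ActiveModels {
		cm[k] = v
	}
	return &PodMetrics{
		Name:    pm.Name,
		Metrics: Metrics{ActiveModels: cm, MaxActiveModels: pm.MaxActiveModels},
	}
}

func main() {
	orig := &PodMetrics{
		Name:    "pod-a",
		Metrics: Metrics{ActiveModels: map[string]int{"lora-a": 0}, MaxActiveModels: 4},
	}
	updated := orig.Clone()
	updated.ActiveModels["lora-b"] = 0 // mutate only the clone
	fmt.Println(len(orig.ActiveModels), len(updated.ActiveModels)) // prints: 1 2
}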

pkg/ext-proc/backend/vllm/metrics.go

+133
@@ -0,0 +1,133 @@
+// Package vllm provides vllm specific pod metrics implementation.
+package vllm
+
+import (
+	"ext-proc/backend"
+	"fmt"
+	"net/http"
+	"strings"
+	"time"
+
+	dto "github.com/prometheus/client_model/go"
+	"github.com/prometheus/common/expfmt"
+	"go.uber.org/multierr"
+	klog "k8s.io/klog/v2"
+)
+
+const (
+	ActiveLoRAAdaptersMetricName        = "vllm:info_active_adapters_info"
+	LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
+	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
+	RunningQueueSizeMetricName = "vllm:num_requests_running"
+	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
+	/* TODO: Uncomment this once the following are added to the fork.
+	RunningQueueSizeMetricName = "vllm:num_tokens_running"
+	WaitingQueueSizeMetricName = "vllm:num_tokens_waiting"
+	*/
+	KVCacheUsagePercentMetricName     = "vllm:gpu_cache_usage_perc"
+	KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity"
+)
+
+type PodMetricsClientImpl struct {
+}
+
+// FetchMetrics fetches metrics from a given pod.
+func (p *PodMetricsClientImpl) FetchMetrics(pod backend.Pod, existing *backend.PodMetrics) (*backend.PodMetrics, error) {
+	// Currently the metrics endpoint is hard-coded, which works with vLLM.
+	// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16): Consume this from LLMServerPool config.
+	url := fmt.Sprintf("http://%s/metrics", pod.Address)
+	resp, err := http.Get(url)
+	if err != nil {
+		klog.Errorf("failed to fetch metrics from %s: %v", pod, err)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		klog.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode)
+	}
+
+	parser := expfmt.TextParser{}
+	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+	return promToPodMetrics(metricFamilies, existing)
+}
+
+// promToPodMetrics updates internal pod metrics with scraped prometheus metrics.
+// A combined error is returned if errors occur in one or more metric processing.
+// it returns a new PodMetrics pointer which can be used to atomically update the pod metrics map.
+func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *backend.PodMetrics) (*backend.PodMetrics, error) {
+	var errs error
+	updated := existing.Clone()
+	runningQueueSize, _, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName)
+	multierr.Append(errs, err)
+	if err == nil {
+		updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue())
+	}
+	waitingQueueSize, _, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName)
+	multierr.Append(errs, err)
+	if err == nil {
+		updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue())
+	}
+	cachePercent, _, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName)
+	multierr.Append(errs, err)
+	if err == nil {
+		updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue()
+	}
+	/* TODO: uncomment once this is available in vllm.
+	kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName)
+	multierr.Append(errs, err)
+	if err != nil {
+		updated.KvCacheMaxTokenCapacity = int(kvCap)
+	}
+	*/
+
+	// Update active loras
+	mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName]
+	if ok {
+		// IMPORTANT: replace the map entries instead of appending to it.
+		updated.ActiveModels = make(map[string]int)
+		for _, metric := range mf.GetMetric() {
+			for _, label := range metric.GetLabel() {
+				if label.GetName() == "active_adapters" {
+					if label.GetValue() != "" {
+						adapterList := strings.Split(label.GetValue(), ",")
+						for _, adapter := range adapterList {
+							updated.ActiveModels[adapter] = 0
+						}
+					}
+				}
+			}
+		}
+	} else {
+		klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName)
+		multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName))
+	}
+
+	return updated, errs
+}
+
+// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric.
+func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) {
+	mf, ok := metricFamilies[metricName]
+	if !ok {
+		klog.Warningf("metric family %q not found", metricName)
+		return nil, time.Time{}, fmt.Errorf("metric family %q not found", metricName)
+	}
+	if len(mf.GetMetric()) == 0 {
+		return nil, time.Time{}, fmt.Errorf("no metrics available for %q", metricName)
+	}
+	var latestTs int64
+	var latest *dto.Metric
+	for _, m := range mf.GetMetric() {
+		if m.GetTimestampMs() >= latestTs {
+			latestTs = m.GetTimestampMs()
+			latest = m
+		}
+	}
+	klog.V(4).Infof("Got metric value %+v for metric %v", latest, metricName)
+	return latest, time.Unix(0, latestTs*1000), nil
+}
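As a usage illustration (not from the commit), the following self-contained sketch shows how the active_adapters label handled in promToPodMetrics above becomes the ActiveModels set. The sample payload is hypothetical but follows the shape of the vllm:info_active_adapters_info family, and the sketch uses the upstream github.com import path for expfmt.

package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// Hypothetical scrape output in the shape promToPodMetrics expects.
	payload := `# TYPE vllm:info_active_adapters_info gauge
vllm:info_active_adapters_info{active_adapters="lora-a,lora-b"} 1
`
	parser := expfmt.TextParser{}
	families, err := parser.TextToMetricFamilies(strings.NewReader(payload))
	if err != nil {
		panic(err)
	}

	// Same label walk as promToPodMetrics: split the comma-separated adapter list.
	activeModels := map[string]int{}
	if mf, ok := families["vllm:info_active_adapters_info"]; ok {
		for _, m := range mf.GetMetric() {
			for _, label := range m.GetLabel() {
				if label.GetName() == "active_adapters" && label.GetValue() != "" {
					for _, adapter := range strings.Split(label.GetValue(), ",") {
						activeModels[adapter] = 0
					}
				}
			}
		}
	}
	fmt.Println(activeModels) // map[lora-a:0 lora-b:0]
}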

pkg/ext-proc/go.mod

+1
@@ -5,6 +5,7 @@ go 1.22.0
 require (
 	github.com/bojand/ghz v0.120.0
 	github.com/envoyproxy/go-control-plane v0.13.0
+	github.com/google/go-cmp v0.6.0
 	github.com/jhump/protoreflect v1.15.1
 	github.com/prometheus/client_model v0.6.1
 	github.com/prometheus/common v0.55.0

pkg/ext-proc/handlers/request.go

+3 −1

@@ -38,6 +38,8 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
 		// TODO: Once the API is approved, read the "LLMUseCase" configuration and apply traffic split.
 		TargetModels: map[string]int{model: 100},
 		ResolvedTargetModel: model,
+		// TODO: Read from LLMService CRD.
+		Critical: true,
 	}
 
 	// Update target models in the body.
@@ -51,7 +53,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
 
 	targetPod, err := s.scheduler.Schedule(llmReq)
 	if err != nil {
-		return nil, fmt.Errorf("failed to find target pod: %v", err)
+		return nil, fmt.Errorf("failed to find target pod: %w", err)
 	}
 	klog.V(3).Infof("Selected target model %v in target pod: %v\n", llmReq.ResolvedTargetModel, targetPod)
 
pkg/ext-proc/handlers/server.go

+18 −2

@@ -4,6 +4,7 @@ import (
 	"io"
 
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
 	klog "k8s.io/klog/v2"
@@ -83,13 +84,28 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 
 		if err != nil {
 			klog.Errorf("failed to process request: %v", err)
-			return status.Errorf(codes.Unknown, "failed to handle request: %v", err)
+			switch status.Code(err) {
+			// This code can be returned by scheduler when there is no capacity for sheddable
+			// requests.
+			case codes.ResourceExhausted:
+				resp = &extProcPb.ProcessingResponse{
+					Response: &extProcPb.ProcessingResponse_ImmediateResponse{
+						ImmediateResponse: &extProcPb.ImmediateResponse{
+							Status: &envoyTypePb.HttpStatus{
+								Code: envoyTypePb.StatusCode_TooManyRequests,
+							},
+						},
+					},
+				}
+			default:
+				return status.Errorf(status.Code(err), "failed to handle request: %w", err)
+			}
 		}
 
 		klog.V(3).Infof("response: %v", resp)
 		if err := srv.Send(resp); err != nil {
 			klog.Errorf("send error %v", err)
-			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %w", err)
 		}
 	}
 }
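To illustrate the other half of this contract (not part of the diff above): the scheduler is expected to return a gRPC error carrying codes.ResourceExhausted when a sheddable request finds no capacity, and the switch above turns that into an Envoy ImmediateResponse with HTTP 429 instead of failing the stream. A minimal sketch follows; the errNoCapacity helper and its message are illustrative assumptions, only the status code matters.

package main

import (
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// errNoCapacity is a hypothetical helper a scheduler could use to signal
// "drop this sheddable request"; the codes.ResourceExhausted code is what
// the ext-proc server keys off.
func errNoCapacity() error {
	return status.Errorf(codes.ResourceExhausted, "no capacity for sheddable request")
}

func main() {
	err := errNoCapacity()
	// The ext-proc server switches on status.Code(err); ResourceExhausted is
	// mapped to StatusCode_TooManyRequests rather than an Unknown gRPC error.
	fmt.Println(status.Code(err) == codes.ResourceExhausted) // true
}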

pkg/ext-proc/scheduling/filter.go

+44 −29

@@ -11,7 +11,7 @@ import (
 
 type Filter interface {
 	Name() string
-	Filter(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
+	Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
 }
 
 // filter applies current filterFunc, and then recursively applies next filters depending success or
@@ -41,42 +41,46 @@ func (f *filter) Name() string {
 	return f.name
 }
 
-func (f *filter) Filter(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-	if f == nil {
-		klog.V(3).Infof("Running nil filter, returning all input pods by default")
-		return pods, nil
-	}
-	klog.V(3).Infof("Running filter %q on request %v with %v pods", f.name, b, len(pods))
+func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+	klog.V(3).Infof("Running filter %q on request %v with %v pods", f.name, req, len(pods))
 
-	filtered, err := f.filter(b, pods)
+	filtered, err := f.filter(req, pods)
 
 	next := f.nextOnSuccessOrFailure
-	if err == nil {
-		klog.V(3).Infof("onSuccess %v -> %v, filtered: %v", f.name, next.Name(), len(filtered))
+	if err == nil && len(filtered) > 0 {
+		if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
 		if f.nextOnSuccess != nil {
 			next = f.nextOnSuccess
 		}
+		klog.V(3).Infof("onSuccess %q -> %q, filtered: %v", f.name, next.Name(), len(filtered))
 		// On success, pass the filtered result to the next filter.
-		return next.Filter(b, filtered)
-	}
-
-	klog.V(3).Infof("onFailure %v -> %v", f.name, next.Name())
-	if f.nextOnFailure != nil {
-		next = f.nextOnFailure
+		return next.Filter(req, filtered)
+	} else {
+		if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
+		if f.nextOnFailure != nil {
+			next = f.nextOnFailure
+		}
+		klog.V(3).Infof("onFailure %q -> %q", f.name, next.Name())
+		// On failure, pass the initial set of pods to the next filter.
+		return next.Filter(req, pods)
 	}
-	// On failure, pass the initial set of pods to the next filter.
-	return next.Filter(b, pods)
 }
 
 // filterFunc filters a set of input pods to a subset.
-type filterFunc func(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
+type filterFunc func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
 
 // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc.
 func toFilterFunc(pp podPredicate) filterFunc {
-	return func(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+	return func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 		filtered := []*backend.PodMetrics{}
 		for _, pod := range pods {
-			pass := pp(b, pod)
+			pass := pp(req, pod)
 			if pass {
 				filtered = append(filtered, pod)
 			}
@@ -95,7 +99,7 @@ func toFilterFunc(pp podPredicate) filterFunc {
 // the least one as it gives more choices for the next filter, which on aggregate gave better
 // results.
 // TODO: Compare this strategy with other strategies such as top K.
-func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	min := math.MaxInt
 	max := 0
 	filtered := []*backend.PodMetrics{}
@@ -123,9 +127,9 @@ func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
 // should consider them all instead of the absolute minimum one. This worked better than picking the
 // least one as it gives more choices for the next filter, which on aggregate gave better results.
 // TODO: Compare this strategy with other strategies such as top K.
-func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	min := math.MaxFloat64
-	max := math.SmallestNonzeroFloat64
+	var max float64 = 0
 	filtered := []*backend.PodMetrics{}
 
 	for _, pod := range pods {
@@ -146,10 +150,21 @@ func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
 }
 
 // podPredicate is a filter function to check whether a pod is desired.
-type podPredicate func(b *LLMRequest, pod *backend.PodMetrics) bool
+type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool
+
+// We consider serving an adapter low cost it the adapter is active in the model server, or the
+// model server has room to load the adapter
+func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+	_, ok := pod.ActiveModels[req.ResolvedTargetModel]
+	return ok || len(pod.ActiveModels) < pod.MaxActiveModels
+}
 
-// loraAffinityPredicate return true if the pod have the requested LoRA adapter loaded.
-func loraAffinityPredicate(b *LLMRequest, pod *backend.PodMetrics) bool {
-	_, ok := pod.CachedModels[b.ResolvedTargetModel]
-	return ok
+func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+	return req.Critical
+}
+
+func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate {
+	return func(req *LLMRequest, pod *backend.PodMetrics) bool {
+		return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold
+	}
 }
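The scheduler that wires these predicates into a filter chain is among the changed files not shown in this excerpt. As a rough, self-contained sketch of the intent stated in the commit message: critical requests always proceed to the least-queuing/least-KV-cache filters, while sheddable requests are admitted only when some pod clears the queue and KV-cache thresholds, and are otherwise rejected (surfaced upstream as ResourceExhausted / HTTP 429). The types, thresholds, and admit helper below are illustrative stand-ins, not the commit's code.

package main

import (
	"errors"
	"fmt"
)

// Trimmed stand-ins for scheduling.LLMRequest and backend.PodMetrics.
type LLMRequest struct {
	ResolvedTargetModel string
	Critical            bool
}

type PodMetrics struct {
	Name                string
	WaitingQueueSize    int
	KVCacheUsagePercent float64
}

var errNoCapacity = errors.New("no capacity for sheddable request")

// admit decides whether a request may be scheduled at all, and which pods
// remain candidates for the downstream least-queuing/least-KV-cache filters.
func admit(req *LLMRequest, pods []*PodMetrics, queueThreshold int, kvCacheThreshold float64) ([]*PodMetrics, error) {
	if req.Critical {
		// Critical requests are never dropped; downstream filters pick the pod.
		return pods, nil
	}
	// Sheddable requests: keep only pods with an empty-enough queue and KV cache headroom.
	ok := []*PodMetrics{}
	for _, pod := range pods {
		if pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold {
			ok = append(ok, pod)
		}
	}
	if len(ok) == 0 {
		// Mapped to codes.ResourceExhausted (and then HTTP 429) in the real server.
		return nil, errNoCapacity
	}
	return ok, nil
}

func main() {
	pods := []*PodMetrics{
		{Name: "pod-a", WaitingQueueSize: 3, KVCacheUsagePercent: 0.9},
		{Name: "pod-b", WaitingQueueSize: 0, KVCacheUsagePercent: 0.4},
	}
	sheddable := &LLMRequest{ResolvedTargetModel: "lora-a", Critical: false}
	candidates, err := admit(sheddable, pods, 0, 0.8)
	fmt.Println(len(candidates), err) // 1 <nil> (only pod-b has headroom)
}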
