
Commit cac37ae

multi lora changes for better load balancing
1 parent 2612ead commit cac37ae

4 files changed: +82 -66 lines changed


config/manifests/vllm/deployment.yaml (+1 -16)
@@ -3,7 +3,7 @@ kind: Deployment
 metadata:
   name: vllm-llama2-7b-pool
 spec:
-  replicas: 2
+  replicas: 6
   selector:
     matchLabels:
       app: vllm-llama2-7b-pool
@@ -38,19 +38,6 @@ spec:
         - "--lora-modules"
         - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
         - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-2", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-3", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-4", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-5", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-6", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-7", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-8", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-9", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-10", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-11", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-12", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-13", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-        - '{"name": "tweet-summary-14", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
         env:
         - name: VLLM_USE_V1
           value: "1"
@@ -63,8 +50,6 @@ spec:
               key: token
         - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
           value: "true"
-        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
-          value: "true"
         ports:
         - containerPort: 8000
           name: http
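With thirteen of the static --lora-modules entries removed and VLLM_ALLOW_RUNTIME_LORA_UPDATING still set, additional adapters would presumably be registered at runtime rather than baked into the manifest. The following is a minimal sketch of that flow, assuming the pod exposes vLLM's dynamic /v1/load_lora_adapter endpoint on port 8000; the adapter name and the localhost address (e.g. via a port-forward) are illustrative only.

// load_adapter.go — sketch of registering a LoRA adapter at runtime,
// assuming vLLM's dynamic LoRA endpoint is reachable on localhost:8000.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]string{
		"lora_name": "tweet-summary-2", // hypothetical adapter name
		"lora_path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm",
	})
	resp, err := http.Post("http://localhost:8000/v1/load_lora_adapter",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}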

pkg/epp/backend/vllm/metrics.go (+1 -1)
@@ -168,7 +168,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
 	loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
 	if !ok {
 		logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
-		return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
+		return nil, time.Time{}, nil
 	}
 
 	var latest *dto.Metric
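The one-line change relaxes the contract of getLatestLoraMetric: a missing LoRA metric family now yields (nil, zero time, nil) instead of an error, so a backend that has not yet reported any adapters is no longer treated as a failed scrape. Below is a self-contained stand-in for that contract using the same prometheus client_model types; the family name is assumed for illustration, not taken from the file.

package main

import (
	"fmt"
	"time"

	dto "github.com/prometheus/client_model/go"
)

// latestFromFamily mimics the new behavior: an absent family is not an error,
// it simply means there is no LoRA info to report yet.
func latestFromFamily(families map[string]*dto.MetricFamily, name string) (*dto.Metric, time.Time, error) {
	mf, ok := families[name]
	if !ok || len(mf.GetMetric()) == 0 {
		return nil, time.Time{}, nil
	}
	m := mf.GetMetric()[0]
	return m, time.UnixMilli(m.GetTimestampMs()), nil
}

func main() {
	// Empty scrape: the caller gets three zero values and simply skips the LoRA update.
	m, ts, err := latestFromFamily(map[string]*dto.MetricFamily{}, "vllm:lora_requests_info")
	fmt.Println(m, ts.IsZero(), err) // <nil> true <nil>
}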

pkg/epp/scheduling/filter.go (+64 -34)
@@ -19,22 +19,19 @@ package scheduling
 import (
 	"errors"
 	"math"
+	"math/rand"
+	"time"
 
 	"github.com/go-logr/logr"
-	klog "k8s.io/klog/v2"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
 type Filter interface {
 	Name() string
 	Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error)
 }
 
-const (
-	maxLoRACost = 4
-)
-
 // filter applies current filterFunc, and then recursively applies next filters depending success or
 // failure of the current filterFunc.
 // It can be used to construct a flow chart algorithm.
@@ -62,8 +59,9 @@ func (f *filter) Name() string {
 	return f.name
 }
 
-func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-	klog.InfoS("Running a filter", "name", f.Name(), "request", req, "podCount", len(pods))
+func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) {
+	loggerTrace := logger.V(logutil.TRACE)
+	loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods))
 
 	filtered, err := f.filter(logger, req, pods)
 
@@ -76,8 +74,7 @@ func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend
 		if f.nextOnSuccess != nil {
 			next = f.nextOnSuccess
 		}
-		klog.InfoS("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered))
-
+		loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered))
 		// On success, pass the filtered result to the next filter.
 		return next.Filter(logger, req, filtered)
 	} else {
@@ -88,7 +85,7 @@ func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend
 		if f.nextOnFailure != nil {
 			next = f.nextOnFailure
 		}
-		klog.InfoS("Filter failed", "filter", f.Name(), "next", next.Name())
+		loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name())
 		// On failure, pass the initial set of pods to the next filter.
 		return next.Filter(logger, req, pods)
 	}
@@ -175,22 +172,6 @@ func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*datasto
 	return filtered, nil
 }
 
-func minLoRACostFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-
-	var min int = math.MaxInt
-	filtered := []*backend.PodMetrics{}
-
-	for _, pod := range pods {
-		if len(pod.ActiveModels) < min {
-			min = len(pod.ActiveModels)
-			filtered = []*backend.PodMetrics{pod}
-		} else if len(pod.ActiveModels) == min {
-			filtered = append(filtered, pod)
-		}
-	}
-	return filtered, nil
-}
-
 // podPredicate is a filter function to check whether a pod is desired.
 type podPredicate func(req *LLMRequest, pod *datastore.PodMetrics) bool
 
@@ -201,18 +182,67 @@ type podPredicate func(req *LLMRequest, pod *datastore.PodMetrics) bool
 // where # of lora slots > # of lora adapters.
 func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
 	_, ok := pod.ActiveModels[req.ResolvedTargetModel]
-	return ok || len(pod.ActiveModels) < maxLoRACost
+	return ok || len(pod.ActiveModels) < pod.MaxActiveModels
 }
 
-// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested.
-func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
-	_, ok := pod.ActiveModels[req.ResolvedTargetModel]
-	return ok
+// loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods
+// with existing LoRA model affinity while allowing for load balancing through randomization.
+//
+// The function works by:
+// 1. Separating pods into two groups: those with target model affinity and those with available capacity
+// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing
+// 3. Falling back to whatever group has pods if one group is empty
+//
+// Parameters:
+//   - logger: Logger interface for diagnostic output
+//   - req: LLM request containing the resolved target model
+//   - pods: Slice of pod metrics to filter
+//
+// Returns:
+//   - Filtered slice of pod metrics based on affinity and availability
+//   - Error if any issues occur during filtering
+func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) {
+
+	// Pre-allocate slices with estimated capacity
+	filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods))
+	filtered_available := make([]*datastore.PodMetrics, 0, len(pods))
+
+	// Categorize pods based on affinity and availability
+	for _, pod := range pods {
+		if pod == nil {
+			continue
+		}
+
+		if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists {
+			filtered_affinity = append(filtered_affinity, pod)
+		} else if len(pod.ActiveModels) < pod.MaxActiveModels {
+			filtered_available = append(filtered_available, pod)
+		}
+	}
+
+	// Use crypto/rand for better randomization in production environments
+	randSource := rand.NewSource(time.Now().UnixNano())
+	randGen := rand.New(randSource)
+
+	// If both groups have pods, use probability to select which group to return
+	if len(filtered_affinity) > 0 && len(filtered_available) > 0 {
+		if randGen.Float64() < loraAffinityThreshold {
+			return filtered_affinity, nil
+		}
+		return filtered_available, nil
+	}
+
+	// Return whichever group has pods
+	if len(filtered_affinity) > 0 {
+		return filtered_affinity, nil
+	}
+
+	return filtered_available, nil
 }
 
 // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter.
-func canAcceptNewLoraPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
-	return len(pod.ActiveModels) < maxLoRACost
+func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
+	return len(pod.ActiveModels) < pod.MaxActiveModels
 }
 
 func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
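To see what the new predicate buys over the removed minLoRACostFilterFunc, here is a self-contained sketch of the two-group split with the 0.999 threshold, using simplified stand-in types rather than the EPP's datastore.PodMetrics: pods already serving the requested adapter win almost every time, while pods with free adapter slots still receive a trickle of traffic, which is what lets freshly scaled replicas pick up adapters.

// softaffinity_sketch.go — illustrative only, not the EPP code.
package main

import (
	"fmt"
	"math/rand"
)

type pod struct {
	name            string
	activeModels    map[string]struct{}
	maxActiveModels int
}

// pick mirrors the soft-affinity split: affinity group with probability
// threshold, otherwise the group that still has free LoRA slots.
func pick(pods []pod, target string, threshold float64, r *rand.Rand) []pod {
	var affinity, available []pod
	for _, p := range pods {
		if _, ok := p.activeModels[target]; ok {
			affinity = append(affinity, p)
		} else if len(p.activeModels) < p.maxActiveModels {
			available = append(available, p)
		}
	}
	if len(affinity) > 0 && len(available) > 0 {
		if r.Float64() < threshold {
			return affinity
		}
		return available
	}
	if len(affinity) > 0 {
		return affinity
	}
	return available
}

func main() {
	r := rand.New(rand.NewSource(42))
	pods := []pod{
		{"pod-a", map[string]struct{}{"tweet-summary-1": {}}, 4}, // already serves the adapter
		{"pod-b", map[string]struct{}{}, 4},                      // free slots only
	}
	hits := 0
	for i := 0; i < 100000; i++ {
		if pick(pods, "tweet-summary-1", 0.999, r)[0].name == "pod-a" {
			hits++
		}
	}
	fmt.Printf("affinity group chosen %.1f%% of the time\n", float64(hits)/1000) // ~99.9%
}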

pkg/epp/scheduling/scheduler.go (+16 -15)
@@ -18,11 +18,12 @@ limitations under the License.
 package scheduling
 
 import (
+	"context"
 	"fmt"
 	"math/rand"
 
 	"github.com/go-logr/logr"
-	klog "k8s.io/klog/v2"
+	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
@@ -35,8 +36,11 @@ const (
 	queueThresholdCritical = 5
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
 	// the threshold for queued requests to be considered low below which we can prioritize LoRA affinity.
-	// The value of 50 is arrived heuristicically based on experiments.
+	// The value of 128 is arrived heuristicically based on experiments.
 	queueingThresholdLoRA = 128
+	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
+	// loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod without but having room to fit more LoRA adapters.
+	loraAffinityThreshold = 0.999
 )
 
 var (
@@ -53,7 +57,7 @@ var (
 			filter: leastQueuingFilterFunc,
 			nextOnSuccessOrFailure: &filter{
 				name:   "low cost LoRA",
-				filter: minLoRACostFilterFunc,
+				filter: loRASoftAffinityPredicate,
 				nextOnSuccessOrFailure: &filter{
 					name:   "least KV cache percent",
 					filter: leastKVCacheFilterFunc,
@@ -75,14 +79,9 @@ var (
 		name:   "low queueing filter",
 		filter: toFilterFunc((lowQueueingPodPredicate)),
 		nextOnSuccess: &filter{
-			name:          "affinity LoRA",
-			filter:        toFilterFunc(loRAAffinityPredicate),
-			nextOnSuccess: queueAndKVCacheFilter,
-			nextOnFailure: &filter{
-				name:                   "can accept LoRA Adapter",
-				filter:                 minLoRACostFilterFunc,
-				nextOnSuccessOrFailure: queueAndKVCacheFilter,
-			},
+			name:                   "affinity LoRA",
+			filter:                 loRASoftAffinityPredicate,
+			nextOnSuccessOrFailure: queueAndKVCacheFilter,
 		},
 		nextOnFailure: queueLoRAAndKVCacheFilter,
 	}
@@ -121,14 +120,16 @@ type Scheduler struct {
 }
 
 // Schedule finds the target pod based on metrics and the requested lora adapter.
-func (s *Scheduler) Schedule(req *LLMRequest) (targetPod backend.Pod, err error) {
-	klog.InfoS("Scheduling a request", "request", req, "metrics", s.podMetricsProvider.AllPodMetrics())
-	pods, err := s.filter.Filter(req, s.podMetricsProvider.AllPodMetrics())
+func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod datastore.PodMetrics, err error) {
+	logger := log.FromContext(ctx).WithValues("request", req)
+	podMetrics := s.datastore.PodGetAll()
+	logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", podMetrics)
+	pods, err := s.filter.Filter(logger, req, podMetrics)
 	if err != nil || len(pods) == 0 {
 		return datastore.PodMetrics{}, fmt.Errorf(
 			"failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err)
 	}
-	klog.InfoS("Selecting a random pod from the candidates", "candidatePods", pods)
+	logger.V(logutil.VERBOSE).Info("Selecting a random pod from the candidates", "candidatePods", pods)
 	i := rand.Intn(len(pods))
 	return *pods[i], nil
 }
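The scheduler wiring is easier to follow with the decision-tree pattern spelled out. The sketch below reimplements the nextOnSuccess / nextOnFailure routing with throwaway types (podInfo and node are illustrative, not the package's filter struct): on success the filtered set flows to the next node, on failure the original set falls back, which is how the low-queueing branch now hands off directly to the soft-affinity LoRA filter.

// filterchain_sketch.go — illustrative only, not the EPP types.
package main

import (
	"errors"
	"fmt"
)

type podInfo struct {
	name      string
	queueSize int
}

type filterFunc func(pods []podInfo) ([]podInfo, error)

type node struct {
	name                   string
	filter                 filterFunc
	nextOnSuccess          *node
	nextOnFailure          *node
	nextOnSuccessOrFailure *node
}

func (n *node) run(pods []podInfo) ([]podInfo, error) {
	filtered, err := n.filter(pods)
	next := n.nextOnSuccessOrFailure
	if err == nil && len(filtered) > 0 {
		if n.nextOnSuccess != nil {
			next = n.nextOnSuccess
		}
		if next == nil {
			return filtered, nil
		}
		return next.run(filtered) // success: pass the filtered set on
	}
	if n.nextOnFailure != nil {
		next = n.nextOnFailure
	}
	if next == nil {
		return nil, errors.New("no pods survived the filter chain")
	}
	return next.run(pods) // failure: fall back with the original set
}

func main() {
	lowQueue := func(pods []podInfo) ([]podInfo, error) {
		var out []podInfo
		for _, p := range pods {
			if p.queueSize < 128 { // mirrors queueingThresholdLoRA
				out = append(out, p)
			}
		}
		return out, nil
	}
	all := func(pods []podInfo) ([]podInfo, error) { return pods, nil }

	chain := &node{
		name:          "low queueing filter",
		filter:        lowQueue,
		nextOnSuccess: &node{name: "affinity LoRA", filter: all},
		nextOnFailure: &node{name: "queue and KV cache", filter: all},
	}
	result, err := chain.run([]podInfo{{"pod-a", 3}, {"pod-b", 400}})
	fmt.Println(result, err) // [{pod-a 3}] <nil>
}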
