From ad15e84b82ea28bf19b7554ee4314bd71ddf4edc Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Thu, 27 Feb 2025 21:32:21 +0000 Subject: [PATCH 01/12] scheduling changes for lora affinity load balancing --- config/manifests/vllm/deployment.yaml | 12 +++++- pkg/epp/backend/vllm/metrics.go | 37 ++++++++++++++++- pkg/epp/scheduling/filter.go | 59 +++++++++++++++++++++++++-- pkg/epp/scheduling/scheduler.go | 20 ++++----- 4 files changed, 110 insertions(+), 18 deletions(-) diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml index 51689c9f..e6667809 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: vllm-llama2-7b-pool spec: - replicas: 3 + replicas: 6 selector: matchLabels: app: vllm-llama2-7b-pool @@ -24,15 +24,23 @@ spec: - "1" - "--port" - "8000" + - "--compilation-config" + - "3" + - "--max-num-seqs" + - "2048" - "--enable-lora" - "--max-loras" - "4" - "--max-cpu-loras" - - "12" + - "15" + - "--max-lora-rank" + - "16" - "--lora-modules" - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' env: + - name: VLLM_USE_V1 + value: "1" - name: PORT value: "8000" - name: HUGGING_FACE_HUB_TOKEN diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 8648e24c..854ffbec 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -37,6 +37,7 @@ import ( const ( LoraRequestInfoMetricName = "vllm:lora_requests_info" LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraRequestInfoMaxAdaptersMetricName = "max_lora" // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. RunningQueueSizeMetricName = "vllm:num_requests_running" @@ -136,6 +137,14 @@ func promToPodMetrics( } } } + if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } + } + } if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { if label.GetValue() != "" { updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) @@ -161,14 +170,40 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } - var latestTs float64 + var latest *dto.Metric + var latestTs float64 + + // Iterate over all metrics in the family. for _, m := range loraRequests.GetMetric() { + var running, waiting string + // Read the label values for running and waiting adapters. + for _, lp := range m.GetLabel() { + switch lp.GetName() { + case LoraRequestInfoRunningAdaptersMetricName: + running = lp.GetValue() + case LoraRequestInfoWaitingAdaptersMetricName: + waiting = lp.GetValue() + } + } + + // Ignore metrics with both labels empty. + if running == "" && waiting == "" { + // continue + } + + // Select the metric with the latest creation timestamp. 
if m.GetGauge().GetValue() > latestTs { latestTs = m.GetGauge().GetValue() latest = m } } + + if latest == nil { + return nil, time.Time{}, fmt.Errorf("no valid metric found") + } + + // Convert the gauge value (creation timestamp) to time.Time. return latest, time.Unix(0, int64(latestTs*1000)), nil } diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index b7881468..21622bae 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -19,6 +19,8 @@ package scheduling import ( "errors" "math" + "math/rand" + "time" "github.com/go-logr/logr" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" @@ -183,10 +185,59 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { return ok || len(pod.ActiveModels) < pod.MaxActiveModels } -// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested. -func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { - _, ok := pod.ActiveModels[req.ResolvedTargetModel] - return ok +// loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods +// with existing LoRA model affinity while allowing for load balancing through randomization. +// +// The function works by: +// 1. Separating pods into two groups: those with target model affinity and those with available capacity +// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing +// 3. Falling back to whatever group has pods if one group is empty +// +// Parameters: +// - logger: Logger interface for diagnostic output +// - req: LLM request containing the resolved target model +// - pods: Slice of pod metrics to filter +// +// Returns: +// - Filtered slice of pod metrics based on affinity and availability +// - Error if any issues occur during filtering +func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { + + // Pre-allocate slices with estimated capacity + filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods)) + filtered_available := make([]*datastore.PodMetrics, 0, len(pods)) + + // Categorize pods based on affinity and availability + for _, pod := range pods { + if pod == nil { + continue + } + + if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists { + filtered_affinity = append(filtered_affinity, pod) + } else if len(pod.ActiveModels) < pod.MaxActiveModels { + filtered_available = append(filtered_available, pod) + } + } + + // Use crypto/rand for better randomization in production environments + randSource := rand.NewSource(time.Now().UnixNano()) + randGen := rand.New(randSource) + + // If both groups have pods, use probability to select which group to return + if len(filtered_affinity) > 0 && len(filtered_available) > 0 { + if randGen.Float64() < loraAffinityThreshold { + return filtered_affinity, nil + } + return filtered_available, nil + } + + // Return whichever group has pods + if len(filtered_affinity) > 0 { + return filtered_affinity, nil + } + + return filtered_available, nil } // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
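The filter.go hunk above replaces the hard loRAAffinityPredicate with a probabilistic soft-affinity filter: pods that already have the requested adapter loaded are chosen with probability loraAffinityThreshold, and the leftover probability goes to pods that still have free adapter slots, so new replicas keep getting warmed up. What follows is a minimal standalone Go sketch of that selection step; podInfo and softAffinitySelect are simplified stand-ins for datastore.PodMetrics and the patched filter, not the production code.

package main

import (
	"fmt"
	"math/rand"
)

// podInfo is a simplified stand-in for the few datastore.PodMetrics fields
// the soft-affinity decision needs.
type podInfo struct {
	name            string
	activeModels    map[string]int
	maxActiveModels int
}

// loraAffinityThreshold mirrors the constant introduced in scheduler.go: the
// probability of preferring pods that already have the adapter loaded.
const loraAffinityThreshold = 0.999

// softAffinitySelect splits pods into "already has the adapter" and "has room
// to load it", then returns the affinity group with probability
// loraAffinityThreshold so a small share of traffic still spreads the adapter
// onto fresh pods.
func softAffinitySelect(rng *rand.Rand, target string, pods []podInfo) []podInfo {
	var affinity, available []podInfo
	for _, p := range pods {
		if _, ok := p.activeModels[target]; ok {
			affinity = append(affinity, p)
		} else if len(p.activeModels) < p.maxActiveModels {
			available = append(available, p)
		}
	}
	if len(affinity) > 0 && len(available) > 0 {
		if rng.Float64() < loraAffinityThreshold {
			return affinity
		}
		return available
	}
	if len(affinity) > 0 {
		return affinity
	}
	return available
}

func main() {
	// Fixed seed keeps the sketch reproducible; the patch seeds from
	// time.Now().UnixNano() on each call instead.
	rng := rand.New(rand.NewSource(42))
	pods := []podInfo{
		{name: "pod-a", activeModels: map[string]int{"tweet-summary-0": 1}, maxActiveModels: 4},
		{name: "pod-b", activeModels: map[string]int{}, maxActiveModels: 4},
	}
	counts := map[string]int{}
	for i := 0; i < 10000; i++ {
		counts[softAffinitySelect(rng, "tweet-summary-0", pods)[0].name]++
	}
	// With a 0.999 threshold, pod-a should be chosen roughly 99.9% of the time.
	fmt.Println(counts)
}

This split is also what the TestLoRASoftAffinityDistribution test added later in the series asserts against the real filter, within a tolerance band.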
diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index a969948e..70429854 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -36,8 +36,11 @@ const ( queueThresholdCritical = 5 // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. // the threshold for queued requests to be considered low below which we can prioritize LoRA affinity. - // The value of 50 is arrived heuristicically based on experiments. - queueingThresholdLoRA = 50 + // The value of 128 is arrived heuristicically based on experiments. + queueingThresholdLoRA = 128 + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. + // loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod without but having room to fit more LoRA adapters. + loraAffinityThreshold = 0.999 ) var ( @@ -54,7 +57,7 @@ var ( filter: leastQueuingFilterFunc, nextOnSuccessOrFailure: &filter{ name: "low cost LoRA", - filter: toFilterFunc(lowLoRACostPredicate), + filter: loRASoftAffinityPredicate, nextOnSuccessOrFailure: &filter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, @@ -76,14 +79,9 @@ var ( name: "low queueing filter", filter: toFilterFunc((lowQueueingPodPredicate)), nextOnSuccess: &filter{ - name: "affinity LoRA", - filter: toFilterFunc(loRAAffinityPredicate), - nextOnSuccess: queueAndKVCacheFilter, - nextOnFailure: &filter{ - name: "can accept LoRA Adapter", - filter: toFilterFunc(canAcceptNewLoraPredicate), - nextOnSuccessOrFailure: queueAndKVCacheFilter, - }, + name: "affinity LoRA", + filter: loRASoftAffinityPredicate, + nextOnSuccessOrFailure: queueAndKVCacheFilter, }, nextOnFailure: queueLoRAAndKVCacheFilter, } From b9f57c5dddfd927c780e80f7ea06fe3771279c17 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:28:16 +0000 Subject: [PATCH 02/12] refactor unit tests, address comments --- pkg/epp/backend/vllm/metrics.go | 11 +++++++---- pkg/epp/scheduling/filter.go | 5 +---- pkg/epp/scheduling/scheduler.go | 4 ++-- test/integration/hermetic_test.go | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 854ffbec..a93ec0b1 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -34,6 +34,9 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +// Metric names used in the vLLM metrics implementation. +// Refer to the protocol doc for more details: +// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol const ( LoraRequestInfoMetricName = "vllm:lora_requests_info" LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" @@ -46,8 +49,7 @@ const ( RunningQueueSizeMetricName = "vllm:num_tokens_running" WaitingQueueSizeMetricName = "vllm:num_tokens_waiting" */ - KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" - KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" + KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" ) type PodMetricsClientImpl struct{} @@ -189,7 +191,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr // Ignore metrics with both labels empty. if running == "" && waiting == "" { - // continue + continue } // Select the metric with the latest creation timestamp. 
@@ -200,7 +202,8 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr } if latest == nil { - return nil, time.Time{}, fmt.Errorf("no valid metric found") + logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName) + return nil, time.Time{}, nil } // Convert the gauge value (creation timestamp) to time.Time. diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index 21622bae..a911cbd6 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -201,7 +201,7 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { // Returns: // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering -func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { +func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { // Pre-allocate slices with estimated capacity filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods)) @@ -209,9 +209,6 @@ func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*data // Categorize pods based on affinity and availability for _, pod := range pods { - if pod == nil { - continue - } if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists { filtered_affinity = append(filtered_affinity, pod) diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 70429854..bdddd972 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -57,7 +57,7 @@ var ( filter: leastQueuingFilterFunc, nextOnSuccessOrFailure: &filter{ name: "low cost LoRA", - filter: loRASoftAffinityPredicate, + filter: loRASoftAffinityFilter, nextOnSuccessOrFailure: &filter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, @@ -80,7 +80,7 @@ var ( filter: toFilterFunc((lowQueueingPodPredicate)), nextOnSuccess: &filter{ name: "affinity LoRA", - filter: loRASoftAffinityPredicate, + filter: loRASoftAffinityFilter, nextOnSuccessOrFailure: queueAndKVCacheFilter, }, nextOnFailure: queueLoRAAndKVCacheFilter, diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 2ea66dba..1ab96754 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -179,7 +179,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }), extprocutils.FakePodMetrics(1, datastore.Metrics{ - WaitingQueueSize: 50, + WaitingQueueSize: 200, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ "foo": 1, From 9e94fd987c2be5157dd5ef8562bb6872988c45a9 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:33:39 +0000 Subject: [PATCH 03/12] restore vllm deployment manifest --- config/manifests/vllm/deployment.yaml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml index e6667809..46979ec3 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: vllm-llama2-7b-pool spec: - replicas: 6 + replicas: 3 selector: matchLabels: app: vllm-llama2-7b-pool @@ -24,23 +24,15 @@ spec: - "1" - "--port" - "8000" - - "--compilation-config" - - "3" - - "--max-num-seqs" - - "2048" - "--enable-lora" - "--max-loras" - "4" - "--max-cpu-loras" - - "15" - - "--max-lora-rank" - 
- "16" + - "12" - "--lora-modules" - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' env: - - name: VLLM_USE_V1 - value: "1" - name: PORT value: "8000" - name: HUGGING_FACE_HUB_TOKEN @@ -128,4 +120,4 @@ data: - base-model: meta-llama/Llama-2-7b-hf id: tweet-summary-1 source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - + \ No newline at end of file From 2b934d03e1428ea3412ebd2fcc7963a0a785c118 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:40:52 +0000 Subject: [PATCH 04/12] update README for model server protocol to add waiting lora adapters --- docs/proposals/003-model-server-protocol/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md index 44ecf4e1..ab6d5819 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -47,3 +47,5 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` + * `waiting_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU + memory and ready to serve requests. Example: `"waiting_lora_adapters": "adapter1, adapter2"` From 2d3a3bb16d9dc9f0843130a3e7aa03d7c19a25c6 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:57:25 +0000 Subject: [PATCH 05/12] remove unused variables --- pkg/epp/scheduling/filter.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index a911cbd6..e1a118b9 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -238,11 +238,11 @@ func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datasto } // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. -func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { +func canAcceptNewLoraPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { return len(pod.ActiveModels) < pod.MaxActiveModels } -func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { +func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool { return req.Critical } From 323e141534b6c451ef473065abb4ac53641804b5 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 02:20:15 +0000 Subject: [PATCH 06/12] removed unused func --- pkg/epp/scheduling/filter.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index e1a118b9..d3c22673 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -237,11 +237,6 @@ func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datasto return filtered_available, nil } -// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
-func canAcceptNewLoraPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { - return len(pod.ActiveModels) < pod.MaxActiveModels -} - func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool { return req.Critical } From 41ec5b84f7c1ecefe5ef334684b079e76dbc20a5 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 02:32:07 +0000 Subject: [PATCH 07/12] fix model protocol readme --- docs/proposals/003-model-server-protocol/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md index ab6d5819..c0319461 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -47,5 +47,4 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` - * `waiting_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU - memory and ready to serve requests. Example: `"waiting_lora_adapters": "adapter1, adapter2"` + * `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. Example: `"waiting_lora_adapters": "adapter1, adapter2"` From 299161749ef7b62e881cf448262a95040e7bbc7f Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Mon, 3 Mar 2025 21:21:09 +0000 Subject: [PATCH 08/12] fix hermetic test for select active lora, low queue --- test/integration/hermetic_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 1ab96754..81098353 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -142,6 +142,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, + "bar": 1, }, }), }, From be3ce8beeb4d52c78c223bd6b222c2ebb3cbeef3 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Mon, 3 Mar 2025 21:26:14 +0000 Subject: [PATCH 09/12] update comment in metrics.go in vllm backend --- pkg/epp/backend/vllm/metrics.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index a93ec0b1..5dddaf86 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -189,7 +189,8 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr } } - // Ignore metrics with both labels empty. + // Ignore metrics with both labels empty. This happens when there are no running or waiting requests on + // the server, in this case it is best to use the last set of active adapters. 
if running == "" && waiting == "" { continue } From 71b95e61deb8608b6cbe6e9d372c17031c6ea58d Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Mon, 3 Mar 2025 21:59:43 +0000 Subject: [PATCH 10/12] add filter test TestLoRASoftAffinityDistribution --- pkg/epp/scheduling/filter_test.go | 84 +++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index ac765b78..33743418 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -429,3 +429,87 @@ func TestFilterFunc(t *testing.T) { }) } } + +// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function +// properly distributes requests according to the loraAffinityThreshold +func TestLoRASoftAffinityDistribution(t *testing.T) { + logger := logutil.NewTestLogger() + + const ( + testModelName = "test-model" + testAffinityModel = "test-affinity-model" + numIterations = 10000 + tolerancePercent = 5.0 // Allow 5% tolerance from expected distribution + ) + + // Create a test request and pods + req := &LLMRequest{ + Model: testAffinityModel, + ResolvedTargetModel: testAffinityModel, + } + + // Test setup: One affinity pod and one available pod + pods := []*datastore.PodMetrics{ + { + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, + Metrics: datastore.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{ + testAffinityModel: 1, + }, + }, + }, + { + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, + Metrics: datastore.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{}, + }, + }, + } + + // Run the filter function multiple times and count the results + affinityCount := 0 + availableCount := 0 + + // Use the actual loraAffinityThreshold as defined in the original code + // This test should work with whatever value is set there + expectedAffinityPercent := loraAffinityThreshold * 100 + + for i := 0; i < numIterations; i++ { + result, err := loRASoftAffinityFilter(logger, req, pods) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + // Check which type of pod was returned + if len(result) != 1 { + t.Fatalf("Expected exactly one pod in result, got %d", len(result)) + } + + // Identify if the returned pod is the affinity pod or available pod + if _, exists := result[0].ActiveModels[testAffinityModel]; exists { + affinityCount++ + } else { + availableCount++ + } + } + + // Calculate the actual percentages + actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100 + actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100 + + // Check if the distribution matches expected threshold within tolerance + affinityLowerBound := expectedAffinityPercent - tolerancePercent + affinityUpperBound := expectedAffinityPercent + tolerancePercent + + t.Logf("Distribution results over %d iterations:", numIterations) + t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold) + t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) + t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations) + + if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound { + t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAffinityPercent, affinityLowerBound, 
affinityUpperBound) + } +} From d6093cec937bfff29e6b64de2fa0bcf7d6bc05f8 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Tue, 4 Mar 2025 02:38:08 +0000 Subject: [PATCH 11/12] restore vllm manifest --- config/manifests/vllm/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml index 46979ec3..51689c9f 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/deployment.yaml @@ -120,4 +120,4 @@ data: - base-model: meta-llama/Llama-2-7b-hf id: tweet-summary-1 source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - \ No newline at end of file + From b5d7f8f5f6488bdbad94215e6b455a33ba3d0cf2 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Tue, 4 Mar 2025 18:02:21 +0000 Subject: [PATCH 12/12] update unit test --- pkg/epp/scheduling/filter_test.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index 33743418..f76cece9 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -475,7 +475,6 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { // Use the actual loraAffinityThreshold as defined in the original code // This test should work with whatever value is set there expectedAffinityPercent := loraAffinityThreshold * 100 - for i := 0; i < numIterations; i++ { result, err := loRASoftAffinityFilter(logger, req, pods) if err != nil { @@ -503,6 +502,9 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { affinityLowerBound := expectedAffinityPercent - tolerancePercent affinityUpperBound := expectedAffinityPercent + tolerancePercent + availableLowerBound := actualAvailablePercent - tolerancePercent + availableUpperBound := actualAvailablePercent + tolerancePercent + t.Logf("Distribution results over %d iterations:", numIterations) t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold) t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) @@ -512,4 +514,8 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%", actualAffinityPercent, affinityLowerBound, affinityUpperBound) } + if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound { + t.Errorf("Availability selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAvailablePercent, availableLowerBound, availableUpperBound) + } }
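Taken together, the metrics changes in this series fold waiting adapters into the same ActiveModels set as running ones, so the scheduler's affinity check also sees adapters that are queued on a pod but not yet serving. Below is an illustrative, self-contained sketch of that label handling; it assumes only the label names defined in the protocol doc (running_lora_adapters, waiting_lora_adapters, max_lora), and the type and function names are hypothetical rather than the actual metrics.go code.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// adapterState is a simplified stand-in for the datastore.Metrics fields that
// the vllm:lora_requests_info labels populate.
type adapterState struct {
	activeModels    map[string]int
	maxActiveModels int
}

// parseLoraInfoLabels folds both the running and waiting adapter lists into
// one map and reads the adapter capacity, mirroring the behavior the patch
// adds. (The patch splits on "," without trimming; TrimSpace here is only a
// sketch convenience.)
func parseLoraInfoLabels(labels map[string]string) (adapterState, error) {
	state := adapterState{activeModels: map[string]int{}}
	for _, key := range []string{"running_lora_adapters", "waiting_lora_adapters"} {
		if v := labels[key]; v != "" {
			for _, adapter := range strings.Split(v, ",") {
				state.activeModels[strings.TrimSpace(adapter)] = 0
			}
		}
	}
	if v := labels["max_lora"]; v != "" {
		capacity, err := strconv.Atoi(v)
		if err != nil {
			return state, fmt.Errorf("parsing max_lora: %w", err)
		}
		state.maxActiveModels = capacity
	}
	return state, nil
}

func main() {
	labels := map[string]string{
		"running_lora_adapters": "tweet-summary-0",
		"waiting_lora_adapters": "tweet-summary-1",
		"max_lora":              "4",
	}
	state, err := parseLoraInfoLabels(labels)
	if err != nil {
		panic(err)
	}
	fmt.Printf("tracking %d adapters, capacity %d\n", len(state.activeModels), state.maxActiveModels)
}

When both adapter labels are empty (an idle server), the patched getLatestLoraMetric skips that sample, so the last non-empty set of adapters keeps being used; an idle report says nothing about which adapters are still resident.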