From ad15e84b82ea28bf19b7554ee4314bd71ddf4edc Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Thu, 27 Feb 2025 21:32:21 +0000 Subject: [PATCH 01/12] scheduling changes for lora affinity load balancing --- config/manifests/vllm/deployment.yaml | 12 +++++- pkg/epp/backend/vllm/metrics.go | 37 ++++++++++++++++- pkg/epp/scheduling/filter.go | 59 +++++++++++++++++++++++++-- pkg/epp/scheduling/scheduler.go | 20 ++++----- 4 files changed, 110 insertions(+), 18 deletions(-) diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml index 51689c9f..e6667809 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: vllm-llama2-7b-pool spec: - replicas: 3 + replicas: 6 selector: matchLabels: app: vllm-llama2-7b-pool @@ -24,15 +24,23 @@ spec: - "1" - "--port" - "8000" + - "--compilation-config" + - "3" + - "--max-num-seqs" + - "2048" - "--enable-lora" - "--max-loras" - "4" - "--max-cpu-loras" - - "12" + - "15" + - "--max-lora-rank" + - "16" - "--lora-modules" - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' env: + - name: VLLM_USE_V1 + value: "1" - name: PORT value: "8000" - name: HUGGING_FACE_HUB_TOKEN diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 8648e24c..854ffbec 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -37,6 +37,7 @@ import ( const ( LoraRequestInfoMetricName = "vllm:lora_requests_info" LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraRequestInfoMaxAdaptersMetricName = "max_lora" // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. RunningQueueSizeMetricName = "vllm:num_requests_running" @@ -136,6 +137,14 @@ func promToPodMetrics( } } } + if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } + } + } if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { if label.GetValue() != "" { updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) @@ -161,14 +170,40 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } - var latestTs float64 + var latest *dto.Metric + var latestTs float64 + + // Iterate over all metrics in the family. for _, m := range loraRequests.GetMetric() { + var running, waiting string + // Read the label values for running and waiting adapters. + for _, lp := range m.GetLabel() { + switch lp.GetName() { + case LoraRequestInfoRunningAdaptersMetricName: + running = lp.GetValue() + case LoraRequestInfoWaitingAdaptersMetricName: + waiting = lp.GetValue() + } + } + + // Ignore metrics with both labels empty. + if running == "" && waiting == "" { + // continue + } + + // Select the metric with the latest creation timestamp. 
if m.GetGauge().GetValue() > latestTs { latestTs = m.GetGauge().GetValue() latest = m } } + + if latest == nil { + return nil, time.Time{}, fmt.Errorf("no valid metric found") + } + + // Convert the gauge value (creation timestamp) to time.Time. return latest, time.Unix(0, int64(latestTs*1000)), nil } diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index b7881468..21622bae 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -19,6 +19,8 @@ package scheduling import ( "errors" "math" + "math/rand" + "time" "github.com/go-logr/logr" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" @@ -183,10 +185,59 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { return ok || len(pod.ActiveModels) < pod.MaxActiveModels } -// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested. -func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { - _, ok := pod.ActiveModels[req.ResolvedTargetModel] - return ok +// loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods +// with existing LoRA model affinity while allowing for load balancing through randomization. +// +// The function works by: +// 1. Separating pods into two groups: those with target model affinity and those with available capacity +// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing +// 3. Falling back to whatever group has pods if one group is empty +// +// Parameters: +// - logger: Logger interface for diagnostic output +// - req: LLM request containing the resolved target model +// - pods: Slice of pod metrics to filter +// +// Returns: +// - Filtered slice of pod metrics based on affinity and availability +// - Error if any issues occur during filtering +func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { + + // Pre-allocate slices with estimated capacity + filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods)) + filtered_available := make([]*datastore.PodMetrics, 0, len(pods)) + + // Categorize pods based on affinity and availability + for _, pod := range pods { + if pod == nil { + continue + } + + if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists { + filtered_affinity = append(filtered_affinity, pod) + } else if len(pod.ActiveModels) < pod.MaxActiveModels { + filtered_available = append(filtered_available, pod) + } + } + + // Use crypto/rand for better randomization in production environments + randSource := rand.NewSource(time.Now().UnixNano()) + randGen := rand.New(randSource) + + // If both groups have pods, use probability to select which group to return + if len(filtered_affinity) > 0 && len(filtered_available) > 0 { + if randGen.Float64() < loraAffinityThreshold { + return filtered_affinity, nil + } + return filtered_available, nil + } + + // Return whichever group has pods + if len(filtered_affinity) > 0 { + return filtered_affinity, nil + } + + return filtered_available, nil } // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
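The filter.go hunk above replaces the hard loRAAffinityPredicate with a probabilistic soft-affinity filter: pods that already have the requested adapter loaded are chosen with probability loraAffinityThreshold, and the leftover probability goes to pods that still have free adapter slots, so new replicas keep getting warmed up. What follows is a minimal standalone Go sketch of that selection step; podInfo and softAffinitySelect are simplified stand-ins for datastore.PodMetrics and the patched filter, not the production code.

package main

import (
	"fmt"
	"math/rand"
)

// podInfo is a simplified stand-in for the few datastore.PodMetrics fields
// the soft-affinity decision needs.
type podInfo struct {
	name            string
	activeModels    map[string]int
	maxActiveModels int
}

// loraAffinityThreshold mirrors the constant introduced in scheduler.go: the
// probability of preferring pods that already have the adapter loaded.
const loraAffinityThreshold = 0.999

// softAffinitySelect splits pods into "already has the adapter" and "has room
// to load it", then returns the affinity group with probability
// loraAffinityThreshold so a small share of traffic still spreads the adapter
// onto fresh pods.
func softAffinitySelect(rng *rand.Rand, target string, pods []podInfo) []podInfo {
	var affinity, available []podInfo
	for _, p := range pods {
		if _, ok := p.activeModels[target]; ok {
			affinity = append(affinity, p)
		} else if len(p.activeModels) < p.maxActiveModels {
			available = append(available, p)
		}
	}
	if len(affinity) > 0 && len(available) > 0 {
		if rng.Float64() < loraAffinityThreshold {
			return affinity
		}
		return available
	}
	if len(affinity) > 0 {
		return affinity
	}
	return available
}

func main() {
	// Fixed seed keeps the sketch reproducible; the patch seeds from
	// time.Now().UnixNano() on each call instead.
	rng := rand.New(rand.NewSource(42))
	pods := []podInfo{
		{name: "pod-a", activeModels: map[string]int{"tweet-summary-0": 1}, maxActiveModels: 4},
		{name: "pod-b", activeModels: map[string]int{}, maxActiveModels: 4},
	}
	counts := map[string]int{}
	for i := 0; i < 10000; i++ {
		counts[softAffinitySelect(rng, "tweet-summary-0", pods)[0].name]++
	}
	// With a 0.999 threshold, pod-a should be chosen roughly 99.9% of the time.
	fmt.Println(counts)
}

This split is also what the TestLoRASoftAffinityDistribution test added later in the series asserts against the real filter, within a tolerance band.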
diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index a969948e..70429854 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -36,8 +36,11 @@ const ( queueThresholdCritical = 5 // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. // the threshold for queued requests to be considered low below which we can prioritize LoRA affinity. - // The value of 50 is arrived heuristicically based on experiments. - queueingThresholdLoRA = 50 + // The value of 128 is arrived heuristicically based on experiments. + queueingThresholdLoRA = 128 + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. + // loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod without but having room to fit more LoRA adapters. + loraAffinityThreshold = 0.999 ) var ( @@ -54,7 +57,7 @@ var ( filter: leastQueuingFilterFunc, nextOnSuccessOrFailure: &filter{ name: "low cost LoRA", - filter: toFilterFunc(lowLoRACostPredicate), + filter: loRASoftAffinityPredicate, nextOnSuccessOrFailure: &filter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, @@ -76,14 +79,9 @@ var ( name: "low queueing filter", filter: toFilterFunc((lowQueueingPodPredicate)), nextOnSuccess: &filter{ - name: "affinity LoRA", - filter: toFilterFunc(loRAAffinityPredicate), - nextOnSuccess: queueAndKVCacheFilter, - nextOnFailure: &filter{ - name: "can accept LoRA Adapter", - filter: toFilterFunc(canAcceptNewLoraPredicate), - nextOnSuccessOrFailure: queueAndKVCacheFilter, - }, + name: "affinity LoRA", + filter: loRASoftAffinityPredicate, + nextOnSuccessOrFailure: queueAndKVCacheFilter, }, nextOnFailure: queueLoRAAndKVCacheFilter, } From b9f57c5dddfd927c780e80f7ea06fe3771279c17 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:28:16 +0000 Subject: [PATCH 02/12] refactor unit tests, address comments --- pkg/epp/backend/vllm/metrics.go | 11 +++++++---- pkg/epp/scheduling/filter.go | 5 +---- pkg/epp/scheduling/scheduler.go | 4 ++-- test/integration/hermetic_test.go | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 854ffbec..a93ec0b1 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -34,6 +34,9 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +// Metric names used in the vLLM metrics implementation. +// Refer to the protocol doc for more details: +// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol const ( LoraRequestInfoMetricName = "vllm:lora_requests_info" LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" @@ -46,8 +49,7 @@ const ( RunningQueueSizeMetricName = "vllm:num_tokens_running" WaitingQueueSizeMetricName = "vllm:num_tokens_waiting" */ - KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" - KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" + KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" ) type PodMetricsClientImpl struct{} @@ -189,7 +191,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr // Ignore metrics with both labels empty. if running == "" && waiting == "" { - // continue + continue } // Select the metric with the latest creation timestamp. 
@@ -200,7 +202,8 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr } if latest == nil { - return nil, time.Time{}, fmt.Errorf("no valid metric found") + logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName) + return nil, time.Time{}, nil } // Convert the gauge value (creation timestamp) to time.Time. diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index 21622bae..a911cbd6 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -201,7 +201,7 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { // Returns: // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering -func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { +func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { // Pre-allocate slices with estimated capacity filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods)) @@ -209,9 +209,6 @@ func loRASoftAffinityPredicate(logger logr.Logger, req *LLMRequest, pods []*data // Categorize pods based on affinity and availability for _, pod := range pods { - if pod == nil { - continue - } if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists { filtered_affinity = append(filtered_affinity, pod) diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 70429854..bdddd972 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -57,7 +57,7 @@ var ( filter: leastQueuingFilterFunc, nextOnSuccessOrFailure: &filter{ name: "low cost LoRA", - filter: loRASoftAffinityPredicate, + filter: loRASoftAffinityFilter, nextOnSuccessOrFailure: &filter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, @@ -80,7 +80,7 @@ var ( filter: toFilterFunc((lowQueueingPodPredicate)), nextOnSuccess: &filter{ name: "affinity LoRA", - filter: loRASoftAffinityPredicate, + filter: loRASoftAffinityFilter, nextOnSuccessOrFailure: queueAndKVCacheFilter, }, nextOnFailure: queueLoRAAndKVCacheFilter, diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 2ea66dba..1ab96754 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -179,7 +179,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }), extprocutils.FakePodMetrics(1, datastore.Metrics{ - WaitingQueueSize: 50, + WaitingQueueSize: 200, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ "foo": 1, From 9e94fd987c2be5157dd5ef8562bb6872988c45a9 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:33:39 +0000 Subject: [PATCH 03/12] restore vllm deployment manifest --- config/manifests/vllm/deployment.yaml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml index e6667809..46979ec3 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: vllm-llama2-7b-pool spec: - replicas: 6 + replicas: 3 selector: matchLabels: app: vllm-llama2-7b-pool @@ -24,23 +24,15 @@ spec: - "1" - "--port" - "8000" - - "--compilation-config" - - "3" - - "--max-num-seqs" - - "2048" - "--enable-lora" - "--max-loras" - "4" - "--max-cpu-loras" - - "15" - - "--max-lora-rank" - 
- "16" + - "12" - "--lora-modules" - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' env: - - name: VLLM_USE_V1 - value: "1" - name: PORT value: "8000" - name: HUGGING_FACE_HUB_TOKEN @@ -128,4 +120,4 @@ data: - base-model: meta-llama/Llama-2-7b-hf id: tweet-summary-1 source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - + \ No newline at end of file From 2b934d03e1428ea3412ebd2fcc7963a0a785c118 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:40:52 +0000 Subject: [PATCH 04/12] update README for model server protocol to add waiting lora adapters --- docs/proposals/003-model-server-protocol/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md index 44ecf4e1..ab6d5819 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -47,3 +47,5 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` + * `waiting_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU + memory and ready to serve requests. Example: `"waiting_lora_adapters": "adapter1, adapter2"` From 2d3a3bb16d9dc9f0843130a3e7aa03d7c19a25c6 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 00:57:25 +0000 Subject: [PATCH 05/12] remove unused variables --- pkg/epp/scheduling/filter.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index a911cbd6..e1a118b9 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -238,11 +238,11 @@ func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datasto } // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. -func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { +func canAcceptNewLoraPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { return len(pod.ActiveModels) < pod.MaxActiveModels } -func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { +func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool { return req.Critical } From 323e141534b6c451ef473065abb4ac53641804b5 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 02:20:15 +0000 Subject: [PATCH 06/12] removed unused func --- pkg/epp/scheduling/filter.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index e1a118b9..d3c22673 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -237,11 +237,6 @@ func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datasto return filtered_available, nil } -// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
-func canAcceptNewLoraPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { - return len(pod.ActiveModels) < pod.MaxActiveModels -} - func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool { return req.Critical } From 41ec5b84f7c1ecefe5ef334684b079e76dbc20a5 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Sat, 1 Mar 2025 02:32:07 +0000 Subject: [PATCH 07/12] fix model protocol readme --- docs/proposals/003-model-server-protocol/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md index ab6d5819..c0319461 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -47,5 +47,4 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` - * `waiting_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU - memory and ready to serve requests. Example: `"waiting_lora_adapters": "adapter1, adapter2"` + * `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. Example: `"waiting_lora_adapters": "adapter1, adapter2"` From 299161749ef7b62e881cf448262a95040e7bbc7f Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Mon, 3 Mar 2025 21:21:09 +0000 Subject: [PATCH 08/12] fix hermetic test for select active lora, low queue --- test/integration/hermetic_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 1ab96754..81098353 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -142,6 +142,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, + "bar": 1, }, }), }, From be3ce8beeb4d52c78c223bd6b222c2ebb3cbeef3 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Mon, 3 Mar 2025 21:26:14 +0000 Subject: [PATCH 09/12] update comment in metrics.go in vllm backend --- pkg/epp/backend/vllm/metrics.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index a93ec0b1..5dddaf86 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -189,7 +189,8 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr } } - // Ignore metrics with both labels empty. + // Ignore metrics with both labels empty. This happens when there are no running or waiting requests on + // the server, in this case it is best to use the last set of active adapters. 
if running == "" && waiting == "" { continue } From 71b95e61deb8608b6cbe6e9d372c17031c6ea58d Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Mon, 3 Mar 2025 21:59:43 +0000 Subject: [PATCH 10/12] add filter test TestLoRASoftAffinityDistribution --- pkg/epp/scheduling/filter_test.go | 84 +++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index ac765b78..33743418 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -429,3 +429,87 @@ func TestFilterFunc(t *testing.T) { }) } } + +// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function +// properly distributes requests according to the loraAffinityThreshold +func TestLoRASoftAffinityDistribution(t *testing.T) { + logger := logutil.NewTestLogger() + + const ( + testModelName = "test-model" + testAffinityModel = "test-affinity-model" + numIterations = 10000 + tolerancePercent = 5.0 // Allow 5% tolerance from expected distribution + ) + + // Create a test request and pods + req := &LLMRequest{ + Model: testAffinityModel, + ResolvedTargetModel: testAffinityModel, + } + + // Test setup: One affinity pod and one available pod + pods := []*datastore.PodMetrics{ + { + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, + Metrics: datastore.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{ + testAffinityModel: 1, + }, + }, + }, + { + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, + Metrics: datastore.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{}, + }, + }, + } + + // Run the filter function multiple times and count the results + affinityCount := 0 + availableCount := 0 + + // Use the actual loraAffinityThreshold as defined in the original code + // This test should work with whatever value is set there + expectedAffinityPercent := loraAffinityThreshold * 100 + + for i := 0; i < numIterations; i++ { + result, err := loRASoftAffinityFilter(logger, req, pods) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + // Check which type of pod was returned + if len(result) != 1 { + t.Fatalf("Expected exactly one pod in result, got %d", len(result)) + } + + // Identify if the returned pod is the affinity pod or available pod + if _, exists := result[0].ActiveModels[testAffinityModel]; exists { + affinityCount++ + } else { + availableCount++ + } + } + + // Calculate the actual percentages + actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100 + actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100 + + // Check if the distribution matches expected threshold within tolerance + affinityLowerBound := expectedAffinityPercent - tolerancePercent + affinityUpperBound := expectedAffinityPercent + tolerancePercent + + t.Logf("Distribution results over %d iterations:", numIterations) + t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold) + t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) + t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations) + + if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound { + t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAffinityPercent, affinityLowerBound, 
affinityUpperBound) + } +} From d6093cec937bfff29e6b64de2fa0bcf7d6bc05f8 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Tue, 4 Mar 2025 02:38:08 +0000 Subject: [PATCH 11/12] restore vllm manifest --- config/manifests/vllm/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml index 46979ec3..51689c9f 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/deployment.yaml @@ -120,4 +120,4 @@ data: - base-model: meta-llama/Llama-2-7b-hf id: tweet-summary-1 source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - \ No newline at end of file + From b5d7f8f5f6488bdbad94215e6b455a33ba3d0cf2 Mon Sep 17 00:00:00 2001 From: kaushikmitr Date: Tue, 4 Mar 2025 18:02:21 +0000 Subject: [PATCH 12/12] update unit test --- pkg/epp/scheduling/filter_test.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index 33743418..f76cece9 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -475,7 +475,6 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { // Use the actual loraAffinityThreshold as defined in the original code // This test should work with whatever value is set there expectedAffinityPercent := loraAffinityThreshold * 100 - for i := 0; i < numIterations; i++ { result, err := loRASoftAffinityFilter(logger, req, pods) if err != nil { @@ -503,6 +502,9 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { affinityLowerBound := expectedAffinityPercent - tolerancePercent affinityUpperBound := expectedAffinityPercent + tolerancePercent + availableLowerBound := actualAvailablePercent - tolerancePercent + availableUpperBound := actualAvailablePercent + tolerancePercent + t.Logf("Distribution results over %d iterations:", numIterations) t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold) t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) @@ -512,4 +514,8 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%", actualAffinityPercent, affinityLowerBound, affinityUpperBound) } + if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound { + t.Errorf("Availability selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAvailablePercent, availableLowerBound, availableUpperBound) + } }
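Taken together, the metrics changes in this series fold waiting adapters into the same ActiveModels set as running ones, so the scheduler's affinity check also sees adapters that are queued on a pod but not yet serving. Below is an illustrative, self-contained sketch of that label handling; it assumes only the label names defined in the protocol doc (running_lora_adapters, waiting_lora_adapters, max_lora), and the type and function names are hypothetical rather than the actual metrics.go code.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// adapterState is a simplified stand-in for the datastore.Metrics fields that
// the vllm:lora_requests_info labels populate.
type adapterState struct {
	activeModels    map[string]int
	maxActiveModels int
}

// parseLoraInfoLabels folds both the running and waiting adapter lists into
// one map and reads the adapter capacity, mirroring the behavior the patch
// adds. (The patch splits on "," without trimming; TrimSpace here is only a
// sketch convenience.)
func parseLoraInfoLabels(labels map[string]string) (adapterState, error) {
	state := adapterState{activeModels: map[string]int{}}
	for _, key := range []string{"running_lora_adapters", "waiting_lora_adapters"} {
		if v := labels[key]; v != "" {
			for _, adapter := range strings.Split(v, ",") {
				state.activeModels[strings.TrimSpace(adapter)] = 0
			}
		}
	}
	if v := labels["max_lora"]; v != "" {
		capacity, err := strconv.Atoi(v)
		if err != nil {
			return state, fmt.Errorf("parsing max_lora: %w", err)
		}
		state.maxActiveModels = capacity
	}
	return state, nil
}

func main() {
	labels := map[string]string{
		"running_lora_adapters": "tweet-summary-0",
		"waiting_lora_adapters": "tweet-summary-1",
		"max_lora":              "4",
	}
	state, err := parseLoraInfoLabels(labels)
	if err != nil {
		panic(err)
	}
	fmt.Printf("tracking %d adapters, capacity %d\n", len(state.activeModels), state.maxActiveModels)
}

When both adapter labels are empty (an idle server), the patched getLatestLoraMetric skips that sample, so the last non-empty set of adapters keeps being used; an idle report says nothing about which adapters are still resident.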