scheduling changes for lora affinity load balancing #423

Merged (13 commits) on Mar 4, 2025
1 change: 1 addition & 0 deletions docs/proposals/003-model-server-protocol/README.md
@@ -47,3 +47,4 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro
requested adapter. Example: `"max_lora": "8"`.
* `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU
memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"`
* `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. Example: `"waiting_lora_adapters": "adapter1, adapter2"`
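
For illustration, the sketch below (not part of this change; the adapter names and timestamp value are made up) parses a hypothetical scrape of `vllm:lora_requests_info` carrying these labels with the standard Prometheus text parser. The gauge value is the metric's creation timestamp, and the resulting `dto.MetricFamily` map is the form that the endpoint picker's parsing code in `pkg/epp/backend/vllm/metrics.go` (changed below) operates on.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// Hypothetical scrape output; adapter names and the timestamp are illustrative.
	scrape := `# TYPE vllm:lora_requests_info gauge
vllm:lora_requests_info{max_lora="8",running_lora_adapters="adapter1,adapter2",waiting_lora_adapters="adapter3"} 1.7410e+09
`
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(scrape))
	if err != nil {
		panic(err)
	}
	// Print each label and the gauge value (the creation timestamp).
	for _, m := range families["vllm:lora_requests_info"].GetMetric() {
		for _, lp := range m.GetLabel() {
			fmt.Printf("%s = %q\n", lp.GetName(), lp.GetValue())
		}
		fmt.Println("creation timestamp:", m.GetGauge().GetValue())
	}
}
```
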
45 changes: 42 additions & 3 deletions pkg/epp/backend/vllm/metrics.go
@@ -34,9 +34,13 @@ import (
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

// Metric names used in the vLLM metrics implementation.
// Refer to the protocol doc for more details:
// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol
const (
LoraRequestInfoMetricName = "vllm:lora_requests_info"
LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters"
LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters"
LoraRequestInfoMaxAdaptersMetricName = "max_lora"
// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
RunningQueueSizeMetricName = "vllm:num_requests_running"
@@ -45,8 +49,7 @@ const (
RunningQueueSizeMetricName = "vllm:num_tokens_running"
WaitingQueueSizeMetricName = "vllm:num_tokens_waiting"
*/
KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc"
KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity"
KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc"
)

type PodMetricsClientImpl struct{}
@@ -138,6 +141,14 @@ func promToPodMetrics(
}
}
}
if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName {
if label.GetValue() != "" {
adapterList := strings.Split(label.GetValue(), ",")
for _, adapter := range adapterList {
updated.ActiveModels[adapter] = 0
}
}
}
if label.GetName() == LoraRequestInfoMaxAdaptersMetricName {
if label.GetValue() != "" {
updated.MaxActiveModels, err = strconv.Atoi(label.GetValue())
@@ -163,14 +174,42 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
}
var latestTs float64

var latest *dto.Metric
var latestTs float64

// Iterate over all metrics in the family.
for _, m := range loraRequests.GetMetric() {
var running, waiting string
// Read the label values for running and waiting adapters.
for _, lp := range m.GetLabel() {
switch lp.GetName() {
case LoraRequestInfoRunningAdaptersMetricName:
running = lp.GetValue()
case LoraRequestInfoWaitingAdaptersMetricName:
waiting = lp.GetValue()
}
}

// Ignore metrics with both labels empty. This happens when there are no running or waiting requests on
// the server, in this case it is best to use the last set of active adapters.
if running == "" && waiting == "" {
continue
}

// Select the metric with the latest creation timestamp.
if m.GetGauge().GetValue() > latestTs {
latestTs = m.GetGauge().GetValue()
latest = m
}
}

if latest == nil {
logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName)
return nil, time.Time{}, nil
}

// Convert the gauge value (creation timestamp) to time.Time.
return latest, time.Unix(0, int64(latestTs*1000)), nil
}
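
A minimal sketch of a unit test for the selection logic above, assuming it sits in the same `vllm` package so it can reach the unexported `getLatestLoraMetric`; the helper, test name, and sample timestamps are illustrative only, while the constant names come from the diff above. It exercises the two behaviors introduced here: samples with both adapter labels empty are skipped, and among the rest the one with the largest creation timestamp wins.

```go
package vllm

import (
	"testing"

	dto "github.com/prometheus/client_model/go"
	"google.golang.org/protobuf/proto"

	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

// loraMetric builds one vllm:lora_requests_info sample with the given label
// values and creation timestamp (helper for this sketch only).
func loraMetric(running, waiting string, ts float64) *dto.Metric {
	return &dto.Metric{
		Label: []*dto.LabelPair{
			{Name: proto.String(LoraRequestInfoRunningAdaptersMetricName), Value: proto.String(running)},
			{Name: proto.String(LoraRequestInfoWaitingAdaptersMetricName), Value: proto.String(waiting)},
		},
		Gauge: &dto.Gauge{Value: proto.Float64(ts)},
	}
}

func TestGetLatestLoraMetricSketch(t *testing.T) {
	families := map[string]*dto.MetricFamily{
		LoraRequestInfoMetricName: {
			Metric: []*dto.Metric{
				loraMetric("adapter1", "", 100),         // older sample
				loraMetric("", "", 300),                 // both labels empty: skipped
				loraMetric("adapter2", "adapter3", 200), // latest non-empty sample
			},
		},
	}
	latest, _, err := getLatestLoraMetric(logutil.NewTestLogger(), families)
	if err != nil || latest == nil {
		t.Fatalf("unexpected result: latest=%v err=%v", latest, err)
	}
	if got := latest.GetGauge().GetValue(); got != 200 {
		t.Errorf("expected the non-empty sample with timestamp 200, got %v", got)
	}
}
```
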

61 changes: 52 additions & 9 deletions pkg/epp/scheduling/filter.go
@@ -19,6 +19,8 @@ package scheduling
import (
"errors"
"math"
"math/rand"
"time"

"github.com/go-logr/logr"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
@@ -183,18 +185,59 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
return ok || len(pod.ActiveModels) < pod.MaxActiveModels
}

// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested.
func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
_, ok := pod.ActiveModels[req.ResolvedTargetModel]
return ok
}
// loRASoftAffinityFilter implements a pod selection strategy that prioritizes pods
// with existing LoRA model affinity while allowing for load balancing through randomization.
//
// The function works by:
// 1. Separating pods into two groups: those with target model affinity and those with available capacity
// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing
// 3. Falling back to whatever group has pods if one group is empty
//
// Parameters:
// - logger: Logger interface for diagnostic output
// - req: LLM request containing the resolved target model
// - pods: Slice of pod metrics to filter
//
// Returns:
// - Filtered slice of pod metrics based on affinity and availability
// - Error if any issues occur during filtering
func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) {

// Pre-allocate slices with estimated capacity
filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods))
filtered_available := make([]*datastore.PodMetrics, 0, len(pods))

// Categorize pods based on affinity and availability
for _, pod := range pods {

if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists {
filtered_affinity = append(filtered_affinity, pod)
} else if len(pod.ActiveModels) < pod.MaxActiveModels {
filtered_available = append(filtered_available, pod)
}
}

// Create a local math/rand generator seeded with the current time for per-call randomization.
randSource := rand.NewSource(time.Now().UnixNano())
randGen := rand.New(randSource)

// If both groups have pods, use probability to select which group to return
if len(filtered_affinity) > 0 && len(filtered_available) > 0 {
if randGen.Float64() < loraAffinityThreshold {
return filtered_affinity, nil
}
return filtered_available, nil
}

// Return whichever group has pods
if len(filtered_affinity) > 0 {
return filtered_affinity, nil
}

// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter.
func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
return len(pod.ActiveModels) < pod.MaxActiveModels
return filtered_available, nil
}

func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool {
func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool {
return req.Critical
}

90 changes: 90 additions & 0 deletions pkg/epp/scheduling/filter_test.go
@@ -429,3 +429,93 @@ func TestFilterFunc(t *testing.T) {
})
}
}

// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function
// properly distributes requests according to the loraAffinityThreshold
func TestLoRASoftAffinityDistribution(t *testing.T) {
logger := logutil.NewTestLogger()

const (
testModelName = "test-model"
Contributor: nit: not used?

testAffinityModel = "test-affinity-model"
numIterations = 10000
Contributor: having to run 10k times seems a lot. How long does it take? Any chance we can reduce it?

Contributor Author: it takes 0.29 seconds; I had it at 1000 initially, which took 0.16 seconds. 1000 is fine, but given the probability I have is very skewed (99.9%), 10000 iterations assign some pods to both cases (~9990 vs. ~10).

Contributor: If the issue is the 99.9% threshold, you can change the percentage in tests to make it easier to test, like 90%.

Contributor Author: yup, that's a bit complicated though (for now), as the threshold is defined in the scheduler as a const and the filter template does not take any arbitrary parameter as input. I would have to update the const for the test and reset it back. I would rather test against what's already set in the scheduler; 0.29 sec should not be too bad, and if it is I can change it to 1000 (the test would still work, since it has a tolerance of 5%).

tolerancePercent = 5.0 // Allow 5% tolerance from expected distribution
)

// Create a test request and pods
req := &LLMRequest{
Model: testAffinityModel,
ResolvedTargetModel: testAffinityModel,
}

// Test setup: One affinity pod and one available pod
pods := []*datastore.PodMetrics{
{
Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}},
Metrics: datastore.Metrics{
MaxActiveModels: 2,
ActiveModels: map[string]int{
testAffinityModel: 1,
},
},
},
{
Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}},
Metrics: datastore.Metrics{
MaxActiveModels: 2,
ActiveModels: map[string]int{},
},
},
}

// Run the filter function multiple times and count the results
affinityCount := 0
availableCount := 0

// Use the actual loraAffinityThreshold as defined in the original code
// This test should work with whatever value is set there
expectedAffinityPercent := loraAffinityThreshold * 100
for i := 0; i < numIterations; i++ {
result, err := loRASoftAffinityFilter(logger, req, pods)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

// Check which type of pod was returned
if len(result) != 1 {
t.Fatalf("Expected exactly one pod in result, got %d", len(result))
}

// Identify if the returned pod is the affinity pod or available pod
if _, exists := result[0].ActiveModels[testAffinityModel]; exists {
Contributor: nit: I recommend checking the pod name directly; it's more readable and reliable (imagine the ActiveModels map being modified during tests for some reason in the future).

affinityCount++
} else {
availableCount++
}
}

// Calculate the actual percentages
actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100
actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100

// Check if the distribution matches expected threshold within tolerance
affinityLowerBound := expectedAffinityPercent - tolerancePercent
affinityUpperBound := expectedAffinityPercent + tolerancePercent

availableLowerBound := actualAvailablePercent - tolerancePercent
availableUpperBound := actualAvailablePercent + tolerancePercent

t.Logf("Distribution results over %d iterations:", numIterations)
t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold)
t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations)
t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations)

if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound {
t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%",
actualAffinityPercent, affinityLowerBound, affinityUpperBound)
}
if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound {
t.Errorf("Availability selection percent %.2f%% outside expected range %.2f%% to %.2f%%",
actualAvailablePercent, availableLowerBound, availableUpperBound)
}
}
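
As a quick back-of-the-envelope check on the iteration count discussed in the review thread above, with the constants used here (p = loraAffinityThreshold = 0.999, n = numIterations = 10000) the expected split and its statistical spread are:

$$
\mathbb{E}[\text{affinity picks}] = np = 10000 \times 0.999 = 9990,
\qquad
\mathbb{E}[\text{available picks}] = n(1-p) = 10,
\qquad
\sigma_{\%} = 100\sqrt{\tfrac{p(1-p)}{n}} \approx 0.03
$$

That is, the observed affinity percentage fluctuates by only a few hundredths of a percentage point, so the 5-point tolerance leaves large headroom and the test should not flake even at 1000 iterations.
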
20 changes: 9 additions & 11 deletions pkg/epp/scheduling/scheduler.go
@@ -36,8 +36,11 @@ const (
queueThresholdCritical = 5
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
// the threshold for queued requests to be considered low below which we can prioritize LoRA affinity.
// The value of 50 was arrived at heuristically based on experiments.
queueingThresholdLoRA = 50
// The value of 128 was arrived at heuristically based on experiments.
queueingThresholdLoRA = 128
Contributor: I think we should make this configurable, perhaps via a flag for now. Different environments will likely need different thresholds.

Contributor Author: I would rather leverage this to make this configurable. #16

Contributor: I don't think we have time to do an API change for the next release. Given we already had to change it on different accelerator types, it's important to have this knob configurable. Exposing it as a flag seems straightforward and gives us time to gather feedback on this before making an API change.

Contributor Author: I took a look; iiuc, adding this flag is not straightforward, the way the scheduler is written. If it's needed for the next release, I would rather have it in another PR.

Contributor: Defining a flag for each parameter is tedious; we can use a versioned configuration file. This is called ComponentConfig, and ideally we do that for #383. Here is JobSet's config file as an example: https://github.com/kubernetes-sigs/jobset/tree/main/api/config/v1alpha1

Contributor: Because setting an env var is not validated in general (i.e., if you set an env var that the binary doesn't care about, nothing will happen), while with flags it is more strict. It is ugly either way, and the lack of validation is bad, but I would rather prioritize asserting that this is a temporary knob that will likely either not exist in the future or be set via a different mechanism.

Contributor: I suggest not blocking this PR on adding the env vars, and doing that as a follow-up for this and the other algorithm parameters.

Contributor: Sounds good to me. I am OK to lgtm once the integration test is fixed and a unit test for the new filter is added.

Contributor Author (@kaushikmitr, Mar 3, 2025): "but we need to evolve it in a way that the algorithm offers self tuning for the most part.": My concern is that we may never have a truly model-server-configuration-agnostic load balancing algorithm. For example, vLLM exposes settings like max_num_seq, max_lora, max_lora_rank, and max_num_batched_tokens, and the optimal load balancing strategy will depend on these parameters, many of which may not be directly available to the Gateway. While we could choose to scrape them as needed, I'm not sure that's the best design choice. I believe there will always be some scheduling configuration parameters that require independent tuning. For now, using environment variables works, but in the long run we might want to make them configurable as load balancing parameters.

Contributor: I agree; I am just not confident that the current set of parameters are the ones we will actually have moving forward. We are very early, and our benchmarking so far is limited to a few use cases, so only when we have more benchmarks across different model sizes, accelerators, and datasets, or the extension is deployed more widely in practice, will we get confidence about which knobs should be exposed.

// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
// loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod that lacks affinity but has room to fit more LoRA adapters.
loraAffinityThreshold = 0.999
Contributor: do you have some insights to show why this is needed and why this value is picked?

Contributor Author: I picked it after some trial and error. This value worked well when we had skewed traffic for different adapters; it helped spread out high-QPS adapters while keeping low-QPS adapters less spread out.

Contributor: I believe we need to update the hermetic_test case "select active lora, low queue", given the new probabilistic behavior. You can set the pods without the requested lora adapter to have no room so they will never be picked.

Contributor Author: yes, the current test might fail sometimes. I tried to make it probabilistic, but it's more complicated than I thought. For now I fixed it as you suggested, by setting the pods without the requested lora adapter to have no room so they will never be picked.

)

var (
@@ -54,7 +57,7 @@ var (
filter: leastQueuingFilterFunc,
nextOnSuccessOrFailure: &filter{
name: "low cost LoRA",
filter: toFilterFunc(lowLoRACostPredicate),
filter: loRASoftAffinityFilter,
nextOnSuccessOrFailure: &filter{
name: "least KV cache percent",
filter: leastKVCacheFilterFunc,
@@ -76,14 +79,9 @@ var (
name: "low queueing filter",
filter: toFilterFunc((lowQueueingPodPredicate)),
nextOnSuccess: &filter{
name: "affinity LoRA",
filter: toFilterFunc(loRAAffinityPredicate),
nextOnSuccess: queueAndKVCacheFilter,
nextOnFailure: &filter{
name: "can accept LoRA Adapter",
filter: toFilterFunc(canAcceptNewLoraPredicate),
nextOnSuccessOrFailure: queueAndKVCacheFilter,
},
name: "affinity LoRA",
filter: loRASoftAffinityFilter,
nextOnSuccessOrFailure: queueAndKVCacheFilter,
},
nextOnFailure: queueLoRAAndKVCacheFilter,
}
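
For reference on the flag/env-var discussion in the review thread above, here is a minimal sketch of what an environment-variable override could look like. This is purely hypothetical: the variable names, the helper, and the switch from const to var are assumptions, not part of this PR.

```go
// Hypothetical sketch only, not part of this PR: expose the two scheduling knobs
// discussed above as environment-variable overrides. Names and defaults are assumed.
package scheduling

import (
	"os"
	"strconv"
)

// floatFromEnv returns the parsed value of the named env var, or def if the
// variable is unset or unparsable.
func floatFromEnv(key string, def float64) float64 {
	if raw, ok := os.LookupEnv(key); ok {
		if parsed, err := strconv.ParseFloat(raw, 64); err == nil {
			return parsed
		}
	}
	return def
}

// The const block above would become a var block so the defaults can be overridden.
var (
	queueingThresholdLoRA = int(floatFromEnv("QUEUEING_THRESHOLD_LORA", 128))
	loraAffinityThreshold = floatFromEnv("LORA_AFFINITY_THRESHOLD", 0.999)
)
```
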
3 changes: 2 additions & 1 deletion test/integration/hermetic_test.go
@@ -158,6 +158,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
KVCacheUsagePercent: 0.2,
ActiveModels: map[string]int{
"foo": 1,
"bar": 1,
Contributor: nit: Can you explain in the comment on line 121 that, because no pods have room for a new LoRA, the pod with LoRA affinity will always be picked?

},
}),
},
@@ -200,7 +201,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
},
}),
extprocutils.FakePodMetrics(1, datastore.Metrics{
WaitingQueueSize: 50,
WaitingQueueSize: 200,
KVCacheUsagePercent: 0.1,
ActiveModels: map[string]int{
"foo": 1,