You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
37
38
// The threshold for queued requests to be considered low, below which we can prioritize LoRA affinity.
38
-
// The value of 50 is arrived heuristicically based on experiments.
39
+
// The value of 128 was arrived at heuristically, based on experiments.
39
40
queueingThresholdLoRA=128
41
+
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
42
+
// loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod without affinity that still has room to fit more LoRA adapters.
43
+
loraAffinityThreshold=0.999
40
44
)
41
45
42
46
var (
@@ -53,7 +57,7 @@ var (
53
57
filter: leastQueuingFilterFunc,
54
58
nextOnSuccessOrFailure: &filter{
55
59
name: "low cost LoRA",
56
-
filter: minLoRACostFilterFunc,
60
+
filter: loRASoftAffinityPredicate,
57
61
nextOnSuccessOrFailure: &filter{
58
62
name: "least KV cache percent",
59
63
filter: leastKVCacheFilterFunc,
@@ -75,14 +79,9 @@ var (
75
79
name: "low queueing filter",
76
80
filter: toFilterFunc((lowQueueingPodPredicate)),
77
81
nextOnSuccess: &filter{
78
-
name: "affinity LoRA",
79
-
filter: toFilterFunc(loRAAffinityPredicate),
80
-
nextOnSuccess: queueAndKVCacheFilter,
81
-
nextOnFailure: &filter{
82
-
name: "can accept LoRA Adapter",
83
-
filter: minLoRACostFilterFunc,
84
-
nextOnSuccessOrFailure: queueAndKVCacheFilter,
85
-
},
82
+
name: "affinity LoRA",
83
+
filter: loRASoftAffinityPredicate,
84
+
nextOnSuccessOrFailure: queueAndKVCacheFilter,
86
85
},
87
86
nextOnFailure: queueLoRAAndKVCacheFilter,
88
87
}
@@ -121,14 +120,16 @@ type Scheduler struct {
121
120
}
122
121
123
122
// Schedule finds the target pod based on metrics and the requested lora adapter.
0 commit comments