@@ -42,30 +42,34 @@ func (f *filter) Name() string {
42
42
}
43
43
44
44
func (f * filter ) Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
45
- if f == nil {
46
- klog .V (3 ).Infof ("Running nil filter, returning all input pods by default" )
47
- return pods , nil
48
- }
49
45
klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
50
46
51
47
filtered , err := f .filter (b , pods )
52
48
53
49
next := f .nextOnSuccessOrFailure
54
50
if err == nil {
55
- klog .V (3 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
51
+ if f .nextOnSuccess == nil && f .nextOnSuccessOrFailure == nil {
52
+ // No succeeding filters to run, return.
53
+ return filtered , err
54
+ }
56
55
if f .nextOnSuccess != nil {
57
56
next = f .nextOnSuccess
58
57
}
58
+ klog .V (3 ).Infof ("onSuccess %q -> %q, filtered: %v" , f .name , next .Name (), len (filtered ))
59
59
// On success, pass the filtered result to the next filter.
60
60
return next .Filter (b , filtered )
61
+ } else {
62
+ if f .nextOnFailure == nil && f .nextOnSuccessOrFailure == nil {
63
+ // No succeeding filters to run, return.
64
+ return filtered , err
65
+ }
66
+ if f .nextOnFailure != nil {
67
+ next = f .nextOnFailure
68
+ }
69
+ klog .V (3 ).Infof ("onFailure %q -> %q" , f .name , next .Name ())
70
+ // On failure, pass the initial set of pods to the next filter.
71
+ return next .Filter (b , pods )
61
72
}
62
-
63
- klog .V (3 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
64
- if f .nextOnFailure != nil {
65
- next = f .nextOnFailure
66
- }
67
- // On failure, pass the initial set of pods to the next filter.
68
- return next .Filter (b , pods )
69
73
}
70
74
71
75
// filterFunc filters a set of input pods to a subset.
@@ -125,7 +129,7 @@ func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
125
129
// TODO: Compare this strategy with other strategies such as top K.
126
130
func leastKVCacheFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
127
131
min := math .MaxFloat64
128
- max := math . SmallestNonzeroFloat64
132
+ var max float64 = 0
129
133
filtered := []* backend.PodMetrics {}
130
134
131
135
for _ , pod := range pods {
@@ -145,11 +149,52 @@ func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
145
149
return filtered , nil
146
150
}
147
151
152
+ // mostKVCacheFilterFunc is similar to leastKVCacheFilterFunc but prefers pods with higher KV cache.
153
+ func mostKVCacheFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
154
+ min := math .MaxFloat64
155
+ var max float64 = 0
156
+ filtered := []* backend.PodMetrics {}
157
+
158
+ for _ , pod := range pods {
159
+ if pod .KVCacheUsagePercent <= min {
160
+ min = pod .KVCacheUsagePercent
161
+ }
162
+ if pod .KVCacheUsagePercent >= max {
163
+ max = pod .KVCacheUsagePercent
164
+ }
165
+ }
166
+
167
+ klog .V (3 ).Infof ("mostKVCacheFilterFunc, max=%v, min=%v" , max , min )
168
+ for _ , pod := range pods {
169
+ klog .V (3 ).Infof ("Evaluating pod %v" , pod .KVCacheUsagePercent )
170
+ if pod .KVCacheUsagePercent <= max && pod .KVCacheUsagePercent >= max - (max - min )/ float64 (len (pods )) {
171
+ klog .V (3 ).Infof ("Selected pod %v" , pod .KVCacheUsagePercent )
172
+ filtered = append (filtered , pod )
173
+ }
174
+ }
175
+ return filtered , nil
176
+ }
177
+
148
178
// podPredicate is a filter function to check whether a pod is desired.
149
179
// It is called with the request and a single candidate pod and returns true when that pod is desired.
type podPredicate func (b * LLMRequest , pod * backend.PodMetrics ) bool
150
180
151
- // loraAffinityPredicate return true if the pod have the requested LoRA adapter loaded.
152
- func loraAffinityPredicate (b * LLMRequest , pod * backend.PodMetrics ) bool {
153
- _ , ok := pod .CachedModels [b .ResolvedTargetModel ]
154
- return ok
181
+ // We consider serving an adapter low cost it the adapter is active in the model server, or the
182
+ // model server has room to load the adapter
183
+ func lowLoRACostPredicate (b * LLMRequest , pod * backend.PodMetrics ) bool {
184
+ _ , ok := pod .ActiveModels [b .ResolvedTargetModel ]
185
+ return ok || len (pod .ActiveModels ) < pod .MaxActiveModels
186
+ }
187
+
188
+ func criticalRequestPredicate (b * LLMRequest , pod * backend.PodMetrics ) bool {
189
+ return b .Critical
190
+ }
191
+
192
+ func noQueueAndLessThanKVCacheThresholdPredicate (threshold float64 ) podPredicate {
193
+ return func (b * LLMRequest , pod * backend.PodMetrics ) bool {
194
+ return pod .WaitingQueueSize <= 0 && pod .KVCacheUsagePercent <= threshold
195
+ }
196
+ }
197
+
198
+ func allowAllPredicate (b * LLMRequest , pod * backend.PodMetrics ) bool {
199
+ return true
155
200
}
0 commit comments