@@ -11,7 +11,7 @@ import (
 
 type Filter interface {
 	Name() string
-	Filter(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
+	Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
 }
 
 // filter applies current filterFunc, and then recursively applies next filters depending success or
@@ -41,42 +41,46 @@ func (f *filter) Name() string {
 	return f.name
 }
 
-func (f *filter) Filter(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-	if f == nil {
-		klog.V(3).Infof("Running nil filter, returning all input pods by default")
-		return pods, nil
-	}
-	klog.V(3).Infof("Running filter %q on request %v with %v pods", f.name, b, len(pods))
+func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+	klog.V(3).Infof("Running filter %q on request %v with %v pods", f.name, req, len(pods))
 
-	filtered, err := f.filter(b, pods)
+	filtered, err := f.filter(req, pods)
 
 	next := f.nextOnSuccessOrFailure
-	if err == nil {
-		klog.V(3).Infof("onSuccess %v -> %v, filtered: %v", f.name, next.Name(), len(filtered))
+	if err == nil && len(filtered) > 0 {
+		if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
 		if f.nextOnSuccess != nil {
 			next = f.nextOnSuccess
 		}
+		klog.V(3).Infof("onSuccess %q -> %q, filtered: %v", f.name, next.Name(), len(filtered))
 		// On success, pass the filtered result to the next filter.
-		return next.Filter(b, filtered)
-	}
-
-	klog.V(3).Infof("onFailure %v -> %v", f.name, next.Name())
-	if f.nextOnFailure != nil {
-		next = f.nextOnFailure
+		return next.Filter(req, filtered)
+	} else {
+		if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
+		if f.nextOnFailure != nil {
+			next = f.nextOnFailure
+		}
+		klog.V(3).Infof("onFailure %q -> %q", f.name, next.Name())
+		// On failure, pass the initial set of pods to the next filter.
+		return next.Filter(req, pods)
 	}
-	// On failure, pass the initial set of pods to the next filter.
-	return next.Filter(b, pods)
 }
 
 // filterFunc filters a set of input pods to a subset.
-type filterFunc func(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
+type filterFunc func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error)
 
 // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc.
 func toFilterFunc(pp podPredicate) filterFunc {
-	return func(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+	return func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 		filtered := []*backend.PodMetrics{}
 		for _, pod := range pods {
-			pass := pp(b, pod)
+			pass := pp(req, pod)
 			if pass {
 				filtered = append(filtered, pod)
 			}
@@ -95,7 +99,7 @@ func toFilterFunc(pp podPredicate) filterFunc {
 // the least one as it gives more choices for the next filter, which on aggregate gave better
 // results.
 // TODO: Compare this strategy with other strategies such as top K.
-func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	min := math.MaxInt
 	max := 0
 	filtered := []*backend.PodMetrics{}
@@ -123,9 +127,9 @@ func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
 // should consider them all instead of the absolute minimum one. This worked better than picking the
 // least one as it gives more choices for the next filter, which on aggregate gave better results.
 // TODO: Compare this strategy with other strategies such as top K.
-func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
 	min := math.MaxFloat64
-	max := math.SmallestNonzeroFloat64
+	var max float64 = 0
 	filtered := []*backend.PodMetrics{}
 
 	for _, pod := range pods {
@@ -145,11 +149,52 @@ func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
 	return filtered, nil
 }
 
+// mostKVCacheFilterFunc is similar to leastKVCacheFilterFunc but prefers pods with higher KV cache.
+func mostKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+	min := math.MaxFloat64
+	var max float64 = 0
+	filtered := []*backend.PodMetrics{}
+
+	for _, pod := range pods {
+		if pod.KVCacheUsagePercent <= min {
+			min = pod.KVCacheUsagePercent
+		}
+		if pod.KVCacheUsagePercent >= max {
+			max = pod.KVCacheUsagePercent
+		}
+	}
+
+	klog.V(3).Infof("mostKVCacheFilterFunc, max=%v, min=%v", max, min)
+	for _, pod := range pods {
+		klog.V(3).Infof("Evaluating pod %v", pod.KVCacheUsagePercent)
+		if pod.KVCacheUsagePercent <= max && pod.KVCacheUsagePercent >= max-(max-min)/float64(len(pods)) {
+			klog.V(3).Infof("Selected pod %v", pod.KVCacheUsagePercent)
+			filtered = append(filtered, pod)
+		}
+	}
+	return filtered, nil
+}
+
 // podPredicate is a filter function to check whether a pod is desired.
-type podPredicate func(b *LLMRequest, pod *backend.PodMetrics) bool
+type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool
+
+// We consider serving an adapter low cost if the adapter is active in the model server, or the
+// model server has room to load the adapter.
+func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+	_, ok := pod.ActiveModels[req.ResolvedTargetModel]
+	return ok || len(pod.ActiveModels) < pod.MaxActiveModels
+}
+
+func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+	return req.Critical
+}
+
+func noQueueAndLessThanKVCacheThresholdPredicate(threshold float64) podPredicate {
+	return func(req *LLMRequest, pod *backend.PodMetrics) bool {
+		return pod.WaitingQueueSize <= 0 && pod.KVCacheUsagePercent <= threshold
+	}
+}
 
-// loraAffinityPredicate return true if the pod have the requested LoRA adapter loaded.
-func loraAffinityPredicate(b *LLMRequest, pod *backend.PodMetrics) bool {
-	_, ok := pod.CachedModels[b.ResolvedTargetModel]
-	return ok
+func allowAllPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+	return true
 }
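
For orientation, the snippet below is a rough sketch (not part of this change) of how the new predicates and filter funcs might be chained together. It assumes it lives in the same package as filter.go; the struct field names (name, filter, nextOnSuccess, nextOnFailure, nextOnSuccessOrFailure) come from the Filter method in the diff, while the chain shape and the 0.8 KV cache threshold are purely illustrative.

// Illustrative sketch only: a possible decision tree built from the pieces in this diff.
// The specific chain and the 0.8 threshold are assumptions, not taken from this change.
var sketchFilterChain = &filter{
	name:   "has no queue and KV cache below threshold",
	filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0.8)),
	// Pods that pass: prefer ones that can serve the requested adapter cheaply,
	// then spread by queue depth.
	nextOnSuccess: &filter{
		name:   "low LoRA cost",
		filter: toFilterFunc(lowLoRACostPredicate),
		nextOnSuccessOrFailure: &filter{
			name:   "least queuing",
			filter: leastQueuingFilterFunc,
		},
	},
	// No pod passed: fall back to the pods with the least KV cache usage.
	nextOnFailure: &filter{
		name:   "least KV cache",
		filter: leastKVCacheFilterFunc,
	},
}

A chain like this would be driven by calling the root's Filter(req, pods); the Filter method above then walks nextOnSuccess or nextOnFailure (or nextOnSuccessOrFailure) depending on whether the current filterFunc returned a non-empty result without error.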