@@ -11,7 +11,7 @@ import (
11
11
12
12
type Filter interface {
13
13
Name () string
14
- Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
14
+ Filter (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
15
15
}
16
16
17
17
// filter applies current filterFunc, and then recursively applies next filters depending on success or
@@ -41,42 +41,46 @@ func (f *filter) Name() string {
41
41
return f .name
42
42
}
43
43
44
- func (f * filter ) Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
45
- if f == nil {
46
- klog .V (3 ).Infof ("Running nil filter, returning all input pods by default" )
47
- return pods , nil
48
- }
49
- klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
44
+ func (f * filter ) Filter (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
45
+ klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , req , len (pods ))
50
46
51
- filtered , err := f .filter (b , pods )
47
+ filtered , err := f .filter (req , pods )
52
48
53
49
next := f .nextOnSuccessOrFailure
54
- if err == nil {
55
- klog .V (3 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
50
+ if err == nil && len (filtered ) > 0 {
51
+ if f .nextOnSuccess == nil && f .nextOnSuccessOrFailure == nil {
52
+ // No succeeding filters to run, return.
53
+ return filtered , err
54
+ }
56
55
if f .nextOnSuccess != nil {
57
56
next = f .nextOnSuccess
58
57
}
58
+ klog .V (3 ).Infof ("onSuccess %q -> %q, filtered: %v" , f .name , next .Name (), len (filtered ))
59
59
// On success, pass the filtered result to the next filter.
60
- return next .Filter (b , filtered )
61
- }
62
-
63
- klog .V (3 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
64
- if f .nextOnFailure != nil {
65
- next = f .nextOnFailure
60
+ return next .Filter (req , filtered )
61
+ } else {
62
+ if f .nextOnFailure == nil && f .nextOnSuccessOrFailure == nil {
63
+ // No succeeding filters to run, return.
64
+ return filtered , err
65
+ }
66
+ if f .nextOnFailure != nil {
67
+ next = f .nextOnFailure
68
+ }
69
+ klog .V (3 ).Infof ("onFailure %q -> %q" , f .name , next .Name ())
70
+ // On failure, pass the initial set of pods to the next filter.
71
+ return next .Filter (req , pods )
66
72
}
67
- // On failure, pass the initial set of pods to the next filter.
68
- return next .Filter (b , pods )
69
73
}
70
74
71
75
// filterFunc filters a set of input pods to a subset.
72
- type filterFunc func (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
76
+ type filterFunc func (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
73
77
74
78
// toFilterFunc is a helper function to convert a per-pod filter func to a filterFunc.
75
79
func toFilterFunc (pp podPredicate ) filterFunc {
76
- return func (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
80
+ return func (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
77
81
filtered := []* backend.PodMetrics {}
78
82
for _ , pod := range pods {
79
- pass := pp (b , pod )
83
+ pass := pp (req , pod )
80
84
if pass {
81
85
filtered = append (filtered , pod )
82
86
}
@@ -95,7 +99,7 @@ func toFilterFunc(pp podPredicate) filterFunc {
95
99
// the least one as it gives more choices for the next filter, which on aggregate gave better
96
100
// results.
97
101
// TODO: Compare this strategy with other strategies such as top K.
98
- func leastQueuingFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
102
+ func leastQueuingFilterFunc (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
99
103
min := math .MaxInt
100
104
max := 0
101
105
filtered := []* backend.PodMetrics {}
@@ -123,9 +127,9 @@ func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
123
127
// should consider them all instead of the absolute minimum one. This worked better than picking the
124
128
// least one as it gives more choices for the next filter, which on aggregate gave better results.
125
129
// TODO: Compare this strategy with other strategies such as top K.
126
- func leastKVCacheFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
130
+ func leastKVCacheFilterFunc (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
127
131
min := math .MaxFloat64
128
- max := math . SmallestNonzeroFloat64
132
+ var max float64 = 0
129
133
filtered := []* backend.PodMetrics {}
130
134
131
135
for _ , pod := range pods {
@@ -146,10 +150,21 @@ func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
146
150
}
147
151
148
152
// podPredicate is a filter function to check whether a pod is desired.
149
- type podPredicate func (b * LLMRequest , pod * backend.PodMetrics ) bool
153
+ type podPredicate func (req * LLMRequest , pod * backend.PodMetrics ) bool
154
+
155
+ // We consider serving an adapter low cost if the adapter is active in the model server, or the
156
+ // model server has room to load the adapter.
157
+ func lowLoRACostPredicate (req * LLMRequest , pod * backend.PodMetrics ) bool {
158
+ _ , ok := pod .ActiveModels [req .ResolvedTargetModel ]
159
+ return ok || len (pod .ActiveModels ) < pod .MaxActiveModels
160
+ }
150
161
151
- // loraAffinityPredicate return true if the pod have the requested LoRA adapter loaded.
152
- func loraAffinityPredicate (b * LLMRequest , pod * backend.PodMetrics ) bool {
153
- _ , ok := pod .CachedModels [b .ResolvedTargetModel ]
154
- return ok
162
+ func criticalRequestPredicate (req * LLMRequest , pod * backend.PodMetrics ) bool {
163
+ return req .Critical
164
+ }
165
+
166
+ func noQueueAndLessThanKVCacheThresholdPredicate (queueThreshold int , kvCacheThreshold float64 ) podPredicate {
167
+ return func (req * LLMRequest , pod * backend.PodMetrics ) bool {
168
+ return pod .WaitingQueueSize <= queueThreshold && pod .KVCacheUsagePercent <= kvCacheThreshold
169
+ }
155
170
}
0 commit comments