@@ -43,24 +43,24 @@ func (f *filter) Name() string {
43
43
44
44
func (f * filter ) Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
45
45
if f == nil {
46
- klog .V (2 ).Infof ("Running nil filter, returning all input pods by default" )
46
+ klog .V (3 ).Infof ("Running nil filter, returning all input pods by default" )
47
47
return pods , nil
48
48
}
49
- klog .V (2 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
49
+ klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
50
50
51
51
filtered , err := f .filter (b , pods )
52
52
53
53
next := f .nextOnSuccessOrFailure
54
54
if err == nil {
55
- klog .V (2 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
55
+ klog .V (3 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
56
56
if f .nextOnSuccess != nil {
57
57
next = f .nextOnSuccess
58
58
}
59
59
// On success, pass the filtered result to the next filter.
60
60
return next .Filter (b , filtered )
61
61
}
62
62
63
- klog .V (2 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
63
+ klog .V (3 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
64
64
if f .nextOnFailure != nil {
65
65
next = f .nextOnFailure
66
66
}
@@ -88,32 +88,57 @@ func toFilterFunc(pp podPredicate) filterFunc {
88
88
}
89
89
}
90
90
91
+ // leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range
92
+ // (max-min) by the number of pods, and finds the pods that fall into the first range.
93
+ // The intuition is that if there are multiple pods that share similar queue size in the low range,
94
+ // we should consider them all instead of the absolute minimum one. This worked better than picking
95
+ // the least one as it gives more choices for the next filter, which on aggregate gave better
96
+ // results.
97
+ // TODO: Compare this strategy with other strategies such as top K.
91
98
func leastQueuingFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
92
99
min := math .MaxInt
100
+ max := 0
93
101
filtered := []* backend.PodMetrics {}
102
+
94
103
for _ , pod := range pods {
95
- if pod .WaitingQueueSize < min {
104
+ if pod .WaitingQueueSize <= min {
96
105
min = pod .WaitingQueueSize
97
- filtered = []* backend.PodMetrics {}
98
106
}
99
- if pod .WaitingQueueSize == min {
107
+ if pod .WaitingQueueSize >= max {
108
+ max = pod .WaitingQueueSize
109
+ }
110
+ }
111
+
112
+ for _ , pod := range pods {
113
+ if pod .WaitingQueueSize >= min && pod .WaitingQueueSize <= min + (max - min )/ len (pods ) {
100
114
filtered = append (filtered , pod )
101
115
}
102
116
}
103
117
return filtered , nil
104
118
}
105
119
120
+ // leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
121
+ // (max-min) by the number of pods, and finds the pods that fall into the first range.
122
+ // The intuition is that if there are multiple pods that share similar KV cache in the low range, we
123
+ // should consider them all instead of the absolute minimum one. This worked better than picking the
124
+ // least one as it gives more choices for the next filter, which on aggregate gave better results.
125
+ // TODO: Compare this strategy with other strategies such as top K.
106
126
func leastKVCacheFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
107
- min := math .MaxInt
127
+ min := math .MaxFloat64
128
+ max := math .SmallestNonzeroFloat64
108
129
filtered := []* backend.PodMetrics {}
109
- margin := 5
130
+
110
131
for _ , pod := range pods {
111
- cur := int (pod .KVCacheUsagePercent ) / margin
112
- if cur < min {
113
- min = cur
114
- filtered = []* backend.PodMetrics {}
132
+ if pod .KVCacheUsagePercent <= min {
133
+ min = pod .KVCacheUsagePercent
134
+ }
135
+ if pod .KVCacheUsagePercent >= max {
136
+ max = pod .KVCacheUsagePercent
115
137
}
116
- if cur == min {
138
+ }
139
+
140
+ for _ , pod := range pods {
141
+ if pod .KVCacheUsagePercent >= min && pod .KVCacheUsagePercent <= min + (max - min )/ float64 (len (pods )) {
117
142
filtered = append (filtered , pod )
118
143
}
119
144
}
0 commit comments