@@ -17,6 +17,7 @@ limitations under the License.
17
17
package queue
18
18
19
19
import (
20
+ "sync"
20
21
"time"
21
22
22
23
v1 "k8s.io/api/core/v1"
@@ -35,13 +36,14 @@ import (
35
36
const backoffQOrderingWindowDuration = time .Second
36
37
37
38
// backoffQueuer is a wrapper for backoffQ related operations.
39
+ // Its methods that rely on the queues take the lock internally.
38
40
type backoffQueuer interface {
39
41
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
40
42
// If this returns true, the pod should not be re-tried.
41
43
// If the pod backoff time is in the actual ordering window, it should still be backing off.
42
44
isPodBackingoff (podInfo * framework.QueuedPodInfo ) bool
43
- // popEachBackoffCompleted run fn for all pods from podBackoffQ and podErrorBackoffQ that completed backoff while popping them .
44
- popEachBackoffCompleted (logger klog.Logger , fn func ( pInfo * framework.QueuedPodInfo ))
45
+ // popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
46
+ popAllBackoffCompleted (logger klog.Logger ) [] * framework.QueuedPodInfo
45
47
46
48
// podInitialBackoffDuration returns the initial backoff duration that a pod can get.
47
49
podInitialBackoffDuration () time.Duration
@@ -61,7 +63,8 @@ type backoffQueuer interface {
61
63
// It returns new pod info if updated, nil otherwise.
62
64
update (newPod * v1.Pod , oldPodInfo * framework.QueuedPodInfo ) * framework.QueuedPodInfo
63
65
// delete deletes the pInfo from backoffQueue.
64
- delete (pInfo * framework.QueuedPodInfo )
66
+ // It returns true if the pod was deleted.
67
+ delete (pInfo * framework.QueuedPodInfo ) bool
65
68
// get returns the pInfo matching given pInfoLookup, if exists.
66
69
get (pInfoLookup * framework.QueuedPodInfo ) (* framework.QueuedPodInfo , bool )
67
70
// has reports whether pInfo exists in the queue.
@@ -75,6 +78,14 @@ type backoffQueuer interface {
75
78
// backoffQueue implements backoffQueuer and wraps two queues inside,
76
79
// providing seamless access as if it were one queue.
77
80
type backoffQueue struct {
81
+ // lock synchronizes all operations related to backoffQ.
82
+ // It protects both podBackoffQ and podErrorBackoffQ.
83
+ // Caution: DO NOT take "SchedulingQueue.lock" or "activeQueue.lock" after taking "lock".
84
+ // You should always take "SchedulingQueue.lock" and "activeQueue.lock" first, otherwise the queue could end up in deadlock.
85
+ // "lock" should not be taken after taking "nominator.nLock".
86
+ // Correct locking order is: SchedulingQueue.lock > activeQueue.lock > lock > nominator.nLock.
87
+ lock sync.RWMutex
88
+
78
89
clock clock.WithTicker
79
90
80
91
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
@@ -239,7 +250,8 @@ func (bq *backoffQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInf
239
250
return duration
240
251
}
241
252
242
- func (bq * backoffQueue ) popEachBackoffCompletedWithQueue (logger klog.Logger , fn func (pInfo * framework.QueuedPodInfo ), queue * heap.Heap [* framework.QueuedPodInfo ]) {
253
+ func (bq * backoffQueue ) popAllBackoffCompletedWithQueue (logger klog.Logger , queue * heap.Heap [* framework.QueuedPodInfo ]) []* framework.QueuedPodInfo {
254
+ var poppedPods []* framework.QueuedPodInfo
243
255
for {
244
256
pInfo , ok := queue .Peek ()
245
257
if ! ok || pInfo == nil {
@@ -254,23 +266,27 @@ func (bq *backoffQueue) popEachBackoffCompletedWithQueue(logger klog.Logger, fn
254
266
logger .Error (err , "Unable to pop pod from backoff queue despite backoff completion" , "pod" , klog .KObj (pod ))
255
267
break
256
268
}
257
- if fn != nil {
258
- fn (pInfo )
259
- }
269
+ poppedPods = append (poppedPods , pInfo )
260
270
}
271
+ return poppedPods
261
272
}
262
273
263
- // popEachBackoffCompleted run fn for all pods from podBackoffQ and podErrorBackoffQ that completed backoff while popping them.
264
- func (bq * backoffQueue ) popEachBackoffCompleted (logger klog.Logger , fn func (pInfo * framework.QueuedPodInfo )) {
274
+ // popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
275
+ func (bq * backoffQueue ) popAllBackoffCompleted (logger klog.Logger ) []* framework.QueuedPodInfo {
276
+ bq .lock .Lock ()
277
+ defer bq .lock .Unlock ()
278
+
265
279
// Pop from both queues; pods popped from podBackoffQ precede those popped from podErrorBackoffQ in the result.
266
- bq .popEachBackoffCompletedWithQueue (logger , fn , bq .podBackoffQ )
267
- bq .popEachBackoffCompletedWithQueue (logger , fn , bq .podErrorBackoffQ )
280
+ return append (bq .popAllBackoffCompletedWithQueue (logger , bq .podBackoffQ ), bq .popAllBackoffCompletedWithQueue (logger , bq .podErrorBackoffQ )... )
268
281
}
269
282
270
283
// add adds the pInfo to backoffQueue.
271
284
// The event should show which event triggered this addition and is used for the metric recording.
272
285
// It also ensures that pInfo is not in both queues.
273
286
func (bq * backoffQueue ) add (logger klog.Logger , pInfo * framework.QueuedPodInfo , event string ) {
287
+ bq .lock .Lock ()
288
+ defer bq .lock .Unlock ()
289
+
274
290
// If the pod has neither unschedulable plugins nor pending plugins,
275
291
// it means that it failed because of error and should be moved to podErrorBackoffQ.
276
292
if pInfo .UnschedulablePlugins .Len () == 0 && pInfo .PendingPlugins .Len () == 0 {
@@ -297,6 +313,9 @@ func (bq *backoffQueue) add(logger klog.Logger, pInfo *framework.QueuedPodInfo,
297
313
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
298
314
// It returns new pod info if updated, nil otherwise.
299
315
func (bq * backoffQueue ) update (newPod * v1.Pod , oldPodInfo * framework.QueuedPodInfo ) * framework.QueuedPodInfo {
316
+ bq .lock .Lock ()
317
+ defer bq .lock .Unlock ()
318
+
300
319
// If the pod is in the backoff queue, update it there.
301
320
if pInfo , exists := bq .podBackoffQ .Get (oldPodInfo ); exists {
302
321
_ = pInfo .Update (newPod )
@@ -313,13 +332,32 @@ func (bq *backoffQueue) update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodIn
313
332
}
314
333
315
334
// delete deletes the pInfo from backoffQueue.
316
- func (bq * backoffQueue ) delete (pInfo * framework.QueuedPodInfo ) {
317
- _ = bq .podBackoffQ .Delete (pInfo )
318
- _ = bq .podErrorBackoffQ .Delete (pInfo )
335
+ // It returns true if the pod was deleted.
336
+ func (bq * backoffQueue ) delete (pInfo * framework.QueuedPodInfo ) bool {
337
+ bq .lock .Lock ()
338
+ defer bq .lock .Unlock ()
339
+
340
+ if bq .podBackoffQ .Delete (pInfo ) == nil {
341
+ return true
342
+ }
343
+ return bq .podErrorBackoffQ .Delete (pInfo ) == nil
344
+ }
345
+
346
+ // popBackoff pops the pInfo from the podBackoffQ.
347
+ // It returns an error if the queue is empty.
348
+ // This doesn't pop the pods from the podErrorBackoffQ.
349
+ func (bq * backoffQueue ) popBackoff () (* framework.QueuedPodInfo , error ) {
350
+ bq .lock .Lock ()
351
+ defer bq .lock .Unlock ()
352
+
353
+ return bq .podBackoffQ .Pop ()
319
354
}
320
355
321
356
// get returns the pInfo matching given pInfoLookup, if exists.
322
357
func (bq * backoffQueue ) get (pInfoLookup * framework.QueuedPodInfo ) (* framework.QueuedPodInfo , bool ) {
358
+ bq .lock .RLock ()
359
+ defer bq .lock .RUnlock ()
360
+
323
361
pInfo , exists := bq .podBackoffQ .Get (pInfoLookup )
324
362
if exists {
325
363
return pInfo , true
@@ -329,11 +367,17 @@ func (bq *backoffQueue) get(pInfoLookup *framework.QueuedPodInfo) (*framework.Qu
329
367
330
368
// has reports whether pInfo exists in the queue.
331
369
func (bq * backoffQueue ) has (pInfo * framework.QueuedPodInfo ) bool {
370
+ bq .lock .RLock ()
371
+ defer bq .lock .RUnlock ()
372
+
332
373
return bq .podBackoffQ .Has (pInfo ) || bq .podErrorBackoffQ .Has (pInfo )
333
374
}
334
375
335
376
// list returns all pods that are in the queue.
336
377
func (bq * backoffQueue ) list () []* v1.Pod {
378
+ bq .lock .RLock ()
379
+ defer bq .lock .RUnlock ()
380
+
337
381
var result []* v1.Pod
338
382
for _ , pInfo := range bq .podBackoffQ .List () {
339
383
result = append (result , pInfo .Pod )
@@ -346,5 +390,16 @@ func (bq *backoffQueue) list() []*v1.Pod {
346
390
347
391
// len returns length of the queue.
348
392
func (bq * backoffQueue ) len () int {
393
+ bq .lock .RLock ()
394
+ defer bq .lock .RUnlock ()
395
+
349
396
return bq .podBackoffQ .Len () + bq .podErrorBackoffQ .Len ()
350
397
}
398
+
399
+ // lenBackoff returns the length of podBackoffQ.
400
+ func (bq * backoffQueue ) lenBackoff () int {
401
+ bq .lock .RLock ()
402
+ defer bq .lock .RUnlock ()
403
+
404
+ return bq .podBackoffQ .Len ()
405
+ }
0 commit comments