Skip to content

Commit 9da01d0

Browse files
committed
scheduler: impose a backoff penalty on gated Pods
1 parent 2c6daa4 commit 9da01d0

File tree

3 files changed

+52
-27
lines changed

3 files changed

+52
-27
lines changed

Diff for: pkg/scheduler/framework/types.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ type QueuedPodInfo struct {
211211
// The time the pod was added to the scheduling queue.
212212
Timestamp time.Time
213213
// Number of schedule attempts before successfully scheduled.
214-
// It's used to record the # attempts metric.
214+
// It's used to record the # attempts metric and to calculate the backoff time this Pod must wait before retrying.
215215
Attempts int
216216
// The time when the pod is added to the queue for the first time. The pod may be added
217217
// back to the queue multiple times before it's successfully scheduled.

Diff for: pkg/scheduler/internal/queue/scheduling_queue.go

+6-4
Original file line numberDiff line numberDiff line change
@@ -657,9 +657,6 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
657657
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
658658
// If this returns true, the pod should not be re-tried.
659659
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
660-
if podInfo.Gated {
661-
return false
662-
}
663660
boTime := p.getBackoffTime(podInfo)
664661
return boTime.After(p.clock.Now())
665662
}
@@ -1045,7 +1042,6 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) error
10451042
return nil
10461043
}
10471044
if isPodUpdated(oldPod, newPod) {
1048-
10491045
if p.isPodBackingoff(usPodInfo) {
10501046
if err := p.podBackoffQ.Add(pInfo); err != nil {
10511047
return err
@@ -1393,6 +1389,12 @@ func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Ti
13931389
// calculateBackoffDuration is a helper function for calculating the backoffDuration
13941390
// based on the number of attempts the pod has made.
13951391
func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
1392+
if podInfo.Attempts == 0 {
1393+
// When the Pod hasn't experienced any scheduling attempts,
1394+
// it isn't obliged to get a backoff penalty at all.
1395+
return 0
1396+
}
1397+
13961398
duration := p.podInitialBackoffDuration
13971399
for i := 1; i < podInfo.Attempts; i++ {
13981400
// Use subtraction instead of addition or multiplication to avoid overflow.

Diff for: pkg/scheduler/internal/queue/scheduling_queue_test.go

+45-22
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,7 @@ func Test_InFlightPods(t *testing.T) {
715715
case action.eventHappens != nil:
716716
q.MoveAllToActiveOrBackoffQueue(logger, *action.eventHappens, nil, nil, nil)
717717
case action.podEnqueued != nil:
718-
err := q.AddUnschedulableIfNotPresent(logger, action.podEnqueued, q.SchedulingCycle())
718+
err := q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(action.podEnqueued), q.SchedulingCycle())
719719
if err != nil {
720720
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
721721
}
@@ -1062,7 +1062,7 @@ func TestPriorityQueue_Update(t *testing.T) {
10621062
name: "when updating a pod which is in unschedulable queue and is backing off, it will be moved to backoff queue",
10631063
wantQ: backoffQ,
10641064
prepareFunc: func(t *testing.T, logger klog.Logger, q *PriorityQueue) (oldPod, newPod *v1.Pod) {
1065-
q.unschedulablePods.addOrUpdate(q.newQueuedPodInfo(medPriorityPodInfo.Pod, queuePlugin))
1065+
q.unschedulablePods.addOrUpdate(newAttemptedQueuedPodInfo(q.newQueuedPodInfo(medPriorityPodInfo.Pod, queuePlugin)))
10661066
updatedPod := medPriorityPodInfo.Pod.DeepCopy()
10671067
updatedPod.Annotations["foo"] = "test"
10681068
return medPriorityPodInfo.Pod, updatedPod
@@ -1073,7 +1073,7 @@ func TestPriorityQueue_Update(t *testing.T) {
10731073
name: "when updating a pod which is in unschedulable queue and is not backing off, it will be moved to active queue",
10741074
wantQ: activeQ,
10751075
prepareFunc: func(t *testing.T, logger klog.Logger, q *PriorityQueue) (oldPod, newPod *v1.Pod) {
1076-
q.unschedulablePods.addOrUpdate(q.newQueuedPodInfo(medPriorityPodInfo.Pod, queuePlugin))
1076+
q.unschedulablePods.addOrUpdate(newAttemptedQueuedPodInfo(q.newQueuedPodInfo(medPriorityPodInfo.Pod, queuePlugin)))
10771077
updatedPod := medPriorityPodInfo.Pod.DeepCopy()
10781078
updatedPod.Annotations["foo"] = "test1"
10791079
// Move clock by podInitialBackoffDuration, so that pods in the unschedulablePods would pass the backing off,
@@ -1217,7 +1217,7 @@ func TestPriorityQueue_UpdateWhenInflight(t *testing.T) {
12171217
// test-pod got rejected by fakePlugin,
12181218
// but the update event that it just got may change this scheduling result,
12191219
// and hence we should put this pod to activeQ/backoffQ.
1220-
err := q.AddUnschedulableIfNotPresent(logger, newQueuedPodInfoForLookup(updatedPod, "fakePlugin"), q.SchedulingCycle())
1220+
err := q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(newQueuedPodInfoForLookup(updatedPod, "fakePlugin")), q.SchedulingCycle())
12211221
if err != nil {
12221222
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
12231223
}
@@ -1567,7 +1567,7 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing.
15671567
},
15681568
{
15691569
name: "Queue queues pod to backoffQ if Pod is backing off",
1570-
podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), UnschedulablePlugins: sets.New("foo")},
1570+
podInfo: &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), Attempts: 1, UnschedulablePlugins: sets.New("foo")},
15711571
hint: queueHintReturnQueue,
15721572
expectedQ: backoffQ,
15731573
},
@@ -1598,6 +1598,12 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithQueueingHint(t *testing.
15981598
hint: queueHintReturnQueue,
15991599
expectedQ: activeQ,
16001600
},
1601+
{
1602+
name: "Pod that experienced a scheduling failure before should be queued to backoffQ after un-gated",
1603+
podInfo: setQueuedPodInfoGated(&framework.QueuedPodInfo{PodInfo: mustNewPodInfo(p), Attempts: 1, UnschedulablePlugins: sets.New("foo")}),
1604+
hint: queueHintReturnQueue,
1605+
expectedQ: backoffQ,
1606+
},
16011607
}
16021608

16031609
for _, test := range tests {
@@ -1666,11 +1672,11 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueue(t *testing.T) {
16661672
t.Errorf("Expected: %v after Pop, but got: %v", highPriorityPodInfo.Pod.Name, p.Pod.Name)
16671673
}
16681674
expectInFlightPods(t, q, unschedulablePodInfo.Pod.UID, highPriorityPodInfo.Pod.UID)
1669-
err := q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin"), q.SchedulingCycle())
1675+
err := q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin")), q.SchedulingCycle())
16701676
if err != nil {
16711677
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
16721678
}
1673-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin"), q.SchedulingCycle())
1679+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin")), q.SchedulingCycle())
16741680
if err != nil {
16751681
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
16761682
}
@@ -1683,7 +1689,7 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueue(t *testing.T) {
16831689
}
16841690
expectInFlightPods(t, q, hpp1.UID)
16851691
// This Pod will go to backoffQ because no failure plugin is associated with it.
1686-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(hpp1), q.SchedulingCycle())
1692+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(hpp1)), q.SchedulingCycle())
16871693
if err != nil {
16881694
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
16891695
}
@@ -1696,7 +1702,7 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueue(t *testing.T) {
16961702
}
16971703
expectInFlightPods(t, q, hpp2.UID)
16981704
// This Pod will go to the unschedulable Pod pool.
1699-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(hpp2, "barPlugin"), q.SchedulingCycle())
1705+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(hpp2, "barPlugin")), q.SchedulingCycle())
17001706
if err != nil {
17011707
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
17021708
}
@@ -1740,9 +1746,9 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueue(t *testing.T) {
17401746
if p, err := q.Pop(logger); err != nil || p.Pod != hpp1 {
17411747
t.Errorf("Expected: %v after Pop, but got: %v", hpp1, p.Pod.Name)
17421748
}
1743-
unschedulableQueuedPodInfo := q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin")
1744-
highPriorityQueuedPodInfo := q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin")
1745-
hpp1QueuedPodInfo := q.newQueuedPodInfo(hpp1)
1749+
unschedulableQueuedPodInfo := newAttemptedQueuedPodInfo(q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin"))
1750+
highPriorityQueuedPodInfo := newAttemptedQueuedPodInfo(q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin"))
1751+
hpp1QueuedPodInfo := newAttemptedQueuedPodInfo(q.newQueuedPodInfo(hpp1))
17461752
expectInFlightPods(t, q, medPriorityPodInfo.Pod.UID, unschedulablePodInfo.Pod.UID, highPriorityPodInfo.Pod.UID, hpp1.UID)
17471753
err = q.AddUnschedulableIfNotPresent(logger, unschedulableQueuedPodInfo, q.SchedulingCycle())
17481754
if err != nil {
@@ -1808,25 +1814,25 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithOutQueueingHint(t *testi
18081814
t.Errorf("add failed: %v", err)
18091815
}
18101816

1811-
err := q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin"), q.SchedulingCycle())
1817+
err := q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin")), q.SchedulingCycle())
18121818
if err != nil {
18131819
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
18141820
}
1815-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin"), q.SchedulingCycle())
1821+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin")), q.SchedulingCycle())
18161822
if err != nil {
18171823
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
18181824
}
18191825
// Construct a Pod, but don't associate its scheduler failure to any plugin
18201826
hpp1 := clonePod(highPriorityPodInfo.Pod, "hpp1")
18211827
// This Pod will go to backoffQ because no failure plugin is associated with it.
1822-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(hpp1), q.SchedulingCycle())
1828+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(hpp1)), q.SchedulingCycle())
18231829
if err != nil {
18241830
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
18251831
}
18261832
// Construct another Pod, and associate its scheduler failure to plugin "barPlugin".
18271833
hpp2 := clonePod(highPriorityPodInfo.Pod, "hpp2")
18281834
// This Pod will go to the unschedulable Pod pool.
1829-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(hpp2, "barPlugin"), q.SchedulingCycle())
1835+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(hpp2, "barPlugin")), q.SchedulingCycle())
18301836
if err != nil {
18311837
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
18321838
}
@@ -1855,9 +1861,9 @@ func TestPriorityQueue_MoveAllToActiveOrBackoffQueueWithOutQueueingHint(t *testi
18551861
}
18561862

18571863
q.schedulingCycle++
1858-
unschedulableQueuedPodInfo := q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin")
1859-
highPriorityQueuedPodInfo := q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin")
1860-
hpp1QueuedPodInfo := q.newQueuedPodInfo(hpp1)
1864+
unschedulableQueuedPodInfo := newAttemptedQueuedPodInfo(q.newQueuedPodInfo(unschedulablePodInfo.Pod, "fooPlugin"))
1865+
highPriorityQueuedPodInfo := newAttemptedQueuedPodInfo(q.newQueuedPodInfo(highPriorityPodInfo.Pod, "fooPlugin"))
1866+
hpp1QueuedPodInfo := newAttemptedQueuedPodInfo(q.newQueuedPodInfo(hpp1))
18611867
err = q.AddUnschedulableIfNotPresent(logger, unschedulableQueuedPodInfo, q.SchedulingCycle())
18621868
if err != nil {
18631869
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
@@ -2087,11 +2093,11 @@ func TestPriorityQueue_PendingPods(t *testing.T) {
20872093
t.Errorf("Expected: %v after Pop, but got: %v", highPriorityPodInfo.Pod.Name, p.Pod.Name)
20882094
}
20892095
q.Add(logger, medPriorityPodInfo.Pod)
2090-
err := q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(unschedulablePodInfo.Pod, "plugin"), q.SchedulingCycle())
2096+
err := q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(unschedulablePodInfo.Pod, "plugin")), q.SchedulingCycle())
20912097
if err != nil {
20922098
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
20932099
}
2094-
err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(highPriorityPodInfo.Pod, "plugin"), q.SchedulingCycle())
2100+
err = q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(q.newQueuedPodInfo(highPriorityPodInfo.Pod, "plugin")), q.SchedulingCycle())
20952101
if err != nil {
20962102
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
20972103
}
@@ -2740,6 +2746,7 @@ var (
27402746
Reason: v1.PodReasonUnschedulable,
27412747
Message: "fake scheduling failure",
27422748
})
2749+
pInfo = newAttemptedQueuedPodInfo(pInfo)
27432750
}
27442751
queue.unschedulablePods.addOrUpdate(pInfo)
27452752
}
@@ -2884,6 +2891,16 @@ func TestPendingPodsMetric(t *testing.T) {
28842891
totalWithDelay := 20
28852892
pInfosWithDelay := makeQueuedPodInfos(totalWithDelay, "z", queueable, timestamp.Add(2*time.Second))
28862893

2894+
resetPodInfos := func() {
2895+
// reset PodInfo's Attempts because they influence the backoff time calculation.
2896+
for i := range pInfos {
2897+
pInfos[i].Attempts = 0
2898+
}
2899+
for i := range pInfosWithDelay {
2900+
pInfosWithDelay[i].Attempts = 0
2901+
}
2902+
}
2903+
28872904
tests := []struct {
28882905
name string
28892906
operations []operation
@@ -3118,6 +3135,7 @@ scheduler_plugin_execution_duration_seconds_count{extension_point="PreEnqueue",p
31183135
for _, test := range tests {
31193136
t.Run(test.name, func(t *testing.T) {
31203137
resetMetrics()
3138+
resetPodInfos()
31213139
logger, ctx := ktesting.NewTestContext(t)
31223140
ctx, cancel := context.WithCancel(ctx)
31233141
defer cancel()
@@ -3489,7 +3507,7 @@ func TestMoveAllToActiveOrBackoffQueue_PreEnqueueChecks(t *testing.T) {
34893507
t.Errorf("Expected: %v after Pop, but got: %v", podInfo.Pod.Name, p.Pod.Name)
34903508
}
34913509
podInfo.UnschedulablePlugins = sets.New("plugin")
3492-
err := q.AddUnschedulableIfNotPresent(logger, podInfo, q.schedulingCycle)
3510+
err := q.AddUnschedulableIfNotPresent(logger, newAttemptedQueuedPodInfo(podInfo), q.schedulingCycle)
34933511
if err != nil {
34943512
t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err)
34953513
}
@@ -3902,3 +3920,8 @@ func Test_queuedPodInfo_gatedSetUponCreationAndUnsetUponUpdate(t *testing.T) {
39023920
t.Error("expected pod to be ungated")
39033921
}
39043922
}
3923+
3924+
func newAttemptedQueuedPodInfo(podInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo {
3925+
podInfo.Attempts++
3926+
return podInfo
3927+
}

0 commit comments

Comments
 (0)