Skip to content

Commit ceb1dbd

Browse files
statefulsets: MinReadySeconds implementation
kubernetes#100842 introduced featuregate. This PR implements the logic behind it.
1 parent bc8acbc commit ceb1dbd

File tree

6 files changed

+364
-49
lines changed

6 files changed

+364
-49
lines changed

pkg/controller/statefulset/stateful_set.go

+31-2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"k8s.io/apimachinery/pkg/labels"
3030
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3131
"k8s.io/apimachinery/pkg/util/wait"
32+
utilfeature "k8s.io/apiserver/pkg/util/feature"
3233
appsinformers "k8s.io/client-go/informers/apps/v1"
3334
coreinformers "k8s.io/client-go/informers/core/v1"
3435
clientset "k8s.io/client-go/kubernetes"
@@ -39,8 +40,10 @@ import (
3940
"k8s.io/client-go/tools/cache"
4041
"k8s.io/client-go/tools/record"
4142
"k8s.io/client-go/util/workqueue"
43+
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
4244
"k8s.io/kubernetes/pkg/controller"
4345
"k8s.io/kubernetes/pkg/controller/history"
46+
"k8s.io/kubernetes/pkg/features"
4447

4548
"k8s.io/klog/v2"
4649
)
@@ -85,7 +88,6 @@ func NewStatefulSetController(
8588
eventBroadcaster.StartStructuredLogging(0)
8689
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
8790
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "statefulset-controller"})
88-
8991
ssc := &StatefulSetController{
9092
kubeClient: kubeClient,
9193
control: NewDefaultStatefulSetControl(
@@ -221,6 +223,15 @@ func (ssc *StatefulSetController) updatePod(old, cur interface{}) {
221223
}
222224
klog.V(4).Infof("Pod %s updated, objectMeta %+v -> %+v.", curPod.Name, oldPod.ObjectMeta, curPod.ObjectMeta)
223225
ssc.enqueueStatefulSet(set)
226+
// TODO: MinReadySeconds in the Pod will generate an Available condition to be added in
227+
// the Pod status which in turn will trigger a requeue of the owning StatefulSet, thus
228+
// having its status updated with the newly available replica.
229+
if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetMinReadySeconds) && !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod) && set.Spec.MinReadySeconds > 0 {
230+
klog.V(2).Infof("StatefulSet %s will be enqueued after %ds for availability check", set.Name, set.Spec.MinReadySeconds)
231+
// Add a second to avoid milliseconds skew in AddAfter.
232+
// See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info.
233+
ssc.enqueueSSAfter(set, (time.Duration(set.Spec.MinReadySeconds)*time.Second)+time.Second)
234+
}
224235
return
225236
}
226237

@@ -380,6 +391,16 @@ func (ssc *StatefulSetController) enqueueStatefulSet(obj interface{}) {
380391
ssc.queue.Add(key)
381392
}
382393

394+
// enqueueSSAfter enqueues the given StatefulSet in the work queue after the given duration.
395+
func (ssc *StatefulSetController) enqueueSSAfter(ss *apps.StatefulSet, duration time.Duration) {
396+
key, err := controller.KeyFunc(ss)
397+
if err != nil {
398+
utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ss, err))
399+
return
400+
}
401+
ssc.queue.AddAfter(key, duration)
402+
}
403+
383404
// processNextWorkItem dequeues items, processes them, and marks them done. It enforces that the syncHandler is never
384405
// invoked concurrently with the same key.
385406
func (ssc *StatefulSetController) processNextWorkItem() bool {
@@ -446,10 +467,18 @@ func (ssc *StatefulSetController) sync(key string) error {
446467
// syncStatefulSet syncs a tuple of (statefulset, []*v1.Pod).
447468
func (ssc *StatefulSetController) syncStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error {
448469
klog.V(4).Infof("Syncing StatefulSet %v/%v with %d pods", set.Namespace, set.Name, len(pods))
470+
var status *apps.StatefulSetStatus
471+
var err error
449472
// TODO: investigate where we mutate the set during the update as it is not obvious.
450-
if err := ssc.control.UpdateStatefulSet(set.DeepCopy(), pods); err != nil {
473+
status, err = ssc.control.UpdateStatefulSet(set.DeepCopy(), pods)
474+
if err != nil {
451475
return err
452476
}
453477
klog.V(4).Infof("Successfully synced StatefulSet %s/%s successful", set.Namespace, set.Name)
478+
// One more sync to handle clock skew. This also helps by requeuing right after the status update.
479+
if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetMinReadySeconds) && set.Spec.MinReadySeconds > 0 && status != nil && status.AvailableReplicas != *set.Spec.Replicas {
480+
ssc.enqueueSSAfter(set, time.Duration(set.Spec.MinReadySeconds)*time.Second)
481+
}
482+
454483
return nil
455484
}

pkg/controller/statefulset/stateful_set_control.go

+56-25
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ import (
2323
v1 "k8s.io/api/core/v1"
2424
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2525
utilerrors "k8s.io/apimachinery/pkg/util/errors"
26+
utilfeature "k8s.io/apiserver/pkg/util/feature"
2627
"k8s.io/client-go/tools/record"
2728
"k8s.io/klog/v2"
2829
"k8s.io/kubernetes/pkg/controller/history"
30+
"k8s.io/kubernetes/pkg/features"
2931
)
3032

3133
// StatefulSetControl implements the control logic for updating StatefulSets and their children Pods. It is implemented
@@ -36,7 +38,7 @@ type StatefulSetControlInterface interface {
3638
// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
3739
// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
3840
// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
39-
UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error
41+
UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error)
4042
// ListRevisions returns an array of the ControllerRevisions that represent the revisions of set. If the returned
4143
// error is nil, the returned slice of ControllerRevisions is valid.
4244
ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
@@ -71,60 +73,57 @@ type defaultStatefulSetControl struct {
7173
// strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and
7274
// in no particular order. Clients using the burst strategy should be careful to ensure they
7375
// understand the consistency implications of having unpredictable numbers of pods available.
74-
func (ssc *defaultStatefulSetControl) UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) error {
75-
76+
func (ssc *defaultStatefulSetControl) UpdateStatefulSet(set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
7677
// list all revisions and sort them
7778
revisions, err := ssc.ListRevisions(set)
7879
if err != nil {
79-
return err
80+
return nil, err
8081
}
8182
history.SortControllerRevisions(revisions)
8283

83-
currentRevision, updateRevision, err := ssc.performUpdate(set, pods, revisions)
84+
currentRevision, updateRevision, status, err := ssc.performUpdate(set, pods, revisions)
8485
if err != nil {
85-
return utilerrors.NewAggregate([]error{err, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)})
86+
return nil, utilerrors.NewAggregate([]error{err, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)})
8687
}
8788

8889
// maintain the set's revision history limit
89-
return ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)
90+
return status, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)
9091
}
9192

9293
func (ssc *defaultStatefulSetControl) performUpdate(
93-
set *apps.StatefulSet, pods []*v1.Pod, revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, error) {
94-
94+
set *apps.StatefulSet, pods []*v1.Pod, revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, *apps.StatefulSetStatus, error) {
95+
var currentStatus *apps.StatefulSetStatus
9596
// get the current, and update revisions
9697
currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions)
9798
if err != nil {
98-
return currentRevision, updateRevision, err
99+
return currentRevision, updateRevision, currentStatus, err
99100
}
100101

101102
// perform the main update function and get the status
102-
status, err := ssc.updateStatefulSet(set, currentRevision, updateRevision, collisionCount, pods)
103+
currentStatus, err = ssc.updateStatefulSet(set, currentRevision, updateRevision, collisionCount, pods)
103104
if err != nil {
104-
return currentRevision, updateRevision, err
105+
return currentRevision, updateRevision, currentStatus, err
105106
}
106-
107107
// update the set's status
108-
err = ssc.updateStatefulSetStatus(set, status)
108+
err = ssc.updateStatefulSetStatus(set, currentStatus)
109109
if err != nil {
110-
return currentRevision, updateRevision, err
110+
return currentRevision, updateRevision, currentStatus, err
111111
}
112-
113112
klog.V(4).Infof("StatefulSet %s/%s pod status replicas=%d ready=%d current=%d updated=%d",
114113
set.Namespace,
115114
set.Name,
116-
status.Replicas,
117-
status.ReadyReplicas,
118-
status.CurrentReplicas,
119-
status.UpdatedReplicas)
115+
currentStatus.Replicas,
116+
currentStatus.ReadyReplicas,
117+
currentStatus.CurrentReplicas,
118+
currentStatus.UpdatedReplicas)
120119

121120
klog.V(4).Infof("StatefulSet %s/%s revisions current=%s update=%s",
122121
set.Namespace,
123122
set.Name,
124-
status.CurrentRevision,
125-
status.UpdateRevision)
123+
currentStatus.CurrentRevision,
124+
currentStatus.UpdateRevision)
126125

127-
return currentRevision, updateRevision, nil
126+
return currentRevision, updateRevision, currentStatus, nil
128127
}
129128

130129
func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) {
@@ -307,6 +306,15 @@ func (ssc *defaultStatefulSetControl) updateStatefulSet(
307306
// count the number of running and ready replicas
308307
if isRunningAndReady(pods[i]) {
309308
status.ReadyReplicas++
309+
// count the number of running and available replicas
310+
if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetMinReadySeconds) {
311+
if isRunningAndAvailable(pods[i], set.Spec.MinReadySeconds) {
312+
status.AvailableReplicas++
313+
}
314+
} else {
315+
// If the featuregate is not enabled, all the ready replicas should be considered as available replicas
316+
status.AvailableReplicas = status.ReadyReplicas
317+
}
310318
}
311319

312320
// count the number of current and update replicas
@@ -447,6 +455,19 @@ func (ssc *defaultStatefulSetControl) updateStatefulSet(
447455
replicas[i].Name)
448456
return &status, nil
449457
}
458+
// If we have a Pod that has been created but is not available we can not make progress.
459+
// We must ensure that, for each Pod, when we create it, all of its predecessors, with respect to its
460+
// ordinal, are Available.
461+
// TODO: Since available is superset of Ready, once we have this featuregate enabled by default, we can remove the
462+
// isRunningAndReady block as only Available pods should be brought down.
463+
if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetMinReadySeconds) && !isRunningAndAvailable(replicas[i], set.Spec.MinReadySeconds) && monotonic {
464+
klog.V(4).Infof(
465+
"StatefulSet %s/%s is waiting for Pod %s to be Available",
466+
set.Namespace,
467+
set.Name,
468+
replicas[i].Name)
469+
return &status, nil
470+
}
450471
// Enforce the StatefulSet invariants
451472
if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) {
452473
continue
@@ -458,7 +479,7 @@ func (ssc *defaultStatefulSetControl) updateStatefulSet(
458479
}
459480
}
460481

461-
// At this point, all of the current Replicas are Running and Ready, we can consider termination.
482+
// At this point, all of the current Replicas are Running, Ready and Available, we can consider termination.
462483
// We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
463484
// We will terminate Pods in a monotonically decreasing order over [len(pods),set.Spec.Replicas).
464485
// Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
@@ -486,6 +507,17 @@ func (ssc *defaultStatefulSetControl) updateStatefulSet(
486507
firstUnhealthyPod.Name)
487508
return &status, nil
488509
}
510+
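// If we are in monotonic mode and the condemned target is neither Available nor the first unhealthy Pod, block. NOTE(review): the log call below reads firstUnhealthyPod.Name — confirm firstUnhealthyPod cannot be nil on this path.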
511+
// TODO: Since available is superset of Ready, once we have this featuregate enabled by default, we can remove the
512+
// isRunningAndReady block as only Available pods should be brought down.
513+
if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetMinReadySeconds) && !isRunningAndAvailable(condemned[target], set.Spec.MinReadySeconds) && monotonic && condemned[target] != firstUnhealthyPod {
514+
klog.V(4).Infof(
515+
"StatefulSet %s/%s is waiting for Pod %s to be Available prior to scale down",
516+
set.Namespace,
517+
set.Name,
518+
firstUnhealthyPod.Name)
519+
return &status, nil
520+
}
489521
klog.V(2).Infof("StatefulSet %s/%s terminating Pod %s for scale down",
490522
set.Namespace,
491523
set.Name,
@@ -549,7 +581,6 @@ func (ssc *defaultStatefulSetControl) updateStatefulSet(
549581
func (ssc *defaultStatefulSetControl) updateStatefulSetStatus(
550582
set *apps.StatefulSet,
551583
status *apps.StatefulSetStatus) error {
552-
553584
// complete any in progress rolling update if necessary
554585
completeRollingUpdate(set, status)
555586

0 commit comments

Comments
 (0)