Skip to content

Commit 1c87755

Browse files
committed
OCPBUGS-23514: Failing=Unknown upon long CO updating
When it takes too long (90m+ for machine-config and 30m+ for others) to upgrade a cluster operator, clusterversion shows a message with the indication that the upgrade might hit some issue. This will cover the case in the related OCPBUGS-23538: for some reason, the pod under the deployment that manages the CO hit CrashLoopBackOff. Deployment controller does not give useful conditions in this situation [1]. Otherwise, checkDeploymentHealth [2] would detect it. Instead of CVO's figuring out the underlying pod's CrashLoopBackOff which might be better to be implemented by deployment controller, it is expected that our cluster admin starts to dig into the cluster when such a message pops up. In addition to the condition's message. We propagate Fail=Unknown to make it available for other automations, such as update-status command. [1]. kubernetes/kubernetes#106054 [2]. https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
1 parent ff3778c commit 1c87755

File tree

3 files changed

+115
-1
lines changed

3 files changed

+115
-1
lines changed

Diff for: pkg/cvo/status.go

+18
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,14 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
327327
failingCondition.Reason = failingReason
328328
failingCondition.Message = failingMessage
329329
}
330+
if failure != nil &&
331+
skipFailure &&
332+
progressReason == "ClusterOperatorUpdating" &&
333+
strings.Contains(progressMessage, "longer than expected") {
334+
failingCondition.Status = configv1.ConditionUnknown
335+
failingCondition.Reason = "SlowClusterOperator"
336+
failingCondition.Message = progressMessage
337+
}
330338
resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
331339

332340
// update progressing
@@ -555,6 +563,16 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
555563
case payload.UpdateEffectReport:
556564
return uErr.Reason, uErr.Error(), false
557565
case payload.UpdateEffectNone:
566+
m := time.Duration(30)
567+
// It takes longer to upgrade MCO
568+
if uErr.Name == "machine-config" {
569+
m = 3 * m
570+
}
571+
t := payload.COUpdateStartTimesGet(uErr.Name)
572+
if (!t.IsZero()) && t.Before(now.Add(-(m * time.Minute))) {
573+
// returns true because it is still only a suspicion
574+
return uErr.Reason, fmt.Sprintf("waiting on %s over %d minutes which is longer than expected", uErr.Name, m), true
575+
}
558576
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
559577
case payload.UpdateEffectFail:
560578
return "", "", false

Diff for: pkg/cvo/status_test.go

+90
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"reflect"
77
"testing"
8+
"time"
89

910
"github.com/google/go-cmp/cmp"
1011
"github.com/google/go-cmp/cmp/cmpopts"
@@ -358,12 +359,22 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
358359
type args struct {
359360
syncWorkerStatus *SyncWorkerStatus
360361
}
362+
payload.COUpdateStartTimesEnsure("co-not-timeout")
363+
payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute))
364+
payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute))
365+
defer func() {
366+
payload.COUpdateStartTimesRemove("co-not-timeout")
367+
payload.COUpdateStartTimesRemove("co-timeout")
368+
payload.COUpdateStartTimesRemove("machine-config")
369+
}()
370+
361371
tests := []struct {
362372
name string
363373
args args
364374
shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
365375
expectedConditionNotModified *configv1.ClusterOperatorStatusCondition
366376
expectedConditionModified *configv1.ClusterOperatorStatusCondition
377+
expectedProgressingCondition *configv1.ClusterOperatorStatusCondition
367378
}{
368379
{
369380
name: "no errors are present",
@@ -394,10 +405,12 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
394405
name: "single UpdateEffectNone error",
395406
args: args{
396407
syncWorkerStatus: &SyncWorkerStatus{
408+
Actual: configv1.Release{Version: "1.2.3"},
397409
Failure: &payload.UpdateError{
398410
UpdateEffect: payload.UpdateEffectNone,
399411
Reason: "ClusterOperatorUpdating",
400412
Message: "Cluster operator A is updating",
413+
Name: "co-not-timeout",
401414
},
402415
},
403416
},
@@ -412,6 +425,76 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
412425
Type: ClusterStatusFailing,
413426
Status: configv1.ConditionFalse,
414427
},
428+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
429+
Type: configv1.OperatorProgressing,
430+
Status: configv1.ConditionTrue,
431+
Reason: "ClusterOperatorUpdating",
432+
Message: "Working towards 1.2.3: waiting on co-not-timeout",
433+
},
434+
},
435+
{
436+
name: "single UpdateEffectNone error and timeout",
437+
args: args{
438+
syncWorkerStatus: &SyncWorkerStatus{
439+
Actual: configv1.Release{Version: "1.2.3"},
440+
Failure: &payload.UpdateError{
441+
UpdateEffect: payload.UpdateEffectNone,
442+
Reason: "ClusterOperatorUpdating",
443+
Message: "Cluster operator A is updating",
444+
Name: "co-timeout",
445+
},
446+
},
447+
},
448+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
449+
Type: ClusterStatusFailing,
450+
Status: configv1.ConditionTrue,
451+
Reason: "ClusterOperatorUpdating",
452+
Message: "Cluster operator A is updating",
453+
},
454+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
455+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
456+
Type: ClusterStatusFailing,
457+
Status: configv1.ConditionUnknown,
458+
Reason: "SlowClusterOperator",
459+
Message: "waiting on co-timeout over 30 minutes which is longer than expected",
460+
},
461+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
462+
Type: configv1.OperatorProgressing,
463+
Status: configv1.ConditionTrue,
464+
Reason: "ClusterOperatorUpdating",
465+
Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
466+
},
467+
},
468+
{
469+
name: "single UpdateEffectNone error and machine-config",
470+
args: args{
471+
syncWorkerStatus: &SyncWorkerStatus{
472+
Actual: configv1.Release{Version: "1.2.3"},
473+
Failure: &payload.UpdateError{
474+
UpdateEffect: payload.UpdateEffectNone,
475+
Reason: "ClusterOperatorUpdating",
476+
Message: "Cluster operator A is updating",
477+
Name: "machine-config",
478+
},
479+
},
480+
},
481+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
482+
Type: ClusterStatusFailing,
483+
Status: configv1.ConditionTrue,
484+
Reason: "ClusterOperatorUpdating",
485+
Message: "Cluster operator A is updating",
486+
},
487+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
488+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
489+
Type: ClusterStatusFailing,
490+
Status: configv1.ConditionFalse,
491+
},
492+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
493+
Type: configv1.OperatorProgressing,
494+
Status: configv1.ConditionTrue,
495+
Reason: "ClusterOperatorUpdating",
496+
Message: "Working towards 1.2.3: waiting on machine-config",
497+
},
415498
},
416499
{
417500
name: "single condensed UpdateEffectFail UpdateError",
@@ -621,6 +704,13 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
621704
if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
622705
t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
623706
}
707+
708+
if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
709+
progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
710+
if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
711+
t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
712+
}
713+
}
624714
}
625715
})
626716
}

Diff for: pkg/payload/task.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,15 @@ func InitCOUpdateStartTimes() {
4747
// COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to
4848
// current time if name does not already exist in map.
4949
func COUpdateStartTimesEnsure(name string) {
50+
COUpdateStartTimesAt(name, time.Now())
51+
}
52+
53+
// COUpdateStartTimesAt adds name to clusterOperatorUpdateStartTimes map and sets to
54+
// t if name does not already exist in map.
55+
func COUpdateStartTimesAt(name string, t time.Time) {
5056
clusterOperatorUpdateStartTimes.lock.Lock()
5157
if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok {
52-
clusterOperatorUpdateStartTimes.m[name] = time.Now()
58+
clusterOperatorUpdateStartTimes.m[name] = t
5359
}
5460
clusterOperatorUpdateStartTimes.lock.Unlock()
5561
}

0 commit comments

Comments
 (0)