Skip to content

Commit 54e9f08

Browse files
committed
OCPBUGS-23514: Better message when a CO update takes too long
When it takes too long (90m+ for machine-config and 30m+ for the others) to upgrade a cluster operator, clusterversion now shows a message indicating that the upgrade might have hit some issue. This covers the case in the related OCPBUGS-23538: for some reason, the pod under the deployment that manages the CO hits CrashLoopBackOff. The deployment controller does not surface useful conditions in this situation [1]; otherwise, checkDeploymentHealth [2] would detect it. Instead of having CVO figure out the underlying pod's CrashLoopBackOff — which would be better implemented in the deployment controller — the expectation is that the cluster admin starts to dig into the cluster when such a message pops up. For now, we only modify the condition's message. We could propagate Failing=True if such a requirement is collected from customers. [1]. kubernetes/kubernetes#106054 [2]. https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
1 parent ff3778c commit 54e9f08

File tree

3 files changed

+103
-1
lines changed

3 files changed

+103
-1
lines changed

pkg/cvo/status.go

+8
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,14 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
555555
case payload.UpdateEffectReport:
556556
return uErr.Reason, uErr.Error(), false
557557
case payload.UpdateEffectNone:
558+
m := time.Duration(30)
559+
// It takes longer to upgrade MCO
560+
if uErr.Name == "machine-config" {
561+
m = 3 * m
562+
}
563+
if !payload.COUpdateStartTimesGet(uErr.Name).IsZero() && payload.COUpdateStartTimesGet(uErr.Name).Before(now.Add(-(m * time.Minute))) {
564+
return uErr.Reason, fmt.Sprintf("waiting on %s over %d minutes which is longer than expected", uErr.Name, m), true
565+
}
558566
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
559567
case payload.UpdateEffectFail:
560568
return "", "", false

pkg/cvo/status_test.go

+88
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"reflect"
77
"testing"
8+
"time"
89

910
"github.com/google/go-cmp/cmp"
1011
"github.com/google/go-cmp/cmp/cmpopts"
@@ -358,12 +359,22 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
358359
type args struct {
359360
syncWorkerStatus *SyncWorkerStatus
360361
}
362+
payload.COUpdateStartTimesEnsure("co-not-timeout")
363+
payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute))
364+
payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute))
365+
defer func() {
366+
payload.COUpdateStartTimesRemove("co-not-timeout")
367+
payload.COUpdateStartTimesRemove("co-timeout")
368+
payload.COUpdateStartTimesRemove("machine-config")
369+
}()
370+
361371
tests := []struct {
362372
name string
363373
args args
364374
shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
365375
expectedConditionNotModified *configv1.ClusterOperatorStatusCondition
366376
expectedConditionModified *configv1.ClusterOperatorStatusCondition
377+
expectedProgressingCondition *configv1.ClusterOperatorStatusCondition
367378
}{
368379
{
369380
name: "no errors are present",
@@ -394,10 +405,74 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
394405
name: "single UpdateEffectNone error",
395406
args: args{
396407
syncWorkerStatus: &SyncWorkerStatus{
408+
Actual: configv1.Release{Version: "1.2.3"},
409+
Failure: &payload.UpdateError{
410+
UpdateEffect: payload.UpdateEffectNone,
411+
Reason: "ClusterOperatorUpdating",
412+
Message: "Cluster operator A is updating",
413+
Name: "co-not-timeout",
414+
},
415+
},
416+
},
417+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
418+
Type: ClusterStatusFailing,
419+
Status: configv1.ConditionTrue,
420+
Reason: "ClusterOperatorUpdating",
421+
Message: "Cluster operator A is updating",
422+
},
423+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
424+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
425+
Type: ClusterStatusFailing,
426+
Status: configv1.ConditionFalse,
427+
},
428+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
429+
Type: configv1.OperatorProgressing,
430+
Status: configv1.ConditionTrue,
431+
Reason: "ClusterOperatorUpdating",
432+
Message: "Working towards 1.2.3: waiting on co-not-timeout",
433+
},
434+
},
435+
{
436+
name: "single UpdateEffectNone error and timeout",
437+
args: args{
438+
syncWorkerStatus: &SyncWorkerStatus{
439+
Actual: configv1.Release{Version: "1.2.3"},
440+
Failure: &payload.UpdateError{
441+
UpdateEffect: payload.UpdateEffectNone,
442+
Reason: "ClusterOperatorUpdating",
443+
Message: "Cluster operator A is updating",
444+
Name: "co-timeout",
445+
},
446+
},
447+
},
448+
expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
449+
Type: ClusterStatusFailing,
450+
Status: configv1.ConditionTrue,
451+
Reason: "ClusterOperatorUpdating",
452+
Message: "Cluster operator A is updating",
453+
},
454+
shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
455+
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
456+
Type: ClusterStatusFailing,
457+
Status: configv1.ConditionFalse,
458+
},
459+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
460+
Type: configv1.OperatorProgressing,
461+
Status: configv1.ConditionTrue,
462+
Reason: "ClusterOperatorUpdating",
463+
Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
464+
},
465+
},
466+
{
467+
name: "single UpdateEffectNone error and machine-config",
468+
args: args{
469+
syncWorkerStatus: &SyncWorkerStatus{
470+
Actual: configv1.Release{Version: "1.2.3"},
397471
Failure: &payload.UpdateError{
398472
UpdateEffect: payload.UpdateEffectNone,
399473
Reason: "ClusterOperatorUpdating",
400474
Message: "Cluster operator A is updating",
475+
Name: "machine-config",
401476
},
402477
},
403478
},
@@ -412,6 +487,12 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
412487
Type: ClusterStatusFailing,
413488
Status: configv1.ConditionFalse,
414489
},
490+
expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
491+
Type: configv1.OperatorProgressing,
492+
Status: configv1.ConditionTrue,
493+
Reason: "ClusterOperatorUpdating",
494+
Message: "Working towards 1.2.3: waiting on machine-config",
495+
},
415496
},
416497
{
417498
name: "single condensed UpdateEffectFail UpdateError",
@@ -621,6 +702,13 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
621702
if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
622703
t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
623704
}
705+
706+
if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
707+
progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
708+
if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
709+
t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
710+
}
711+
}
624712
}
625713
})
626714
}

pkg/payload/task.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,15 @@ func InitCOUpdateStartTimes() {
4747
// COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to
4848
// current time if name does not already exist in map.
4949
func COUpdateStartTimesEnsure(name string) {
50+
COUpdateStartTimesAt(name, time.Now())
51+
}
52+
53+
// COUpdateStartTimesAt adds name to clusterOperatorUpdateStartTimes map and sets to
54+
// t if name does not already exist in map.
55+
func COUpdateStartTimesAt(name string, t time.Time) {
5056
clusterOperatorUpdateStartTimes.lock.Lock()
5157
if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok {
52-
clusterOperatorUpdateStartTimes.m[name] = time.Now()
58+
clusterOperatorUpdateStartTimes.m[name] = t
5359
}
5460
clusterOperatorUpdateStartTimes.lock.Unlock()
5561
}

0 commit comments

Comments
 (0)