From ba7f4f77fc394e49522fdeadc8f58a722259f06d Mon Sep 17 00:00:00 2001
From: Hongkai Liu
Date: Thu, 27 Feb 2025 20:22:40 -0500
Subject: [PATCH 1/3] OCPBUGS-23514: Failing=Unknown upon long CO updating

When it takes too long (90m+ for machine-config and 30m+ for others) to
upgrade a cluster operator, ClusterVersion shows a message indicating
that the upgrade might have hit an issue.

This covers the case in the related OCPBUGS-23538: for some reason, the
pod under the deployment that manages the CO hits CrashLoopBackOff. The
deployment controller does not provide useful conditions in this
situation [1]; otherwise, checkDeploymentHealth [2] would detect it.

Instead of the CVO figuring out the underlying pod's CrashLoopBackOff,
which is arguably better implemented in the deployment controller, the
expectation is that the cluster admin starts digging into the cluster
when such a message pops up.

In addition to the condition's message, we propagate Failing=Unknown to
make the state available to other automation, such as the update-status
command.

[1]. https://github.com/kubernetes/kubernetes/issues/106054
[2]. https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136
---
 pkg/cvo/status.go      |  44 ++++++-
 pkg/cvo/status_test.go | 266 +++++++++++++++++++++++++++++++++++++++++
 pkg/payload/task.go    |   8 +-
 3 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go
index 80bfdcbbc..899939d0d 100644
--- a/pkg/cvo/status.go
+++ b/pkg/cvo/status.go
@@ -327,6 +327,15 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
 		failingCondition.Reason = failingReason
 		failingCondition.Message = failingMessage
 	}
+	if failure != nil &&
+		skipFailure &&
+		(progressReason == "ClusterOperatorUpdating" || progressReason == "ClusterOperatorsUpdating") &&
+		strings.HasPrefix(progressMessage, slowCOUpdatePrefix) {
+		progressMessage = strings.TrimPrefix(progressMessage, slowCOUpdatePrefix)
+		failingCondition.Status = configv1.ConditionUnknown
+		failingCondition.Reason = "SlowClusterOperator"
+		failingCondition.Message = progressMessage
+	}
 	resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
 
 	// update progressing
@@ -537,6 +546,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus,
 	}
 }
 
+const slowCOUpdatePrefix = "Slow::"
+
 // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
 // still making internal progress. The general error we try to suppress is an operator or operators still being
 // progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines
@@ -555,7 +566,38 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
 	case payload.UpdateEffectReport:
 		return uErr.Reason, uErr.Error(), false
 	case payload.UpdateEffectNone:
-		return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
+		var exceeded []string
+		names := uErr.Names
+		if len(names) == 0 {
+			names = []string{uErr.Name}
+		}
+		var machineConfig bool
+		for _, name := range names {
+			m := 30 * time.Minute
+			// It takes longer to upgrade MCO
+			if name == "machine-config" {
+				m = 3 * m
+			}
+			t := payload.COUpdateStartTimesGet(name)
+			if (!t.IsZero()) && t.Before(now.Add(-(m))) {
+				if name == "machine-config" {
+					machineConfig = true
+				} else {
+					exceeded = append(exceeded, name)
+				}
+			}
+		}
+		// returns true in those slow cases because it is still only a suspicion
+		if len(exceeded) > 0 && !machineConfig {
+			return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
+		}
+		if len(exceeded) > 0 && machineConfig {
+			return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
+		}
+		if len(exceeded) == 0 && machineConfig {
+			return uErr.Reason, fmt.Sprintf("%swaiting on machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix), true
+		}
+		return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
 	case payload.UpdateEffectFail:
 		return "", "", false
 	case payload.UpdateEffectFailAfterInterval:
diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go
index c2aae33af..e70557556 100644
--- a/pkg/cvo/status_test.go
+++ b/pkg/cvo/status_test.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
@@ -358,12 +359,27 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 	type args struct {
 		syncWorkerStatus *SyncWorkerStatus
 	}
+	payload.COUpdateStartTimesEnsure("co-not-timeout")
+	payload.COUpdateStartTimesEnsure("co-bar-not-timeout")
+	payload.COUpdateStartTimesEnsure("co-foo-not-timeout")
+	payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute))
+	payload.COUpdateStartTimesAt("co-bar-timeout", time.Now().Add(-60*time.Minute))
+	defer func() {
+		payload.COUpdateStartTimesRemove("co-not-timeout")
+		payload.COUpdateStartTimesRemove("co-bar-not-timeout")
+		payload.COUpdateStartTimesRemove("co-foo-not-timeout")
+		payload.COUpdateStartTimesRemove("co-timeout")
+		payload.COUpdateStartTimesRemove("co-bar-timeout")
+	}()
+
 	tests := []struct {
 		name                                             string
 		args                                             args
 		shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool
 		expectedConditionNotModified                     *configv1.ClusterOperatorStatusCondition
 		expectedConditionModified                        *configv1.ClusterOperatorStatusCondition
+		machineConfigTimeout                             bool
+		expectedProgressingCondition                     *configv1.ClusterOperatorStatusCondition
 	}{
 		{
 			name: "no errors are present",
@@ -394,10 +410,12 @@
 			name: "single UpdateEffectNone error",
 			args: args{
 				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
 					Failure: &payload.UpdateError{
 						UpdateEffect: payload.UpdateEffectNone,
 						Reason:       "ClusterOperatorUpdating",
 						Message:      "Cluster operator A is updating",
+						Name:         "co-not-timeout",
 					},
 				},
 			},
@@ -412,6 +430,110 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 				Type:   ClusterStatusFailing,
 				Status: configv1.ConditionFalse,
 			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards 1.2.3: waiting on co-not-timeout",
+			},
+		},
+		{
+			name: "single UpdateEffectNone error and timeout",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorUpdating",
+						Message:      "Cluster operator A is updating",
+						Name:         "co-timeout",
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Cluster operator A is updating",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionUnknown,
+				Reason:  "SlowClusterOperator",
+				Message: "waiting on co-timeout over 30 minutes which is longer than expected",
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
+			},
+		},
+		{
+			name: "single UpdateEffectNone error and machine-config",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorUpdating",
+						Message:      "Cluster operator A is updating",
+						Name:         "machine-config",
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Cluster operator A is updating",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:   ClusterStatusFailing,
+				Status: configv1.ConditionFalse,
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards 1.2.3: waiting on machine-config",
+			},
+		},
+		{
+			name: "single UpdateEffectNone error and machine-config timeout",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorUpdating",
+						Message:      "Cluster operator A is updating",
+						Name:         "machine-config",
+					},
+				},
+			},
+			machineConfigTimeout: true,
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Cluster operator A is updating",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionUnknown,
+				Reason:  "SlowClusterOperator",
+				Message: "waiting on machine-config over 90 minutes which is longer than expected",
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorUpdating",
+				Message: "Working towards 1.2.3: waiting on machine-config over 90 minutes which is longer than expected",
+			},
 		},
 		{
 			name: "single condensed UpdateEffectFail UpdateError",
@@ -587,6 +709,137 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t
 				Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator C is degraded",
 			},
 		},
+		{
+			name: "MultipleErrors: all updating and none slow",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorsUpdating",
+						Message:      "some-message",
+						Names:        []string{"co-not-timeout", "co-bar-not-timeout", "co-foo-not-timeout"},
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "some-message",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:   ClusterStatusFailing,
+				Status: configv1.ConditionFalse,
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "Working towards 1.2.3: waiting on co-not-timeout, co-bar-not-timeout, co-foo-not-timeout",
+			},
+		},
+		{
+			name: "MultipleErrors: all updating and one slow",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorsUpdating",
+						Message:      "some-message",
+						Names:        []string{"co-timeout", "co-bar-not-timeout", "co-foo-not-timeout"},
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "some-message",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionUnknown,
+				Reason:  "SlowClusterOperator",
+				Message: "waiting on co-timeout over 30 minutes which is longer than expected",
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected",
+			},
+		},
+		{
+			name: "MultipleErrors: all updating and some slow",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorsUpdating",
+						Message:      "some-message",
+						Names:        []string{"co-timeout", "co-bar-timeout", "co-foo-not-timeout"},
+					},
+				},
+			},
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "some-message",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionUnknown,
+				Reason:  "SlowClusterOperator",
+				Message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
+			},
+		},
+		{
+			name: "MultipleErrors: all updating and all slow",
+			args: args{
+				syncWorkerStatus: &SyncWorkerStatus{
+					Actual: configv1.Release{Version: "1.2.3"},
+					Failure: &payload.UpdateError{
+						UpdateEffect: payload.UpdateEffectNone,
+						Reason:       "ClusterOperatorsUpdating",
+						Message:      "some-message",
+						Names:        []string{"co-timeout", "co-bar-timeout", "machine-config"},
+					},
+				},
+			},
+			machineConfigTimeout: true,
+			expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "some-message",
+			},
+			shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true,
+			expectedConditionModified: &configv1.ClusterOperatorStatusCondition{
+				Type:    ClusterStatusFailing,
+				Status:  configv1.ConditionUnknown,
+				Reason:  "SlowClusterOperator",
+				Message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
+			},
+			expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{
+				Type:    configv1.OperatorProgressing,
+				Status:  configv1.ConditionTrue,
+				Reason:  "ClusterOperatorsUpdating",
+				Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
+			},
+		},
 	}
 	for _, tc := range tests {
 		tc := tc
@@ -606,6 +859,11 @@
 				{true, false},
 				{true, true},
 			}
+			if tc.machineConfigTimeout {
+				payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-120*time.Minute))
+			} else {
+				payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute))
+			}
 			for _, c := range combinations {
 				tc.args.syncWorkerStatus.Reconciling = c.isReconciling
 				cvStatus := &configv1.ClusterVersionStatus{}
@@ -621,7 +879,15 @@
 				if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" {
 					t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
 				}
+
+				if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty {
+					progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing)
+					if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" {
+						t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff)
+					}
+				}
 			}
+			payload.COUpdateStartTimesRemove("machine-config")
 		})
 	}
 }
diff --git a/pkg/payload/task.go b/pkg/payload/task.go
index 1ac592a56..a260573af 100644
--- a/pkg/payload/task.go
+++ b/pkg/payload/task.go
@@ -47,9 +47,15 @@ func InitCOUpdateStartTimes() {
 // COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to
 // current time if name does not already exist in map.
 func COUpdateStartTimesEnsure(name string) {
+	COUpdateStartTimesAt(name, time.Now())
+}
+
+// COUpdateStartTimesAt adds name to clusterOperatorUpdateStartTimes map and sets to
+// t if name does not already exist in map.
+func COUpdateStartTimesAt(name string, t time.Time) {
 	clusterOperatorUpdateStartTimes.lock.Lock()
 	if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok {
-		clusterOperatorUpdateStartTimes.m[name] = time.Now()
+		clusterOperatorUpdateStartTimes.m[name] = t
 	}
 	clusterOperatorUpdateStartTimes.lock.Unlock()
 }

From 4b6ad66a1250a430debbf3dc70f3e6f75f12944a Mon Sep 17 00:00:00 2001
From: Hongkai Liu
Date: Wed, 26 Mar 2025 09:19:39 -0400
Subject: [PATCH 2/3] Move slowCOUpdatePrefix to Reason instead of Message

---
 pkg/cvo/status.go | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go
index 899939d0d..de6f940c0 100644
--- a/pkg/cvo/status.go
+++ b/pkg/cvo/status.go
@@ -328,14 +328,13 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
 		failingCondition.Message = failingMessage
 	}
 	if failure != nil &&
-		skipFailure &&
-		(progressReason == "ClusterOperatorUpdating" || progressReason == "ClusterOperatorsUpdating") &&
-		strings.HasPrefix(progressMessage, slowCOUpdatePrefix) {
-		progressMessage = strings.TrimPrefix(progressMessage, slowCOUpdatePrefix)
+		strings.HasPrefix(progressReason, slowCOUpdatePrefix) {
 		failingCondition.Status = configv1.ConditionUnknown
 		failingCondition.Reason = "SlowClusterOperator"
 		failingCondition.Message = progressMessage
 	}
+	progressReason = strings.TrimPrefix(progressReason, slowCOUpdatePrefix)
+
 	resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)
 
 	// update progressing
@@ -589,13 +588,13 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
 		}
 		// returns true in those slow cases because it is still only a suspicion
 		if len(exceeded) > 0 && !machineConfig {
-			return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
+			return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
 		}
 		if len(exceeded) > 0 && machineConfig {
-			return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true
+			return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
 		}
 		if len(exceeded) == 0 && machineConfig {
-			return uErr.Reason, fmt.Sprintf("%swaiting on machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix), true
+			return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true
 		}
 		return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
 	case payload.UpdateEffectFail:

From 8892f42e8096c375bd03e31eea93a64a25a1fefe Mon Sep 17 00:00:00 2001
From: Hongkai Liu
Date: Wed, 26 Mar 2025 09:29:07 -0400
Subject: [PATCH 3/3] Extract two cases from convertErrorToProgressing into
 their own functions

---
 pkg/cvo/status.go | 98 +++++++++++++++++++++++++----------------------
 1 file changed, 53 insertions(+), 45 deletions(-)

diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go
index de6f940c0..a55739b92 100644
--- a/pkg/cvo/status.go
+++ b/pkg/cvo/status.go
@@ -565,59 +565,67 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
 	case payload.UpdateEffectReport:
 		return uErr.Reason, uErr.Error(), false
 	case payload.UpdateEffectNone:
-		var exceeded []string
-		names := uErr.Names
-		if len(names) == 0 {
-			names = []string{uErr.Name}
-		}
-		var machineConfig bool
-		for _, name := range names {
-			m := 30 * time.Minute
-			// It takes longer to upgrade MCO
-			if name == "machine-config" {
-				m = 3 * m
-			}
-			t := payload.COUpdateStartTimesGet(name)
-			if (!t.IsZero()) && t.Before(now.Add(-(m))) {
-				if name == "machine-config" {
-					machineConfig = true
-				} else {
-					exceeded = append(exceeded, name)
-				}
-			}
-		}
-		// returns true in those slow cases because it is still only a suspicion
-		if len(exceeded) > 0 && !machineConfig {
-			return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
-		}
-		if len(exceeded) > 0 && machineConfig {
-			return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
-		}
-		if len(exceeded) == 0 && machineConfig {
-			return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true
-		}
-		return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
+		return convertErrorToProgressingForUpdateEffectNone(uErr, now)
 	case payload.UpdateEffectFail:
 		return "", "", false
 	case payload.UpdateEffectFailAfterInterval:
-		var exceeded []string
-		threshold := now.Add(-(40 * time.Minute))
-		names := uErr.Names
-		if len(names) == 0 {
-			names = []string{uErr.Name}
-		}
-		for _, name := range names {
-			if payload.COUpdateStartTimesGet(name).Before(threshold) {
+		return convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr, now)
+	}
+	return "", "", false
+}
+
+func convertErrorToProgressingForUpdateEffectNone(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
+	var exceeded []string
+	names := uErr.Names
+	if len(names) == 0 {
+		names = []string{uErr.Name}
+	}
+	var machineConfig bool
+	for _, name := range names {
+		m := 30 * time.Minute
+		// It takes longer to upgrade MCO
+		if name == "machine-config" {
+			m = 3 * m
+		}
+		t := payload.COUpdateStartTimesGet(name)
+		if (!t.IsZero()) && t.Before(now.Add(-(m))) {
+			if name == "machine-config" {
+				machineConfig = true
+			} else {
 				exceeded = append(exceeded, name)
 			}
 		}
-		if len(exceeded) > 0 {
-			return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
-		} else {
-			return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
+	}
+	// returns true in those slow cases because it is still only a suspicion
+	if len(exceeded) > 0 && !machineConfig {
+		return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
+	}
+	if len(exceeded) > 0 && machineConfig {
+		return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
+	}
+	if len(exceeded) == 0 && machineConfig {
+		return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true
+	}
+	return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
+}
+
+func convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
+	var exceeded []string
+	threshold := now.Add(-(40 * time.Minute))
+	names := uErr.Names
+	if len(names) == 0 {
+		names = []string{uErr.Name}
+	}
+	for _, name := range names {
+		if payload.COUpdateStartTimesGet(name).Before(threshold) {
+			exceeded = append(exceeded, name)
 		}
 	}
-	return "", "", false
+	if len(exceeded) > 0 {
+		return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
+	} else {
+		return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
+	}
 }
 
 // syncFailingStatus handles generic errors in the cluster version. It tries to preserve