Skip to content

OCPBUGS-23514: Set Failing=Unknown upon a long CO update #1165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 63 additions & 14 deletions pkg/cvo/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,14 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status
failingCondition.Reason = failingReason
failingCondition.Message = failingMessage
}
if failure != nil &&
strings.HasPrefix(progressReason, slowCOUpdatePrefix) {
failingCondition.Status = configv1.ConditionUnknown
failingCondition.Reason = "SlowClusterOperator"
failingCondition.Message = progressMessage
}
progressReason = strings.TrimPrefix(progressReason, slowCOUpdatePrefix)

resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition)

// update progressing
Expand Down Expand Up @@ -537,6 +545,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus,
}
}

// slowCOUpdatePrefix marks a Progressing reason returned by the
// convertErrorToProgressing helpers when a cluster operator has been updating
// for longer than expected. Callers detect the prefix (e.g. to flip the
// Failing condition to Unknown) and strip it before publishing the reason.
const slowCOUpdatePrefix = "Slow::"

// convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
// still making internal progress. The general error we try to suppress is an operator or operators still being
// progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines
Expand All @@ -555,28 +565,67 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin
case payload.UpdateEffectReport:
return uErr.Reason, uErr.Error(), false
case payload.UpdateEffectNone:
return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
return convertErrorToProgressingForUpdateEffectNone(uErr, now)
case payload.UpdateEffectFail:
return "", "", false
case payload.UpdateEffectFailAfterInterval:
var exceeded []string
threshold := now.Add(-(40 * time.Minute))
names := uErr.Names
if len(names) == 0 {
names = []string{uErr.Name}
}
for _, name := range names {
if payload.COUpdateStartTimesGet(name).Before(threshold) {
return convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr, now)
}
return "", "", false
}

func convertErrorToProgressingForUpdateEffectNone(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
var exceeded []string
names := uErr.Names
if len(names) == 0 {
names = []string{uErr.Name}
}
var machineConfig bool
for _, name := range names {
m := 30 * time.Minute
// It takes longer to upgrade MCO
if name == "machine-config" {
m = 3 * m
}
t := payload.COUpdateStartTimesGet(name)
if (!t.IsZero()) && t.Before(now.Add(-(m))) {
if name == "machine-config" {
machineConfig = true
} else {
exceeded = append(exceeded, name)
}
}
if len(exceeded) > 0 {
return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
} else {
return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
}
// returns true in those slow cases because it is still only a suspicion
if len(exceeded) > 0 && !machineConfig {
return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
}
if len(exceeded) > 0 && machineConfig {
return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true
}
if len(exceeded) == 0 && machineConfig {
return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true
}
return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true
}

// convertErrorToProgressingForUpdateEffectFailAfterInterval interprets an
// UpdateEffectFailAfterInterval failure. Operators are allowed up to 40
// minutes (measured from payload.COUpdateStartTimesGet) before the failure is
// treated as real: while within the interval it returns the reason, a
// "waiting up to 40 minutes" message, and true (still progressing); once any
// operator exceeds the interval it returns false so the failure is reported.
func convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr *payload.UpdateError, now time.Time) (string, string, bool) {
	var exceeded []string
	threshold := now.Add(-(40 * time.Minute))
	names := uErr.Names
	if len(names) == 0 {
		names = []string{uErr.Name}
	}
	for _, name := range names {
		if payload.COUpdateStartTimesGet(name).Before(threshold) {
			exceeded = append(exceeded, name)
		}
	}
	if len(exceeded) > 0 {
		return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false
	}
	return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true
}

// syncFailingStatus handles generic errors in the cluster version. It tries to preserve
Expand Down
Loading