WIP: OCPBUGS-53427: pkg/operator/status: Drop kubelet skew guard, add RHEL guard #4956
base: release-4.18
@@ -254,13 +254,6 @@ func (optr *Operator) syncDegradedStatus(co *configv1.ClusterOperator, ierr sync
 	cov1helpers.SetStatusCondition(&co.Status.Conditions, coDegradedCondition)
 }

-const (
-	skewUnchecked   = "KubeletSkewUnchecked"
-	skewSupported   = "KubeletSkewSupported"
-	skewUnsupported = "KubeletSkewUnsupported"
-	skewPresent     = "KubeletSkewPresent"
-)
-
 // syncUpgradeableStatus applies the new condition to the mco's ClusterOperator object.
 func (optr *Operator) syncUpgradeableStatus(co *configv1.ClusterOperator) error {
@@ -321,37 +314,20 @@ func (optr *Operator) syncUpgradeableStatus(co *configv1.ClusterOperator) error
 		coStatusCondition.Message = "One or more machine config pools are updating, please see `oc get mcp` for further details"
 	}

-	// don't overwrite status if updating or degraded
-	if !updating && !degraded && !interrupted {
-		skewStatus, status, err := optr.isKubeletSkewSupported(pools)
+	// don't overwrite status if already grumpy
+	if coStatusCondition.Status == configv1.ConditionTrue {
+		condition, err := optr.checkNodeUpgradeable(pools)
 		if err != nil {
-			klog.Errorf("Error checking version skew: %v, kubelet skew status: %v, status reason: %v, status message: %v", err, skewStatus, status.Reason, status.Message)
-			coStatusCondition.Reason = status.Reason
-			coStatusCondition.Message = status.Message
-			cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
-		}
-		switch skewStatus {
-		case skewUnchecked:
-			coStatusCondition.Reason = status.Reason
-			coStatusCondition.Message = status.Message
-			cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
-		case skewUnsupported:
-			coStatusCondition.Reason = status.Reason
-			coStatusCondition.Message = status.Message
-			mcoObjectRef := &corev1.ObjectReference{
-				Kind:      co.Kind,
-				Name:      co.Name,
-				Namespace: co.Namespace,
-				UID:       co.GetUID(),
-			}
-			klog.Infof("kubelet skew status: %v, status reason: %v", skewStatus, status.Reason)
-			optr.eventRecorder.Eventf(mcoObjectRef, corev1.EventTypeWarning, coStatusCondition.Reason, coStatusCondition.Message)
-			cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
-		case skewPresent:
-			coStatusCondition.Reason = status.Reason
-			coStatusCondition.Message = status.Message
-			klog.Infof("kubelet skew status: %v, status reason: %v", skewStatus, status.Reason)
+			msg := fmt.Sprintf("Error checking Nodes for Upgradeable status: %v", err)
+			klog.Error(msg)
+			coStatusCondition.Status = configv1.ConditionUnknown
+			coStatusCondition.Reason = condition.Reason
+			coStatusCondition.Message = condition.Message
 			cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
+		} else if condition.Status != configv1.ConditionTrue {
+			coStatusCondition.Status = condition.Status
+			coStatusCondition.Reason = condition.Reason
+			coStatusCondition.Message = condition.Message
 		}
 	}
 	cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
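For illustration, here is roughly the blocking condition this path would copy onto the ClusterOperator, as a minimal runnable sketch. The node name, OS image string, and count are made up; the Reason and Message shape come from the new checkNodeUpgradeable code further down, and the Type field is implied by syncUpgradeableStatus rather than shown in this hunk (assumes the usual github.com/openshift/api dependency):

    package main

    import (
    	"fmt"

    	configv1 "github.com/openshift/api/config/v1"
    )

    func main() {
    	// Hypothetical Upgradeable=False condition; Reason and the Message
    	// format mirror the RHELNodes branch of checkNodeUpgradeable.
    	blocked := configv1.ClusterOperatorStatusCondition{
    		Type:    configv1.OperatorUpgradeable,
    		Status:  configv1.ConditionFalse,
    		Reason:  "RHELNodes",
    		Message: "2 RHEL nodes, including worker-0 (Red Hat Enterprise Linux 8.10 (Ootpa)), but OpenShift 4.19 requires RHCOS https://FIXME-DOC-LINK",
    	}
    	fmt.Printf("Upgradeable=%s reason=%s\nmessage: %s\n", blocked.Status, blocked.Reason, blocked.Message)
    }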
@@ -525,83 +501,33 @@ func (optr *Operator) cfeEvalCgroupsV1() (bool, error) {
 	return nodeClusterConfig.Spec.CgroupMode == configv1.CgroupModeV1, nil
 }

-// isKubeletSkewSupported checks the version skew of kube-apiserver and node kubelet version.
-// Returns the skew status. version skew > 2 is not supported.
-func (optr *Operator) isKubeletSkewSupported(pools []*mcfgv1.MachineConfigPool) (skewStatus string, coStatus configv1.ClusterOperatorStatusCondition, err error) {
-	coStatus = configv1.ClusterOperatorStatusCondition{}
-	kubeAPIServerStatus, err := optr.clusterOperatorLister.Get("kube-apiserver")
-	if err != nil {
-		coStatus.Reason = skewUnchecked
-		coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err)
-		return skewUnchecked, coStatus, err
-	}
-	// looks like
-	//  - name: kube-apiserver
-	//    version: 1.21.0-rc.0
-	kubeAPIServerVersion := ""
-	for _, version := range kubeAPIServerStatus.Status.Versions {
-		if version.Name != "kube-apiserver" {
-			continue
-		}
-		kubeAPIServerVersion = version.Version
-		break
-	}
-	if kubeAPIServerVersion == "" {
-		err = fmt.Errorf("kube-apiserver does not yet have a version")
-		coStatus.Reason = skewUnchecked
-		coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err.Error())
-		return skewUnchecked, coStatus, err
-	}
-	kubeAPIServerMinorVersion, err := getMinorKubeletVersion(kubeAPIServerVersion)
-	if err != nil {
-		coStatus.Reason = skewUnchecked
-		coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err)
-		return skewUnchecked, coStatus, err
-	}
-	var (
-		lastError      error
-		kubeletVersion string
-	)
+// checkNodeUpgradeable checks current Node status to look for anything incompatible with the next 4.(y+1) OpenShift release.
+func (optr *Operator) checkNodeUpgradeable(pools []*mcfgv1.MachineConfigPool) (coStatus configv1.ClusterOperatorStatusCondition, err error) {
+	coStatus = configv1.ClusterOperatorStatusCondition{
+		Status: configv1.ConditionTrue,
+	}
 	nodes, err := optr.GetAllManagedNodes(pools)
Review thread on nodes, err := optr.GetAllManagedNodes(pools):

Comment: As part of being a WIP, should this be […]?

Reply: Every place in the MCO where we're listing nodes, we're looking at ones that belong to a given MachineConfigPool, which generally means they're managed. So, for the sake of consistency, this should probably be […].

Reply: I don't know enough about bare-RHEL though; are those Nodes also returned by GetAllManagedNodes?

Reply: I believe so, as long as they belong to a MachineConfigPool.

Reply: I asked QE to spin up a 4.18 cluster with RHEL nodes and I can confirm this is the case, so we should be good here.
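As an aside, one way to double-check which OS each Node reports on a live cluster (an illustrative command, not part of the PR; it only assumes the standard core/v1 NodeStatus fields):

    oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.osImage}{"\n"}{end}'

Bare-RHEL workers report an OSImage starting with "Red Hat Enterprise Linux" but not "Red Hat Enterprise Linux CoreOS".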
 	if err != nil {
 		err = fmt.Errorf("getting all managed nodes failed: %w", err)
-		coStatus.Reason = skewUnchecked
-		coStatus.Message = fmt.Sprintf("An error occurred when getting all the managed nodes: %v", err.Error())
+		coStatus.Status = configv1.ConditionUnknown
+		coStatus.Reason = "FailedToGetNodes"
+		coStatus.Message = err.Error()
+		return coStatus, err
 	}
+	nonCoreOSNodes := make([]string, 0, len(nodes))
 	for _, node := range nodes {
-		// looks like kubeletVersion: v1.21.0-rc.0+6143dea
-		kubeletVersion = node.Status.NodeInfo.KubeletVersion
-		if kubeletVersion == "" {
-			continue
-		}
+		osImage := node.Status.NodeInfo.OSImage
+		if strings.HasPrefix(osImage, "Red Hat Enterprise Linux") && !strings.HasPrefix(osImage, "Red Hat Enterprise Linux CoreOS") {
Review thread on the strings.HasPrefix check:

Comment: issue: We should consider OKD as well. Maybe something like this instead?

    if (strings.HasPrefix(osImage, "Red Hat Enterprise Linux") || strings.HasPrefix(osImage, "CentOS Stream")) && !strings.Contains(osImage, "CoreOS") {
    	// Do stuff.
    }

For the record, we only support OKD on SCOS (CentOS Stream CoreOS) now, so no need to worry about the Fedora CoreOS case here.

Reply: I don't understand how OKD vs. OCP comes into this. Aren't bare-RHEL Nodes something that users add on their own, and so something that they could have added to OKD or OCP clusters? RHCOS and SCOS are what the cluster will use when creating Machines on its own and managing kubelet versions on its own, but those Machines/Nodes are fine; we're just looking for Nodes where the OS/kubelet is being managed directly by the user for this 4.18 -> 4.19 guard.

Reply: I think your assessment is right, @wking: if I'm understanding the goal correctly, we want to block bare RHEL nodes and we don't allow bare SCOS nodes, so that isn't something to consider here.
+			nonCoreOSNodes = append(nonCoreOSNodes, fmt.Sprintf("%s (%s)", node.Name, osImage))
+		}
-		nodeMinorVersion, err := getMinorKubeletVersion(kubeletVersion)
-		if err != nil {
-			lastError = err
-			continue
-		}
-		if nodeMinorVersion+2 < kubeAPIServerMinorVersion {
-			coStatus.Reason = skewUnsupported
-			coStatus.Message = fmt.Sprintf("One or more nodes have an unsupported kubelet version skew. Please see `oc get nodes` for details and upgrade all nodes so that they have a kubelet version of at least %v.", getMinimalSkewSupportNodeVersion(kubeAPIServerVersion))
-			return skewUnsupported, coStatus, nil
-		}
-		if nodeMinorVersion+2 == kubeAPIServerMinorVersion {
-			coStatus.Reason = skewPresent
-			coStatus.Message = fmt.Sprintf("Current kubelet version %v will not be supported by newer kube-apiserver. Please upgrade the kubelet first if plan to upgrade the kube-apiserver", kubeletVersion)
-			return skewPresent, coStatus, nil
-		}
 	}
-	if kubeletVersion == "" {
-		err = fmt.Errorf("kubelet does not yet have a version")
-		coStatus.Reason = skewUnchecked
-		coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err.Error())
-		return skewUnchecked, coStatus, err
-	}
-	if lastError != nil {
-		coStatus.Reason = skewUnchecked
-		coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err)
-		return skewUnchecked, coStatus, lastError
-	}
-	return skewSupported, coStatus, nil
+	sort.Strings(nonCoreOSNodes)
+	if len(nonCoreOSNodes) > 0 {
+		coStatus.Status = configv1.ConditionFalse
+		coStatus.Reason = "RHELNodes"
+		coStatus.Message = fmt.Sprintf("%d RHEL nodes, including %s, but OpenShift 4.19 requires RHCOS https://FIXME-DOC-LINK", len(nonCoreOSNodes), nonCoreOSNodes[0])
+	}
+	return coStatus, nil
 }

 // GetAllManagedNodes returns the nodes managed by MCO
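Following up on the OKD review thread above, here is a small self-contained sketch of the guard's prefix matching; the helper name and the sample OSImage strings are illustrative, not from the PR:

    package main

    import (
    	"fmt"
    	"strings"
    )

    // isBareRHEL mirrors the guard's test: flag plain RHEL, but leave
    // RHEL CoreOS (RHCOS) alone. SCOS also passes untouched, since it
    // never matches the "Red Hat Enterprise Linux" prefix.
    func isBareRHEL(osImage string) bool {
    	return strings.HasPrefix(osImage, "Red Hat Enterprise Linux") &&
    		!strings.HasPrefix(osImage, "Red Hat Enterprise Linux CoreOS")
    }

    func main() {
    	// Hypothetical OSImage strings for illustration.
    	for _, osImage := range []string{
    		"Red Hat Enterprise Linux 8.10 (Ootpa)",
    		"Red Hat Enterprise Linux CoreOS 418.94.202501010000-0",
    		"CentOS Stream CoreOS 9",
    	} {
    		fmt.Printf("%-55s bare RHEL: %v\n", osImage, isBareRHEL(osImage))
    	}
    }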
Review thread on dropping the kubelet skew guard:

Comment: I pitch dropping this in my commit message, but thoughts about whether I should drop it in the dev branch? I'd like to, but I'm looking for MCO-maintainer opinions first.

And then, if the MCO-maintainer opinion is "yes, please drop this in dev", I'd like MCO-maintainer opinions on whether you want me to use this same bug-series (because we'll need a dev bug anyway that says "4.19 won't need the bare-RHEL guard, because that's only a 4.18 -> 4.19 issue, so only 4.18's MCO needs that guard"), or whether you want it NO-ISSUE, or a separate OCPBUGS series.

Reply: My opinion is that we can drop this in the dev branch. And since we'll need a bug, I think it's fine to use the same bug-series.

Reply: I've opened #4970 to drop this guard in the dev branch, and I'll rebase this pull and close this thread once that's been kicked around, merged, and verified.