Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: OCPBUGS-53427: pkg/operator/status: Drop kubelet skew guard, add RHEL guard #4956

Open
wants to merge 1 commit into
base: release-4.18
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 30 additions & 104 deletions pkg/operator/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,13 +254,6 @@ func (optr *Operator) syncDegradedStatus(co *configv1.ClusterOperator, ierr sync
cov1helpers.SetStatusCondition(&co.Status.Conditions, coDegradedCondition)
}

const (
skewUnchecked = "KubeletSkewUnchecked"
skewSupported = "KubeletSkewSupported"
skewUnsupported = "KubeletSkewUnsupported"
skewPresent = "KubeletSkewPresent"
)

// syncUpgradeableStatus applies the new condition to the mco's ClusterOperator object.
func (optr *Operator) syncUpgradeableStatus(co *configv1.ClusterOperator) error {

Expand Down Expand Up @@ -321,37 +314,20 @@ func (optr *Operator) syncUpgradeableStatus(co *configv1.ClusterOperator) error
coStatusCondition.Message = "One or more machine config pools are updating, please see `oc get mcp` for further details"
}

// don't overwrite status if updating or degraded
if !updating && !degraded && !interrupted {
skewStatus, status, err := optr.isKubeletSkewSupported(pools)
// don't overwrite status if already grumpy
if coStatusCondition.Status == configv1.ConditionTrue {
condition, err := optr.checkNodeUpgradeable(pools)
if err != nil {
klog.Errorf("Error checking version skew: %v, kubelet skew status: %v, status reason: %v, status message: %v", err, skewStatus, status.Reason, status.Message)
coStatusCondition.Reason = status.Reason
coStatusCondition.Message = status.Message
cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
}
switch skewStatus {
case skewUnchecked:
coStatusCondition.Reason = status.Reason
coStatusCondition.Message = status.Message
cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
case skewUnsupported:
coStatusCondition.Reason = status.Reason
coStatusCondition.Message = status.Message
mcoObjectRef := &corev1.ObjectReference{
Kind: co.Kind,
Name: co.Name,
Namespace: co.Namespace,
UID: co.GetUID(),
}
klog.Infof("kubelet skew status: %v, status reason: %v", skewStatus, status.Reason)
optr.eventRecorder.Eventf(mcoObjectRef, corev1.EventTypeWarning, coStatusCondition.Reason, coStatusCondition.Message)
cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
case skewPresent:
coStatusCondition.Reason = status.Reason
coStatusCondition.Message = status.Message
klog.Infof("kubelet skew status: %v, status reason: %v", skewStatus, status.Reason)
msg := fmt.Sprintf("Error checking Nodes for Upgradeable status: %v", err)
klog.Error(msg)
coStatusCondition.Status = configv1.ConditionUnknown
coStatusCondition.Reason = condition.Reason
coStatusCondition.Message = condition.Message
cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
} else if condition.Status != configv1.ConditionTrue {
coStatusCondition.Status = condition.Status
coStatusCondition.Reason = condition.Reason
coStatusCondition.Message = condition.Message
}
}
cov1helpers.SetStatusCondition(&co.Status.Conditions, coStatusCondition)
Expand Down Expand Up @@ -525,83 +501,33 @@ func (optr *Operator) cfeEvalCgroupsV1() (bool, error) {
return nodeClusterConfig.Spec.CgroupMode == configv1.CgroupModeV1, nil
}

// isKubeletSkewSupported checks the version skew of kube-apiserver and node kubelet version.
// Returns the skew status. version skew > 2 is not supported.
func (optr *Operator) isKubeletSkewSupported(pools []*mcfgv1.MachineConfigPool) (skewStatus string, coStatus configv1.ClusterOperatorStatusCondition, err error) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pitch dropping this in my commit message, but thoughts about whether I should drop it in the dev branch? I'd like to, but looking for MCO-maintainer opinions first.

And then, if the MCO-maintainer opinion is "yes, please drop this in dev", I'd like MCO-maintainer opinions on whether you want me to use this same bug-series (because we'll need a dev bug anyway that says "4.19 won't need the bare-RHEL guard, because that's only a 4.18 -> 4.19 issue, so only 4.18's MCO needs that guard"), or if you want it NO-ISSUE, or you want a separate OCPBUGS series?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My opinion is that we can drop this in the dev branch. And since we'll need a bug, I think it's fine to use the same bug-series.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've opened #4970 to drop this guard in the dev branch, and I'll rebase this pull and close this thread once that's been kicked around, merged, and verified.

coStatus = configv1.ClusterOperatorStatusCondition{}
kubeAPIServerStatus, err := optr.clusterOperatorLister.Get("kube-apiserver")
if err != nil {
coStatus.Reason = skewUnchecked
coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err)
return skewUnchecked, coStatus, err
}
// looks like
// - name: kube-apiserver
// version: 1.21.0-rc.0
kubeAPIServerVersion := ""
for _, version := range kubeAPIServerStatus.Status.Versions {
if version.Name != "kube-apiserver" {
continue
}
kubeAPIServerVersion = version.Version
break
}
if kubeAPIServerVersion == "" {
err = fmt.Errorf("kube-apiserver does not yet have a version")
coStatus.Reason = skewUnchecked
coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err.Error())
return skewUnchecked, coStatus, err
}
kubeAPIServerMinorVersion, err := getMinorKubeletVersion(kubeAPIServerVersion)
if err != nil {
coStatus.Reason = skewUnchecked
coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err)
return skewUnchecked, coStatus, err
// checkNodeUpgradeable checks current Node status to look for anything incompatible with the next 4.(y+1) OpenShift release.
func (optr *Operator) checkNodeUpgradeable(pools []*mcfgv1.MachineConfigPool) (coStatus configv1.ClusterOperatorStatusCondition, err error) {
coStatus = configv1.ClusterOperatorStatusCondition{
Status: configv1.ConditionTrue,
}
var (
lastError error
kubeletVersion string
)
nodes, err := optr.GetAllManagedNodes(pools)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As part of being a WIP, should this be GetAllManagedNodes, or should I be looking at all nodes regardless of management? Or maybe just looking at unmanaged nodes? Or...?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Every place in the MCO where we listing nodes, we're looking at ones that belong to a given MachineConfigPool, which generally means they're managed. So, for the sake of consistency, this should probably be GetAllManagedNodes(). I could be persuaded otherwise though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know enough about bare-RHEL though; are those Nodes also returned by GetAllManagedNodes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe so, as long as they belong to a MachineConfigPool, they should be returned in this list. And if they're following the docs, they should end up in the worker pool? We can double check in pre merge QE testing to be sure

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked QE to spin up a 4.18 cluster with RHEL nodes and I can confirm this is the case, so we should be good here.

if err != nil {
err = fmt.Errorf("getting all managed nodes failed: %w", err)
coStatus.Reason = skewUnchecked
coStatus.Message = fmt.Sprintf("An error occurred when getting all the managed nodes: %v", err.Error())
coStatus.Status = configv1.ConditionUnknown
coStatus.Reason = "FailedToGetNodes"
coStatus.Message = err.Error()
return coStatus, err
}
nonCoreOSNodes := make([]string, 0, len(nodes))
for _, node := range nodes {
// looks like kubeletVersion: v1.21.0-rc.0+6143dea
kubeletVersion = node.Status.NodeInfo.KubeletVersion
if kubeletVersion == "" {
continue
osImage := node.Status.NodeInfo.OSImage
if strings.HasPrefix(osImage, "Red Hat Enterprise Linux") && !strings.HasPrefix(osImage, "Red Hat Enterprise Linux CoreOS") {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: We should consider OKD as well. Maybe something like this instead?

if (strings.HasPrefix(osImage, "Red Hat Enterprise Linux") || strings.HasPrefix(osImage, "CentOS Stream")) && !strings.Contains(osImage, "CoreOS") {
     // Do stuff.
}

For the record, we only support OKD on SCOS (CentOS Stream CoreOS) now, so no need to worry about the Fedora CoreOS case here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how OKD vs. OCP comes into this. Aren't bare-RHEL Nodes something that users add on their own, and so something that they could have added to OKD or OCP clusters. RHCOS and SCOS are what the cluster will use when creating Machines on its own and managing kubelet versions on its own, but those Machines/Nodes are fine, and we're just looking for Nodes where the OS/kubelet is being managed directly by the user for this 4.18 -> 4.19 guard.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think your assessment is right @wking , if I'm understanding the goal correctly, we want to block bare RHEL nodes and we don't allow bare SCOS nodes, so that isn't something to consider here.

nonCoreOSNodes = append(nonCoreOSNodes, fmt.Sprintf("%s (%s)", node.Name, osImage))
}
nodeMinorVersion, err := getMinorKubeletVersion(kubeletVersion)
if err != nil {
lastError = err
continue
}
if nodeMinorVersion+2 < kubeAPIServerMinorVersion {
coStatus.Reason = skewUnsupported
coStatus.Message = fmt.Sprintf("One or more nodes have an unsupported kubelet version skew. Please see `oc get nodes` for details and upgrade all nodes so that they have a kubelet version of at least %v.", getMinimalSkewSupportNodeVersion(kubeAPIServerVersion))
return skewUnsupported, coStatus, nil
}
if nodeMinorVersion+2 == kubeAPIServerMinorVersion {
coStatus.Reason = skewPresent
coStatus.Message = fmt.Sprintf("Current kubelet version %v will not be supported by newer kube-apiserver. Please upgrade the kubelet first if plan to upgrade the kube-apiserver", kubeletVersion)
return skewPresent, coStatus, nil
}
}
if kubeletVersion == "" {
err = fmt.Errorf("kubelet does not yet have a version")
coStatus.Reason = skewUnchecked
coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err.Error())
return skewUnchecked, coStatus, err
}
if lastError != nil {
coStatus.Reason = skewUnchecked
coStatus.Message = fmt.Sprintf("An error occurred when checking kubelet version skew: %v", err)
return skewUnchecked, coStatus, lastError
sort.Strings(nonCoreOSNodes)
if len(nonCoreOSNodes) > 0 {
coStatus.Status = configv1.ConditionFalse
coStatus.Reason = "RHELNodes"
coStatus.Message = fmt.Sprintf("%d RHEL nodes, including %s, but OpenShift 4.19 requires RHCOS https://FIXME-DOC-LINK", len(nonCoreOSNodes), nonCoreOSNodes[0])
}
return skewSupported, coStatus, nil
return coStatus, nil
}

// GetAllManagedNodes returns the nodes managed by MCO
Expand Down