Skip to content

Commit c805f52

Browse files
committed
OCPBUGS-53408: wait for build and ensure OS image is actually new
Occasionally, there is a delay between the time that a new rendered MachineConfig is produced and OCL begins a build. However, a couple of things happen in the interim: - The RenderController updates the MachineConfigPool. Because of the delay mentioned above, the NodeController begins updating all of the nodes with only the new rendered MachineConfig. The OS image remains the same because the NodeController is not ensuring that the image pullspec on the MachineOSConfig is the same as the MachineOSBuild. - Because of the work done to the MCD in #4825, the original check that we had to determine whether the image pullspecs were the same is no longer present. Additionally, the logic change there makes it possible for an OS update to always occur whenever OCL is enabled, further bypassing that check. This fixes that by doing two things: 1. Update the Node Controller to ensure that the both the MachineOSBuild's MachineConfig reference matches the MCP's current rendered MachineConfig while also checking that the MachineOSConfig's image pullspec matches the MachineOSBuild's. In the situation where the MachineOSBuild's pullspec is empty, this check will fail and the Node Controller will requeue. 2. Update the MCD so that even when OCL is enabled, if the OS images are the same, the OS update process is skipped.
1 parent 5968c11 commit c805f52

File tree

5 files changed

+258
-57
lines changed

5 files changed

+258
-57
lines changed

pkg/controller/common/mos_state.go

+4
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ func (c *MachineOSConfigState) GetOSImage() string {
4545
return osImage
4646
}
4747

48+
func (c *MachineOSConfigState) MachineOSBuildIsCurrent(mosb *mcfgv1.MachineOSBuild) bool {
49+
return mosb.Status.DigestedImagePushSpec == c.Config.Status.CurrentImagePullSpec
50+
}
51+
4852
// Determines if a given MachineConfigPool has an available OS image. Returns
4953
// false if the annotation is missing or set to an empty string.
5054
func (c *MachineOSConfigState) HasOSImage() bool {

pkg/controller/node/node_controller.go

+65-34
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ import (
4848
clientretry "k8s.io/client-go/util/retry"
4949
"k8s.io/client-go/util/workqueue"
5050
"k8s.io/klog/v2"
51+
52+
buildconstants "github.com/openshift/machine-config-operator/pkg/controller/build/constants"
5153
)
5254

5355
const (
@@ -888,8 +890,28 @@ func (ctrl *Controller) handleErr(err error, key string) {
888890
// 2. If a MachineConfig changes, we should wait for the OS image build to be
889891
// ready so we can update both the nodes' desired MachineConfig and desired
890892
// image annotations simultaneously.
893+
func (ctrl *Controller) getConfigAndBuildAndLayeredStatus(pool *mcfgv1.MachineConfigPool) (*mcfgv1.MachineOSConfig, *mcfgv1.MachineOSBuild, bool, error) {
894+
mosc, mosb, err := ctrl.getConfigAndBuild(pool)
895+
// If we attempt to list resources which are not present either because none
896+
// exist or they're behind an inactive feature gate, they will return an
897+
// IsNotFound error. Any other errors should be returned to the caller.
898+
if err != nil && !errors.IsNotFound(err) {
899+
return nil, nil, false, err
900+
}
901+
902+
isLayered, err := ctrl.isLayeredPool(mosc, mosb)
903+
if err != nil {
904+
return nil, nil, false, fmt.Errorf("Failed to determine whether pool %s opts in to OCL due to an error: %s", pool.Name, err)
905+
}
891906

892-
func (ctrl *Controller) GetConfigAndBuild(pool *mcfgv1.MachineConfigPool) (*mcfgv1.MachineOSConfig, *mcfgv1.MachineOSBuild, error) {
907+
return mosc, mosb, isLayered, nil
908+
}
909+
910+
func (ctrl *Controller) getConfigAndBuild(pool *mcfgv1.MachineConfigPool) (*mcfgv1.MachineOSConfig, *mcfgv1.MachineOSBuild, error) {
911+
// TODO: We should use the selectors from the build controller since they are
912+
// well-tested and makes querying for this information significantly easier.
913+
// Additionally, this should use listers instead of API clients in order to
914+
// reduce the impact on the API server.
893915
var ourConfig *mcfgv1.MachineOSConfig
894916
var ourBuild *mcfgv1.MachineOSBuild
895917
configList, err := ctrl.client.MachineconfigurationV1().MachineOSConfigs().List(context.TODO(), metav1.ListOptions{})
@@ -898,6 +920,7 @@ func (ctrl *Controller) GetConfigAndBuild(pool *mcfgv1.MachineConfigPool) (*mcfg
898920
}
899921

900922
for _, config := range configList.Items {
923+
config := config
901924
if config.Spec.MachineConfigPool.Name == pool.Name {
902925
ourConfig = &config
903926
break
@@ -914,24 +937,27 @@ func (ctrl *Controller) GetConfigAndBuild(pool *mcfgv1.MachineConfigPool) (*mcfg
914937
}
915938

916939
for _, build := range buildList.Items {
917-
if build.Spec.MachineOSConfig.Name == ourConfig.Name {
918-
if build.Spec.MachineConfig.Name == pool.Spec.Configuration.Name {
919-
ourBuild = &build
920-
break
921-
}
940+
build := build
941+
if build.Spec.MachineOSConfig.Name == ourConfig.Name && build.Spec.MachineConfig.Name == pool.Spec.Configuration.Name {
942+
ourBuild = &build
943+
break
922944
}
923945
}
924946

925947
return ourConfig, ourBuild, nil
926-
927948
}
928949

929-
func (ctrl *Controller) canLayeredPoolContinue(pool *mcfgv1.MachineConfigPool) (string, bool, error) {
930-
931-
mosc, mosb, _ := ctrl.GetConfigAndBuild(pool)
950+
func (ctrl *Controller) canLayeredPoolContinue(pool *mcfgv1.MachineConfigPool, mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) (string, bool, error) {
951+
// This is an edgecase which we should ideally never hit. However, it is
952+
// better to anticipate it and have an error message ready vs. the
953+
// alternative.
954+
if mosc == nil && mosb != nil {
955+
msg := fmt.Sprintf("orphaned MachineOSBuild %q found, but MachineOSConfig %q not found", mosb.Name, mosb.Labels[buildconstants.MachineOSConfigNameLabelKey])
956+
return msg, false, fmt.Errorf(msg)
957+
}
932958

933-
if mosc == nil || mosb == nil {
934-
return "No MachineOSConfig or Build for this pool", false, nil
959+
if !ctrl.isConfigAndBuildPresent(mosc, mosb) {
960+
return "No MachineOSConfig or MachineOSBuild for this pool", false, nil
935961
}
936962

937963
cs := ctrlcommon.NewMachineOSConfigState(mosc)
@@ -941,15 +967,18 @@ func (ctrl *Controller) canLayeredPoolContinue(pool *mcfgv1.MachineConfigPool) (
941967
pullspec := cs.GetOSImage()
942968

943969
if !hasImage {
944-
return "Desired Image not set in MachineOSBuild", false, nil
970+
return "Desired image not set in MachineOSConfig", false, nil
945971
}
946972

947973
switch {
948-
// If the build is successful and we have the image pullspec, we can proceed
974+
// If the build is successful and the MachineOSConfig has the matching pullspec, we can proceed
949975
// with rolling out the new OS image.
950-
case bs.IsBuildSuccess() && hasImage:
976+
case bs.IsBuildSuccess() && hasImage && cs.MachineOSBuildIsCurrent(mosb):
951977
msg := fmt.Sprintf("Image built successfully, pullspec: %s", pullspec)
952978
return msg, true, nil
979+
case bs.IsBuildSuccess() && hasImage && !cs.MachineOSBuildIsCurrent(mosb):
980+
msg := fmt.Sprintf("Image built successfully, pullspec: %s, but MachineOSConfig %q has not updated yet", pullspec, mosc.Name)
981+
return msg, false, nil
953982
case bs.IsBuildPending():
954983
return "Image build pending", false, nil
955984
case bs.IsBuilding():
@@ -1015,14 +1044,13 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
10151044
return ctrl.syncStatusOnly(pool)
10161045
}
10171046

1018-
mosc, mosb, _ := ctrl.GetConfigAndBuild(pool)
1019-
layered, err := ctrl.IsLayeredPool(mosc, mosb)
1047+
mosc, mosb, layered, err := ctrl.getConfigAndBuildAndLayeredStatus(pool)
10201048
if err != nil {
1021-
return fmt.Errorf("Failed to determine whether pool %s opts in to OCL due to an error: %s", pool.Name, err)
1049+
return fmt.Errorf("could not get config and build: %w", err)
10221050
}
10231051

10241052
if layered {
1025-
reason, canApplyUpdates, err := ctrl.canLayeredPoolContinue(pool)
1053+
reason, canApplyUpdates, err := ctrl.canLayeredPoolContinue(pool, mosc, mosb)
10261054
if err != nil {
10271055
klog.Infof("Layered pool %s encountered an error: %s", pool.Name, err)
10281056
return err
@@ -1095,7 +1123,7 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
10951123
}
10961124
}
10971125
ctrl.logPool(pool, "%d candidate nodes in %d zones for update, capacity: %d", len(candidates), len(zones), capacity)
1098-
if err := ctrl.updateCandidateMachines(pool, candidates, capacity); err != nil {
1126+
if err := ctrl.updateCandidateMachines(layered, mosc, mosb, pool, candidates, capacity); err != nil {
10991127
if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil {
11001128
errs := kubeErrs.NewAggregate([]error{syncErr, err})
11011129
return fmt.Errorf("error setting annotations for pool %q, sync error: %w", pool.Name, errs)
@@ -1184,7 +1212,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1.MachineOSConfig, mosb *
11841212
}
11851213

11861214
lns := ctrlcommon.NewLayeredNodeState(oldNode)
1187-
layered, err := ctrl.IsLayeredPool(mosc, mosb)
1215+
layered, err := ctrl.isLayeredPool(mosc, mosb)
11881216
if err != nil {
11891217
return fmt.Errorf("Failed to determine whether pool %s opts in to OCL due to an error: %s", pool.Name, err)
11901218
}
@@ -1330,9 +1358,9 @@ func (ctrl *Controller) filterControlPlaneCandidateNodes(pool *mcfgv1.MachineCon
13301358
}
13311359

13321360
// SetDesiredStateFromPool in old mco explains how this works. Somehow you need to NOT FAIL if the mosb doesn't exist. So
1333-
// we still need to base this whole things on pools but IsLayeredPool == does mosb exist
1361+
// we still need to base this whole things on pools but isLayeredPool == does mosb exist
13341362
// updateCandidateMachines sets the desiredConfig annotation the candidate machines
1335-
func (ctrl *Controller) updateCandidateMachines(pool *mcfgv1.MachineConfigPool, candidates []*corev1.Node, capacity uint) error {
1363+
func (ctrl *Controller) updateCandidateMachines(layered bool, mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild, pool *mcfgv1.MachineConfigPool, candidates []*corev1.Node, capacity uint) error {
13361364
if pool.Name == ctrlcommon.MachineConfigPoolMaster {
13371365
var err error
13381366
candidates, capacity, err = ctrl.filterControlPlaneCandidateNodes(pool, candidates, capacity)
@@ -1351,25 +1379,20 @@ func (ctrl *Controller) updateCandidateMachines(pool *mcfgv1.MachineConfigPool,
13511379
candidates = candidates[:capacity]
13521380
}
13531381

1354-
return ctrl.setDesiredAnnotations(pool, candidates)
1382+
return ctrl.setDesiredAnnotations(layered, mosc, mosb, pool, candidates)
13551383
}
13561384

1357-
func (ctrl *Controller) setDesiredAnnotations(pool *mcfgv1.MachineConfigPool, candidates []*corev1.Node) error {
1385+
func (ctrl *Controller) setDesiredAnnotations(layered bool, mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild, pool *mcfgv1.MachineConfigPool, candidates []*corev1.Node) error {
13581386
eventName := "SetDesiredConfig"
13591387
updateName := fmt.Sprintf("MachineConfig: %s", pool.Spec.Configuration.Name)
1360-
config, build, _ := ctrl.GetConfigAndBuild(pool)
1361-
layered, err := ctrl.IsLayeredPool(config, build)
13621388

1363-
if err != nil {
1364-
return fmt.Errorf("Failed to determine whether pool %s opts in to OCL due to an error: %s", pool.Name, err)
1365-
}
13661389
if layered {
13671390
eventName = "SetDesiredConfigAndOSImage"
1368-
updateName = fmt.Sprintf("%s / Image: %s", updateName, ctrlcommon.NewMachineOSConfigState(config).GetOSImage())
1391+
updateName = fmt.Sprintf("%s / Image: %s", updateName, ctrlcommon.NewMachineOSConfigState(mosc).GetOSImage())
13691392
klog.Infof("Continuing to sync layered MachineConfigPool %s", pool.Name)
13701393
}
13711394
for _, node := range candidates {
1372-
if err := ctrl.updateCandidateNode(config, build, node.Name, pool); err != nil {
1395+
if err := ctrl.updateCandidateNode(mosc, mosb, node.Name, pool); err != nil {
13731396
return fmt.Errorf("setting desired %s for node %s: %w", pool.Spec.Configuration.Name, node.Name, err)
13741397
}
13751398
}
@@ -1507,10 +1530,18 @@ func getErrorString(err error) string {
15071530
return ""
15081531
}
15091532

1510-
func (ctrl *Controller) IsLayeredPool(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) (bool, error) {
1533+
func (ctrl *Controller) isLayeredPool(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) (bool, error) {
15111534
fg, err := ctrl.fgAcessor.CurrentFeatureGates()
15121535
if err != nil {
15131536
return false, err
15141537
}
1515-
return (mosc != nil || mosb != nil) && fg.Enabled(features.FeatureGateOnClusterBuild), nil
1538+
return ctrl.isConfigOrBuildPresent(mosc, mosb) && fg.Enabled(features.FeatureGateOnClusterBuild), nil
1539+
}
1540+
1541+
func (ctrl *Controller) isConfigOrBuildPresent(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) bool {
1542+
return (mosc != nil || mosb != nil)
1543+
}
1544+
1545+
func (ctrl *Controller) isConfigAndBuildPresent(mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) bool {
1546+
return (mosc != nil && mosb != nil)
15161547
}

0 commit comments

Comments
 (0)