🌱Add Machine and KCP conditions to KCP controller #3674
Changes from 10 commits
@@ -221,6 +221,7 @@ func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kc
controlplanev1.MachinesReadyCondition,
controlplanev1.AvailableCondition,
controlplanev1.CertificatesAvailableCondition,
controlplanev1.EtcdClusterHealthy,
),
)
@@ -289,21 +290,25 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return ctrl.Result{}, err
}

ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp))
if len(ownedMachines) != len(controlPlaneMachines) {
logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode")
return ctrl.Result{}, nil
}

controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines)
if err != nil {
controlPlane, err := r.createControlPlane(ctx, cluster, kcp)
if controlPlane == nil || err != nil {
logger.Error(err, "failed to initialize control plane")
return ctrl.Result{}, err
}
if len(controlPlane.Machines) != len(controlPlaneMachines) {
logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode")
return ctrl.Result{}, nil
}

// Aggregate the operational state of all the machines; while aggregating we are adding the
// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef())

// reconcileControlPlaneHealth returns err if there is a machine being deleted or control plane is unhealthy.
// If control plane is not initialized, then control-plane machines will be empty and hence health check will not fail.
if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
return result, err
}

// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
needRollout := controlPlane.MachinesNeedingRollout()
@@ -324,7 +329,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
}

// If we've made it this far, we can assume that all ownedMachines are up to date
numMachines := len(ownedMachines)
numMachines := len(controlPlane.Machines)
desiredReplicas := int(*kcp.Spec.Replicas)

switch {
@@ -372,13 +377,42 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return ctrl.Result{}, nil
}

func (r *KubeadmControlPlaneReconciler) createControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) {
Reviewer: This method is a little misleading. If the internal control plane struct requires more information, can we enrich

Author: This is getting the owned machines and then creating the ControlPlane; it was added because we will do the same thing in both reconcile and reconcileDelete. Although the ControlPlane struct is enough as it is, ownedMachines are also calculated here. I think this is a good abstraction, but maybe bad naming?
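For illustration only, a minimal sketch of the alternative raised in this thread: fold the machine lookup that reconcile and reconcileDelete would otherwise duplicate into a single constructor-style helper instead of a separate createControlPlane method. This is not what the PR implements; the newControlPlaneWithMachines name and the machineFetcher interface below are hypothetical, and the collection type is assumed, while the individual calls are the ones already visible in this diff.

// Illustrative sketch only (not part of this PR). machineFetcher is a
// hypothetical narrow interface assumed to be satisfied by the reconciler's
// managementCluster; FilterableMachineCollection is assumed to be the type
// returned by GetMachinesForCluster.
type machineFetcher interface {
	GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (internal.FilterableMachineCollection, error)
}

func newControlPlaneWithMachines(ctx context.Context, c client.Client, fetcher machineFetcher, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) {
	// Fetch all control plane machines for the cluster, then keep only the ones owned by this KCP.
	machines, err := fetcher.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name))
	if err != nil {
		return nil, errors.Wrap(err, "failed to retrieve control plane machines for cluster")
	}
	ownedMachines := machines.Filter(machinefilters.OwnedMachines(kcp))

	// Delegate to the existing constructor with the pre-filtered machines.
	return internal.NewControlPlane(ctx, c, cluster, kcp, ownedMachines)
}

With something like this, both reconcile and reconcileDelete would call one helper and the "create" naming question goes away; the trade-off is that the lookup logic moves away from the reconciler, where the logging context currently lives.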
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)

controlPlaneMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name))
if err != nil {
logger.Error(err, "failed to retrieve control plane machines for cluster")
return nil, err
}
ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp))

controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines)
if err != nil {
logger.Error(err, "failed to initialize control plane")
return nil, err
}
return controlPlane, nil
}

// reconcileDelete handles KubeadmControlPlane deletion.
// The implementation does not take non-control plane workloads into consideration. This may or may not change in the future.
// Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064.
func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)
logger.Info("Reconcile KubeadmControlPlane deletion")

controlPlane, err := r.createControlPlane(ctx, cluster, kcp)
if controlPlane == nil || err != nil {
logger.Error(err, "failed to initialize control plane")
return ctrl.Result{}, err
}

// Ignore the health check results here as well as the errors; they are used to set health-related conditions on Machines.
// Errors may be due to not being able to get workload cluster nodes.
r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint
r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint

allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
@@ -442,21 +476,43 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
return nil
}

// reconcileHealth performs health checks for control plane components and etcd
// reconcileControlPlaneHealth performs health checks for control plane components and etcd.
// It removes any etcd members that do not have a corresponding node.
// Also, as a final step, it checks whether any machines are being deleted.
func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name)

// If there are no KCP-owned control-plane machines, then the control plane has not been initialized yet.
if controlPlane.Machines.Len() == 0 {
return ctrl.Result{}, nil
}

for i := range controlPlane.Machines {
m := controlPlane.Machines[i]
// Initialize the patch helper.
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
return ctrl.Result{}, err
}

defer func() {
// Always attempt to Patch the Machine conditions after each health reconciliation.
if err := patchHelper.Patch(ctx, m); err != nil {
Reviewer: I guess this is because we are delegating the responsibility to update Machine conditions to reconcileControlPlaneHealth. Have you considered these use cases? A possible idea to address this is to assign the responsibility to update Machines to updateStatus?

Author: Machine remediation will call It doesn't cover delete workflow because we don't call

Reviewer: I'm not sure we will call reconcileControlPlaneHealth during remediation, because remediation should happen before we are checking reconcileControlPlaneHealth (also, currently we are returning if the cluster is unhealthy). This is why I was suggesting to consider having this code in updateStatus.

Author: Added reconcileHealth to reconcile delete with the minimum changes possible.

Author: Had to remove error checks for
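For illustration, a rough sketch of the alternative floated in this thread: hang the Machine condition patching off updateStatus instead of reconcileControlPlaneHealth, so conditions are refreshed on every reconcile regardless of which path returns early. This is not what the PR does; the patchMachineHealthConditions helper name and its placement are assumptions, while the health-check and patch calls are the ones already used in this diff.

// Sketch only: hypothetical helper that updateStatus could call before
// computing KCP-level conditions. Not part of this PR.
func (r *KubeadmControlPlaneReconciler) patchMachineHealthConditions(ctx context.Context, cluster *clusterv1.Cluster, controlPlane *internal.ControlPlane) {
	// Snapshot each Machine before the health checks mutate its conditions,
	// so the patch helpers can compute a meaningful diff afterwards.
	patchHelpers := map[string]*patch.Helper{}
	for i := range controlPlane.Machines {
		m := controlPlane.Machines[i]
		patchHelper, err := patch.NewHelper(m, r.Client)
		if err != nil {
			r.Log.Error(err, "Failed to create patch helper for Machine", "machine", m.Name)
			continue
		}
		patchHelpers[m.Name] = patchHelper
	}

	// The health checks' only job here is to set health-related conditions on
	// the control plane Machines; their results and errors are intentionally ignored.
	r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster)) //nolint
	r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster))         //nolint

	// Persist whatever conditions the health checks set on each Machine.
	for i := range controlPlane.Machines {
		m := controlPlane.Machines[i]
		patchHelper, ok := patchHelpers[m.Name]
		if !ok {
			continue
		}
		if err := patchHelper.Patch(ctx, m); err != nil {
			r.Log.Error(err, "Failed to patch KubeadmControlPlane Machine", "machine", m.Name)
		}
	}
}

The trade-off would be that conditions get refreshed on the delete path and on early returns too, at the cost of doing the workload cluster round trips even when reconcileControlPlaneHealth would have run anyway.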
logger.Error(err, "Failed to patch KubeadmControlPlane Machine", "machine", m.Name)
}
}()
}

// Do a health check of the Control Plane components
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
return ctrl.Result{}, errors.Wrap(err, "failed to pass control-plane health check")
}

// If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() {
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
errList := []error{errors.Wrap(err, "failed to pass etcd health check")}
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)