
Commit 701fdae

Author: Sedef (committed)
Add Machine and KCP conditions to KCP controller
1 parent 0cf9f80 commit 701fdae

13 files changed, +509 −88 lines

api/v1alpha3/condition_consts.go

+58-2
@@ -109,7 +109,8 @@ const (
 	// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
 	MachineHasFailureReason = "MachineHasFailure"

-	// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
+	// NodeNotFoundReason (Severity=Error) documents that a machine's node was previously observed but is now gone.
+	// NB. provisioned --> NodeRef != ""
 	NodeNotFoundReason = "NodeNotFound"

 	// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
@@ -120,10 +121,65 @@ const (
 )

 const (
-	// MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the MachineHealthCheck controller.
+	// MachineOwnerRemediatedCondition is set on machines that have failed a health check run by the Machine's owner controller.
 	// MachineOwnerRemediatedCondition is set to False after a health check fails, but should be changed to True by the owning controller after remediation succeeds.
 	MachineOwnerRemediatedCondition ConditionType = "OwnerRemediated"

 	// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
 	WaitingForRemediationReason = "WaitingForRemediation"
 )
+
+// Common Pod-related condition reasons used by Pod-related conditions such as MachineKubeAPIServerHealthyCondition.
+const (
+	// PodProvisioningReason (Severity=Info) documents a pod waiting to be provisioned, i.e. the Pod is in the "Pending" phase and
+	// its PodScheduled and Initialized conditions are not yet set to True.
+	PodProvisioningReason = "PodProvisioning"
+
+	// PodProvisioningFailedReason (Severity=Warning) documents a pod that failed during provisioning, i.e. the Pod is in the "Pending" phase,
+	// its PodScheduled and Initialized conditions are set to True,
+	// but its ContainersReady or Ready condition is False (i.e. at least one container is in a waiting state, e.g. CrashLoopBackOff or ImagePullBackOff).
+	PodProvisioningFailedReason = "PodProvisioningFailed"
+
+	// PodMissingReason (Severity=Warning) documents a pod that does not exist.
+	PodMissingReason = "PodMissing"
+
+	// PodFailedReason (Severity=Error) documents a pod in which at least one container has terminated in a failure,
+	// and hence the Pod is in the "Failed" phase.
+	PodFailedReason = "PodFailed"
+)
+
+// Conditions that apply only to control-plane machines. KubeadmControlPlane is the owner of these conditions.
+
+const (
+	// MachineKubeAPIServerHealthyCondition reports a machine's kube-apiserver's health status.
+	// Set to True if the kube-apiserver pod is in the "Running" phase; otherwise one of the Pod-related condition reasons is used.
+	MachineKubeAPIServerHealthyCondition ConditionType = "KubeAPIServerHealthy"
+
+	// MachineKubeControllerManagerHealthyCondition reports a machine's kube-controller-manager's health status.
+	// Set to True if the kube-controller-manager pod is in the "Running" phase; otherwise one of the Pod-related condition reasons is used.
+	MachineKubeControllerManagerHealthyCondition ConditionType = "KubeControllerManagerHealthy"
+
+	// MachineKubeSchedulerHealthyCondition reports a machine's kube-scheduler's health status.
+	// Set to True if the kube-scheduler pod is in the "Running" phase; otherwise one of the Pod-related condition reasons is used.
+	MachineKubeSchedulerHealthyCondition ConditionType = "KubeSchedulerHealthy"
+
+	// MachineEtcdPodHealthyCondition reports a machine's etcd pod's health status.
+	// Set to True if the etcd pod is in the "Running" phase; otherwise one of the Pod-related condition reasons is used.
+	MachineEtcdPodHealthyCondition ConditionType = "EtcdPodHealthy"
+)
+
+const (
+	// MachineEtcdMemberHealthyCondition documents whether the machine has a healthy etcd member.
+	// If not True, the Pod-related condition reasons can be used as reasons.
+	MachineEtcdMemberHealthyCondition ConditionType = "EtcdMemberHealthy"
+
+	// EtcdMemberHasAlarmsReason (Severity=Warning) documents that a Machine's etcd member has alarms.
+	EtcdMemberHasAlarmsReason = "EtcdMemberHasAlarms"
+
+	// EtcdClientRelatedFailureReason (Severity=Warning) documents client-related failures,
+	// i.e. either creating the etcd client fails or an operation performed with the created etcd client fails.
+	EtcdClientRelatedFailureReason = "EtcdClientRelatedFailure"
+
+	// NodeEtcdMissingFromMemberListReason (Severity=Warning) documents that the machine's corresponding node has a ready etcd pod but is not yet part of the etcd member list.
+	NodeEtcdMissingFromMemberListReason = "NodeEtcdMissingFromMemberList"
+)
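
For orientation, the sketch below shows one way a health checker could map an observed control-plane pod to the Pod-related reasons introduced above. The helper names and the exact mapping are assumptions for illustration only; the commit's real mapping lives in the internal package and is not shown in this hunk.

package sketch

import (
	corev1 "k8s.io/api/core/v1"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
)

// podConditionReason is a hypothetical helper (not part of this commit) that maps an
// observed control-plane pod to one of the Pod-related condition reasons defined above.
// A pod in the "Running" phase would instead mark the corresponding condition True.
func podConditionReason(pod *corev1.Pod, found bool) (string, clusterv1.ConditionSeverity) {
	if !found {
		return clusterv1.PodMissingReason, clusterv1.ConditionSeverityWarning
	}
	switch pod.Status.Phase {
	case corev1.PodFailed:
		// At least one container terminated in a failure.
		return clusterv1.PodFailedReason, clusterv1.ConditionSeverityError
	case corev1.PodPending:
		if podConditionTrue(pod, corev1.PodScheduled) && podConditionTrue(pod, corev1.PodInitialized) {
			// Scheduled and initialized, but the containers never became ready
			// (e.g. CrashLoopBackOff, ImagePullBackOff).
			return clusterv1.PodProvisioningFailedReason, clusterv1.ConditionSeverityWarning
		}
		return clusterv1.PodProvisioningReason, clusterv1.ConditionSeverityInfo
	}
	return "", clusterv1.ConditionSeverityInfo
}

// podConditionTrue reports whether the given pod condition is set to True.
func podConditionTrue(pod *corev1.Pod, t corev1.PodConditionType) bool {
	for _, c := range pod.Status.Conditions {
		if c.Type == t {
			return c.Status == corev1.ConditionTrue
		}
	}
	return false
}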

controlplane/kubeadm/api/v1alpha3/condition_consts.go

+20
@@ -66,3 +66,23 @@ const (
 	// ScalingDownReason (Severity=Info) documents a KubeadmControlPlane that is decreasing the number of replicas.
 	ScalingDownReason = "ScalingDown"
 )
+
+const (
+	// EtcdClusterHealthy documents the overall health of the etcd cluster managed by KCP.
+	EtcdClusterHealthy clusterv1.ConditionType = "EtcdClusterHealthy"
+
+	// EtcdUnknownMemberReason (Severity=Warning) documents that the etcd member list contains a node that cannot be associated with any KCP machine.
+	EtcdUnknownMemberReason = "EtcdUnknownMember"
+
+	// EtcdAlarmExistReason (Severity=Warning) documents that the etcd cluster has alarms armed.
+	EtcdAlarmExistReason = "EtcdAlarmExist"
+
+	// EtcdMemberListUnstableReason (Severity=Info) documents that not all etcd members have the same member-list view.
+	EtcdMemberListUnstableReason = "EtcdMemberListUnstable"
+
+	// EtcdMemberNumMismatchWithPodNumReason (Severity=Warning) documents that the number of etcd pods does not match the number of etcd members.
+	// This case may occur when a failing pod has already been removed from the member list.
+	// TODO: During scale down, etcd quorum may be preserved (the cluster remains healthy) while the number of pods and members briefly mismatch,
+	// TODO: because a pod is being deleted and removed from the etcd member list. That case should be differentiated from this one.
+	EtcdMemberNumMismatchWithPodNumReason = "EtcdMemberMismatchWithPod"
+)
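
For context, these reasons are intended to be attached to the EtcdClusterHealthy condition through the util/conditions helpers (MarkTrue/MarkFalse), which the KCP controller already uses for its other conditions. A minimal, hypothetical sketch; the wrapping function and message text are illustrative and not part of this commit:

package sketch

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// markEtcdClusterHealth illustrates how one of the reasons above could be recorded
// on a KubeadmControlPlane; the severity mirrors the "(Severity=Warning)" annotation
// on EtcdUnknownMemberReason.
func markEtcdClusterHealth(kcp *controlplanev1.KubeadmControlPlane, healthy bool, member string) {
	if healthy {
		conditions.MarkTrue(kcp, controlplanev1.EtcdClusterHealthy)
		return
	}
	conditions.MarkFalse(kcp, controlplanev1.EtcdClusterHealthy,
		controlplanev1.EtcdUnknownMemberReason, clusterv1.ConditionSeverityWarning,
		"etcd member %s cannot be associated with any KCP machine", member)
}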

controlplane/kubeadm/controllers/controller.go

+61-11
@@ -213,6 +213,9 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Re
 }

 func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kcp *controlplanev1.KubeadmControlPlane) error {
+
+	internal.SetKCPConditions(kcp)
+
 	// Always update the readyCondition by summarizing the state of other conditions.
 	conditions.SetSummary(kcp,
 		conditions.WithConditions(
@@ -221,6 +224,7 @@ func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kc
 			controlplanev1.MachinesReadyCondition,
 			controlplanev1.AvailableCondition,
 			controlplanev1.CertificatesAvailableCondition,
+			controlplanev1.EtcdClusterHealthy,
 		),
 	)

@@ -282,13 +286,6 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 		return ctrl.Result{}, err
 	}

-	adoptableMachines := controlPlaneMachines.Filter(machinefilters.AdoptableControlPlaneMachines(cluster.Name))
-	if len(adoptableMachines) > 0 {
-		// We adopt the Machines and then wait for the update event for the ownership reference to re-queue them so the cache is up-to-date
-		err = r.adoptMachines(ctx, kcp, adoptableMachines, cluster)
-		return ctrl.Result{}, err
-	}
-
 	ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp))
 	if len(ownedMachines) != len(controlPlaneMachines) {
 		logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode")
@@ -301,6 +298,21 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 		return ctrl.Result{}, err
 	}

+	// If the control plane is initialized, reconcile health.
+	if ownedMachines.Len() != 0 {
+		// reconcileControlPlaneHealth returns an error if there is a machine being deleted.
+		if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
+			return result, err
+		}
+	}
+
+	adoptableMachines := controlPlaneMachines.Filter(machinefilters.AdoptableControlPlaneMachines(cluster.Name))
+	if len(adoptableMachines) > 0 {
+		// We adopt the Machines and then wait for the update event for the ownership reference to re-queue them so the cache is up-to-date
+		err = r.adoptMachines(ctx, kcp, adoptableMachines, cluster)
+		return ctrl.Result{}, err
+	}
+
 	// Aggregate the operational state of all the machines; while aggregating we are adding the
 	// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
 	conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())
@@ -442,21 +454,59 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
 	return nil
 }

-// reconcileHealth performs health checks for control plane components and etcd
+func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine) error {
+	// Patch the object, ignoring conflicts on the conditions owned by this controller.
+
+	// TODO: Is it okay to own these conditions or just patch?
+	// return patchHelper.Patch(ctx, machine)
+
+	return patchHelper.Patch(
+		ctx,
+		machine,
+		patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
+			clusterv1.MachineKubeAPIServerHealthyCondition,
+			clusterv1.MachineKubeControllerManagerHealthyCondition,
+			clusterv1.MachineEtcdMemberHealthyCondition,
+			clusterv1.MachineEtcdPodHealthyCondition,
+			clusterv1.MachineKubeSchedulerHealthyCondition,
+		}},
+	)
+}
+
+// reconcileControlPlaneHealth performs health checks for control plane components and etcd.
 // It removes any etcd members that do not have a corresponding node.
 // Also, as a final step, checks if there is any machines that is being deleted.
-func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
+func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
+	logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name)
+
+	for _, m := range controlPlane.Machines {
+		// Initialize the patch helper.
+		patchHelper, err := patch.NewHelper(m, r.Client)
+		if err != nil {
+			logger.Error(err, "Failed to configure the patch helper")
+			return ctrl.Result{Requeue: true}, nil
+		}
+
+		machine := m
+		defer func() {
+			internal.SetSingleMachineConditions(machine)
+			// Always attempt to Patch the Machine conditions after each health reconciliation.
+			if err := patchControlPlaneMachine(ctx, patchHelper, machine); err != nil {
+				logger.Error(err, "Failed to patch KubeadmControlPlane Machine")
+			}
+		}()
+	}

 	// Do a health check of the Control Plane components
-	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane.Machines.SortedByCreationTimestamp(), util.ObjectKey(cluster)); err != nil {
 		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
 			"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
 		return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
 	}

 	// If KCP should manage etcd, ensure etcd is healthy.
 	if controlPlane.IsEtcdManaged() {
-		if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+		if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, controlPlane.Machines.SortedByCreationTimestamp(), util.ObjectKey(cluster)); err != nil {
 			errList := []error{errors.Wrap(err, "failed to pass etcd health check")}
 			r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
 				"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)

controlplane/kubeadm/controllers/fakes_test.go

+2-2
@@ -57,14 +57,14 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien
 	return f.Machines, nil
 }

-func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error {
+func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ []*clusterv1.Machine, _ client.ObjectKey) error {
 	if !f.ControlPlaneHealthy {
 		return errors.New("control plane is not healthy")
 	}
 	return nil
 }

-func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error {
+func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ []*clusterv1.Machine, _ client.ObjectKey) error {
 	if !f.EtcdHealthy {
 		return errors.New("etcd is not healthy")
 	}
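
Together with the controller changes, this signature update implies the ManagementCluster health checks now receive the control-plane Machines, presumably so per-machine conditions can be recorded while checking. Roughly the shape of the updated methods, as inferred from this test fake; the real interface lives in controlplane/kubeadm/internal and is not shown in this commit:

package sketch

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// healthChecker captures only the two methods whose signatures change in this commit;
// the actual ManagementCluster interface has more methods (e.g. GetWorkloadCluster).
type healthChecker interface {
	TargetClusterControlPlaneIsHealthy(ctx context.Context, machines []*clusterv1.Machine, clusterKey client.ObjectKey) error
	TargetClusterEtcdIsHealthy(ctx context.Context, machines []*clusterv1.Machine, clusterKey client.ObjectKey) error
}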

controlplane/kubeadm/controllers/scale.go

+2-10
@@ -63,11 +63,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
 func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
 	logger := controlPlane.Logger()

-	// reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up
-	if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
-		return result, err
-	}
-
 	// Create the bootstrap configuration
 	bootstrapSpec := controlPlane.JoinControlPlaneConfig()
 	fd := controlPlane.NextFailureDomainForScaleUp()
@@ -90,10 +85,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 ) (ctrl.Result, error) {
 	logger := controlPlane.Logger()

-	if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
-		return result, err
-	}
-
 	workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
 	if err != nil {
 		logger.Error(err, "Failed to create client to workload cluster")
@@ -123,7 +114,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 		}
 	}

-	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+	// TODO: check if this is needed after moving the health check to the main reconcile
+	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane.Machines.SortedByCreationTimestamp(), util.ObjectKey(cluster)); err != nil {
 		logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err)
 		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
 			"Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)

controlplane/kubeadm/controllers/status.go

+1-2
@@ -18,7 +18,6 @@ package controllers

 import (
 	"context"
-
 	"github.com/pkg/errors"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
@@ -29,7 +28,7 @@ import (
 )

 // updateStatus is called after every reconcilitation loop in a defer statement to always make sure we have the
-// resource status subresourcs up-to-date.
+// resource status subresources up-to-date.
 func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) error {
 	selector := machinefilters.ControlPlaneSelectorForCluster(cluster.Name)
 	// Copy label selector to its status counterpart in string format.
