
Commit 5d07335

Author: Sedef (committed)
Add Machine and KCP conditions to KCP controller
1 parent 0cf9f80 commit 5d07335

16 files changed: +463 -157 lines

api/v1alpha3/condition_consts.go (+52 -2)

@@ -109,7 +109,8 @@ const (
     // MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
     MachineHasFailureReason = "MachineHasFailure"

-    // NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
+    // NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone.
+    // NB. provisioned --> NodeRef != ""
     NodeNotFoundReason = "NodeNotFound"

     // NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
@@ -120,10 +121,59 @@ const (
 )

 const (
-    // MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the MachineHealthCheck controller.
+    // MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the Machine's owner controller.
     // MachineOwnerRemediatedCondition is set to False after a health check fails, but should be changed to True by the owning controller after remediation succeeds.
     MachineOwnerRemediatedCondition ConditionType = "OwnerRemediated"

     // WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
     WaitingForRemediationReason = "WaitingForRemediation"
 )
+
+// Common Pod-related Condition Reasons used by Pod-related Conditions such as MachineAPIServerPodHealthyCondition etc.
+const (
+    // PodProvisioningReason (Severity=Info) documents a pod waiting to be provisioned i.e., Pod is in "Pending" phase and
+    // PodScheduled and Initialized conditions are not yet set to True.
+    PodProvisioningReason = "PodProvisioning"
+
+    // PodMissingReason (Severity=Warning) documents a pod does not exist.
+    PodMissingReason = "PodMissing"
+
+    // PodFailedReason (Severity=Error) documents if
+    // i) a pod failed during provisioning i.e., Pod is in "Pending" phase and
+    // PodScheduled and Initialized conditions are set to True but ContainersReady or Ready condition is false
+    // (i.e., at least one of the containers are in waiting state(e.g CrashLoopbackOff, ImagePullBackOff)
+    // ii) a pod has at least one container that is terminated with a failure and hence Pod is in "Failed" phase.
+    PodFailedReason = "PodFailed"
+)
+
+// Conditions that are only for control-plane machines. KubeadmControlPlane is the owner of these conditions.
+
+const (
+    // MachineAPIServerPodHealthyCondition reports a machine's kube-apiserver's health status.
+    // Set to true if kube-apiserver pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+    MachineAPIServerPodHealthyCondition ConditionType = "APIServerPodHealthy"
+
+    // MachineControllerManagerHealthyCondition reports a machine's kube-controller-manager's health status.
+    // Set to true if kube-controller-manager pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+    MachineControllerManagerHealthyCondition ConditionType = "ControllerManagerPodHealthy"
+
+    // MachineSchedulerPodHealthyCondition reports a machine's kube-scheduler's health status.
+    // Set to true if kube-scheduler pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+    MachineSchedulerPodHealthyCondition ConditionType = "SchedulerPodHealthy"
+
+    // MachineEtcdPodHealthyCondition reports a machine's etcd pod's health status.
+    // Set to true if etcd pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
+    MachineEtcdPodHealthyCondition ConditionType = "EtcdPodHealthy"
+)
+
+const (
+    // MachineEtcdMemberHealthyCondition documents if the machine has an healthy etcd member.
+    // If not true, Pod-related Condition Reasons can be used as reasons.
+    MachineEtcdMemberHealthyCondition ConditionType = "EtcdMemberHealthy"
+
+    // EtcdMemberUnhealthyReason (Severity=Error) documents a Machine's etcd member is unhealthy for a number of reasons:
+    // i) etcd member has alarms.
+    // ii) creating etcd client fails or using the created etcd client to perform some operations fails.
+    // iii) Quorum is lost
+    EtcdMemberUnhealthyReason = "EtcdMemberUnhealthy"
+)
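
The new Machine-level condition types and reasons above are plain constants; the commit wires them up in the KCP controller further down. As a hedged sketch (not taken from this commit) of how a controller could consume them with the sigs.k8s.io/cluster-api/util/conditions helpers, assuming the usual MarkTrue/MarkFalse signatures from that package:

// Sketch only: marking the new etcd-member condition on a control-plane Machine.
// Treat the exact usage as an assumption, not as code from this commit.
package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

func markEtcdMember(machine *clusterv1.Machine, healthErr error) {
	if healthErr != nil {
		// Record the reason and severity documented on EtcdMemberUnhealthyReason.
		conditions.MarkFalse(machine,
			clusterv1.MachineEtcdMemberHealthyCondition,
			clusterv1.EtcdMemberUnhealthyReason,
			clusterv1.ConditionSeverityError,
			"etcd member health check failed: %v", healthErr)
		return
	}
	// A healthy member simply gets the condition set to True.
	conditions.MarkTrue(machine, clusterv1.MachineEtcdMemberHealthyCondition)
}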

controlplane/kubeadm/api/v1alpha3/condition_consts.go (+12)

@@ -66,3 +66,15 @@ const (
     // ScalingDownReason (Severity=Info) documents a KubeadmControlPlane that is decreasing the number of replicas.
     ScalingDownReason = "ScalingDown"
 )
+
+const (
+    // EtcdClusterHealthy documents the overall etcd cluster's health for the KCP-managed etcd.
+    EtcdClusterHealthy clusterv1.ConditionType = "EtcdClusterHealthy"
+
+    // EtcdClusterUnhealthyReason (Severity=Warning) is set when the etcd cluster as unhealthy due to
+    // i) if etcd cluster has lost its quorum.
+    // ii) if etcd cluster has alarms armed.
+    // iii) if etcd pods does not match with etcd members.
+    EtcdClusterUnhealthyReason = "EtcdClusterUnhealthy"
+
+)
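
The KCP-level EtcdClusterHealthy condition feeds the controller's Ready summary (see the controller.go diff below). A hedged sketch, again assuming the util/conditions helpers, of how the controller might set it from an etcd health-check result:

// Sketch only, not from this commit: setting the KCP-wide etcd condition,
// mirroring the Severity=Warning documented on EtcdClusterUnhealthyReason.
package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

func setEtcdClusterHealthy(kcp *controlplanev1.KubeadmControlPlane, healthErr error) {
	if healthErr != nil {
		conditions.MarkFalse(kcp, controlplanev1.EtcdClusterHealthy,
			controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityWarning,
			"etcd cluster health check failed: %v", healthErr)
		return
	}
	conditions.MarkTrue(kcp, controlplanev1.EtcdClusterHealthy)
}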

controlplane/kubeadm/controllers/controller.go (+73 -5)

@@ -212,6 +212,27 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Re
     return r.reconcile(ctx, cluster, kcp)
 }

+// setSingleMachineConditions updates the machine's conditions according to health tracker.
+func setSingleMachineConditions(machine *clusterv1.Machine, controlPlane *internal.ControlPlane) {
+    for condType, condition := range controlPlane.MachineConditions[machine.Name] {
+        doesConditionExist := false
+        for _, mCondition := range machine.Status.Conditions {
+            // If the condition already exists, change the condition.
+            if mCondition.Type == condType {
+                conditions.Set(machine, condition)
+                doesConditionExist = true
+            }
+        }
+        if !doesConditionExist {
+            if machine.Status.Conditions == nil {
+                machine.Status.Conditions = clusterv1.Conditions{}
+            }
+            conditions.Set(machine, condition)
+        }
+
+    }
+}
+
 func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kcp *controlplanev1.KubeadmControlPlane) error {
     // Always update the readyCondition by summarizing the state of other conditions.
     conditions.SetSummary(kcp,
@@ -221,6 +242,7 @@ func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kc
             controlplanev1.MachinesReadyCondition,
             controlplanev1.AvailableCondition,
             controlplanev1.CertificatesAvailableCondition,
+            controlplanev1.EtcdClusterHealthy,
         ),
     )

@@ -305,6 +327,14 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
     // source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
     conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())

+    // If control plane is initialized, reconcile health.
+    if ownedMachines.Len() != 0 {
+        // reconcileControlPlaneHealth returns err if there is a machine being delete
+        if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
+            return result, err
+        }
+    }
+
     // Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
     needRollout := controlPlane.MachinesNeedingRollout()
     switch {
@@ -442,21 +472,59 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
     return nil
 }

-// reconcileHealth performs health checks for control plane components and etcd
+func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine) error {
+    // Patch the object, ignoring conflicts on the conditions owned by this controller.
+
+    // TODO: Is it okay to own these conditions or just patch?
+    // return patchHelper.Patch(ctx, machine)
+
+    return patchHelper.Patch(
+        ctx,
+        machine,
+        patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
+            clusterv1.MachineAPIServerPodHealthyCondition,
+            clusterv1.MachineControllerManagerHealthyCondition,
+            clusterv1.MachineEtcdMemberHealthyCondition,
+            clusterv1.MachineEtcdPodHealthyCondition,
+            clusterv1.MachineSchedulerPodHealthyCondition,
+        }},
+    )
+}
+
+// reconcileControlPlaneHealth performs health checks for control plane components and etcd
 // It removes any etcd members that do not have a corresponding node.
 // Also, as a final step, checks if there is any machines that is being deleted.
-func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
+func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
+    logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name)
+
+    for _, m := range controlPlane.Machines {
+        // Initialize the patch helper.
+        patchHelper, err := patch.NewHelper(m, r.Client)
+        if err != nil {
+            logger.Error(err, "Failed to configure the patch helper")
+            return ctrl.Result{Requeue: true}, nil
+        }
+
+        machine := m
+        defer func() {
+            setSingleMachineConditions(machine, controlPlane)
+            // Always attempt to Patch the Machine conditions after each health reconciliation.
+            if err := patchControlPlaneMachine(ctx, patchHelper, machine); err != nil {
+                logger.Error(err, "Failed to patch KubeadmControlPlane Machine")
+            }
+        }()
+    }

     // Do a health check of the Control Plane components
-    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
         r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
             "Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
-        return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
+        return ctrl.Result{}, errors.Wrap(err, "failed to pass control-plane health check")
     }

     // If KCP should manage etcd, ensure etcd is healthy.
     if controlPlane.IsEtcdManaged() {
-        if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+        if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
             errList := []error{errors.Wrap(err, "failed to pass etcd health check")}
             r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
                 "Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)

controlplane/kubeadm/controllers/fakes_test.go (+2 -2)

@@ -57,14 +57,14 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien
     return f.Machines, nil
 }

-func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error {
+func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error {
     if !f.ControlPlaneHealthy {
         return errors.New("control plane is not healthy")
     }
     return nil
 }

-func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error {
+func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error {
     if !f.EtcdHealthy {
         return errors.New("etcd is not healthy")
     }
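
The fake's new signatures mirror a change to the ManagementCluster abstraction: both target-cluster health checks now also take the ControlPlane, so per-machine results can be recorded on it while the workload cluster is inspected. A minimal sketch of the implied interface, with names assumed (the real definition is in the internal package, not shown in this section):

// Sketch only: the interface shape implied by the updated fakes.
// ControlPlane stands in for internal.ControlPlane; treat this as an assumption.
package example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
)

type ControlPlane struct{}

type ManagementCluster interface {
	// Both health checks now receive the ControlPlane so per-machine
	// condition results can be tracked during the check.
	TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error
	TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error
}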

controlplane/kubeadm/controllers/scale.go (+2 -10)

@@ -63,11 +63,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
 func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
     logger := controlPlane.Logger()

-    // reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up
-    if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
-        return result, err
-    }
-
     // Create the bootstrap configuration
     bootstrapSpec := controlPlane.JoinControlPlaneConfig()
     fd := controlPlane.NextFailureDomainForScaleUp()
@@ -90,10 +85,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 ) (ctrl.Result, error) {
     logger := controlPlane.Logger()

-    if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
-        return result, err
-    }
-
     workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
     if err != nil {
         logger.Error(err, "Failed to create client to workload cluster")
@@ -123,7 +114,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
         }
     }

-    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+    // TODO: check if this is needed after moving the health check to the main reconcile
+    if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
         logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err)
         r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
             "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)

controlplane/kubeadm/controllers/scale_test.go (+6 -11)

@@ -116,14 +116,16 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
 
         result, err := r.scaleUpControlPlane(context.Background(), cluster, kcp, controlPlane)
         g.Expect(result).To(Equal(ctrl.Result{Requeue: true}))
-        g.Expect(err).ToNot(HaveOccurred())
+        g.Expect(err).NotTo(HaveOccurred())

         controlPlaneMachines := clusterv1.MachineList{}
         g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
         g.Expect(controlPlaneMachines.Items).To(HaveLen(3))
     })
     t.Run("does not create a control plane Machine if health checks fail", func(t *testing.T) {
         cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
+        cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
+        cluster.Spec.ControlPlaneEndpoint.Port = 6443
         initObjs := []runtime.Object{cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}

         beforeMachines := internal.NewFilterableMachineCollection()
@@ -170,18 +172,11 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
             Log: log.Log,
             recorder: record.NewFakeRecorder(32),
         }
-        controlPlane := &internal.ControlPlane{
-            KCP: kcp,
-            Cluster: cluster,
-            Machines: beforeMachines,
-        }

-        result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane)
-        if tc.expectErr {
-            g.Expect(err).To(HaveOccurred())
-        }
-        g.Expect(result).To(Equal(tc.expectResult))
+        _, err := r.reconcile(context.Background(), cluster, kcp)
+        g.Expect(err).To(HaveOccurred())

+        // scaleUpControlPlane is never called due to health check failure and new machine is not created to scale up.
         controlPlaneMachines := &clusterv1.MachineList{}
         g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed())
         g.Expect(controlPlaneMachines.Items).To(HaveLen(len(beforeMachines)))

controlplane/kubeadm/controllers/status.go (+1 -2)

@@ -18,7 +18,6 @@ package controllers
 
 import (
     "context"
-
     "github.com/pkg/errors"
     clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
     controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
@@ -29,7 +28,7 @@ import (
 )

 // updateStatus is called after every reconcilitation loop in a defer statement to always make sure we have the
-// resource status subresourcs up-to-date.
+// resource status subresources up-to-date.
 func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) error {
     selector := machinefilters.ControlPlaneSelectorForCluster(cluster.Name)
     // Copy label selector to its status counterpart in string format.

controlplane/kubeadm/controllers/upgrade_test.go (+6 -3)

@@ -36,10 +36,13 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) {
     g := NewWithT(t)

     cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
+    cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
+    cluster.Spec.ControlPlaneEndpoint.Port = 6443
     kcp.Spec.Version = "v1.17.3"
     kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil
     kcp.Spec.Replicas = pointer.Int32Ptr(1)

+
     fakeClient := newFakeClient(g, cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy())

     r := &KubeadmControlPlaneReconciler{
@@ -89,9 +92,9 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) {
 
     // run upgrade a second time, simulate that the node has not appeared yet but the machine exists
     r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false
-    result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade)
-    g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}))
-    g.Expect(err).To(BeNil())
+    // Unhealthy control plane will be detected during reconcile loop and upgrade will never be called.
+    _, err = r.reconcile(context.Background(), cluster, kcp)
+    g.Expect(err).To(HaveOccurred())
     g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed())
     g.Expect(bothMachines.Items).To(HaveLen(2))
