Skip to content

Commit 88bcbf0

Browse files
Add v1beta2 available condition to KCP
1 parent 9f5a94e commit 88bcbf0

File tree

6 files changed

+852
-39
lines changed

6 files changed

+852
-39
lines changed

controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go

+13-8
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,20 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2020

2121
// KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version.
2222
const (
23-
// KubeadmControlPlaneAvailableV1Beta2Condition is True if the control plane can be reached, EtcdClusterHealthy is true,
24-
// and CertificatesAvailable is true.
23+
// KubeadmControlPlaneAvailableV1Beta2Condition is True if KubeadmControlPlane not delete, `CertificatesAvailable` is true,
24+
// at least one Kubernetes API server, scheduler and controller manager control plane are healthy,
25+
// and etcd has enough operational members to meet quorum requirements.
2526
KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition
27+
28+
// KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the
29+
// etcd cluster hosted on KubeadmControlPlane controlled machines.
30+
KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason
31+
32+
// KubeadmControlPlaneAvailableV1Beta2Reason surfaces when a Deployment is available.
33+
KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason
34+
35+
// KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when a Deployment is not available.
36+
KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason
2637
)
2738

2839
// KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version.
@@ -215,12 +226,6 @@ const (
215226
KubeadmControlPlaneDeletingV1Beta2Condition = clusterv1.DeletingV1Beta2Condition
216227
)
217228

218-
// KubeadmControlPlane's Paused condition and corresponding reasons that will be used in v1Beta2 API version.
219-
const (
220-
// KubeadmControlPlanePausedV1Beta2Condition is true if this resource or the Cluster it belongs to are paused.
221-
KubeadmControlPlanePausedV1Beta2Condition = clusterv1.PausedV1Beta2Condition
222-
)
223-
224229
// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy and EtcdPodHealthy condition and corresponding
225230
// reasons that will be used for KubeadmControlPlane controlled machines in v1Beta2 API version.
226231
const (

controlplane/kubeadm/internal/control_plane.go

+9
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
3131
"sigs.k8s.io/cluster-api/controllers/external"
3232
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
33+
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
3334
"sigs.k8s.io/cluster-api/util/collections"
3435
"sigs.k8s.io/cluster-api/util/failuredomains"
3536
"sigs.k8s.io/cluster-api/util/patch"
@@ -58,6 +59,14 @@ type ControlPlane struct {
5859
KubeadmConfigs map[string]*bootstrapv1.KubeadmConfig
5960
InfraResources map[string]*unstructured.Unstructured
6061

62+
// EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; also additional info below
63+
// comes from the same func.
64+
// NOTE: This info is computed from what we know, so we can reason about availability even with a certain degree of problems in the cluster.
65+
EtcdMembers []*etcd.Member
66+
EtcdMembersAgreeOnMemberList bool
67+
EtcdMembersAgreeOnClusterID bool
68+
EtcdMembersAndMachinesAreMatching bool
69+
6170
managementCluster ManagementCluster
6271
workloadCluster WorkloadCluster
6372
}

controlplane/kubeadm/internal/controllers/status.go

+136-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3232
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
3333
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
34+
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
3435
"sigs.k8s.io/cluster-api/util/collections"
3536
"sigs.k8s.io/cluster-api/util/conditions"
3637
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
@@ -167,8 +168,8 @@ func (r *KubeadmControlPlaneReconciler) updateV1beta2Status(ctx context.Context,
167168
setMachinesReadyCondition(ctx, controlPlane.KCP, controlPlane.Machines)
168169
setMachinesUpToDateCondition(ctx, controlPlane.KCP, controlPlane.Machines)
169170
setRemediatingCondition(ctx, controlPlane.KCP, controlPlane.MachinesToBeRemediatedByKCP(), controlPlane.UnhealthyMachines())
170-
171-
// TODO: Available, Deleting
171+
// TODO: Deleting
172+
setAvailableCondition(ctx, controlPlane.KCP, controlPlane.IsEtcdManaged(), controlPlane.EtcdMembers, controlPlane.EtcdMembersAgreeOnMemberList, controlPlane.EtcdMembersAgreeOnClusterID, controlPlane.EtcdMembersAndMachinesAreMatching, controlPlane.Machines)
172173
}
173174

174175
func setReplicas(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, machines collections.Machines) {
@@ -423,6 +424,139 @@ func setRemediatingCondition(ctx context.Context, kcp *controlplanev1.KubeadmCon
423424
})
424425
}
425426

427+
func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList bool, etcdMembersAgreeOnClusterID bool, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) {
428+
if !kcp.Status.Initialized {
429+
v1beta2conditions.Set(kcp, metav1.Condition{
430+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
431+
Status: metav1.ConditionFalse,
432+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
433+
Message: "Control plane not yet initialized",
434+
})
435+
return
436+
}
437+
438+
if etcdIsManaged && etcdMembers == nil {
439+
v1beta2conditions.Set(kcp, metav1.Condition{
440+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
441+
Status: metav1.ConditionUnknown,
442+
Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason,
443+
Message: "Failed to get etcd members",
444+
})
445+
return
446+
}
447+
448+
if etcdIsManaged && !etcdMembersAgreeOnMemberList {
449+
v1beta2conditions.Set(kcp, metav1.Condition{
450+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
451+
Status: metav1.ConditionFalse,
452+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
453+
Message: "At least one etcd member reports a list of etcd members different than the list reported by other members",
454+
})
455+
return
456+
}
457+
458+
if etcdIsManaged && !etcdMembersAgreeOnClusterID {
459+
v1beta2conditions.Set(kcp, metav1.Condition{
460+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
461+
Status: metav1.ConditionFalse,
462+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
463+
Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members",
464+
})
465+
return
466+
}
467+
468+
if etcdIsManaged && !etcdMembersAndMachinesAreMatching {
469+
v1beta2conditions.Set(kcp, metav1.Condition{
470+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
471+
Status: metav1.ConditionFalse,
472+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
473+
Message: "The list of etcd members does not match the list of Machines and Nodes",
474+
})
475+
return
476+
}
477+
478+
// Determine control plane availability looking at machines conditions, which at this stage are
479+
// already surfacing status from etcd member and all control plane pods hosted on every machine.
480+
// Note: we intentionally use the number of etcd members for determine the etcd quorum because
481+
// etcd members could not match with machines, e.g. while provisioning a new machine.
482+
etcdQuorum := (len(etcdMembers) / 2.0) + 1
483+
k8sControlPlaneHealthy := 0
484+
etcdMembersHealthy := 0
485+
for _, machine := range machines {
486+
// if external etcd, only look at the status of the K8s control plane components on this machine.
487+
if !etcdIsManaged {
488+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
489+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
490+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
491+
k8sControlPlaneHealthy++
492+
}
493+
continue
494+
}
495+
496+
// Otherwise, etcd is managed.
497+
// In this case, when looking at the k8s control plane we should consider how kubeadm layouts control plane components,
498+
// and more specifically:
499+
// - API server on one machine only connect to the local etcd member
500+
// - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
501+
// As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy.
502+
503+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
504+
etcdMembersHealthy++
505+
}
506+
507+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
508+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
509+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) &&
510+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) &&
511+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
512+
k8sControlPlaneHealthy++
513+
}
514+
}
515+
516+
if kcp.DeletionTimestamp.IsZero() &&
517+
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
518+
k8sControlPlaneHealthy >= 1 &&
519+
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
520+
v1beta2conditions.Set(kcp, metav1.Condition{
521+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
522+
Status: metav1.ConditionTrue,
523+
Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason,
524+
})
525+
return
526+
}
527+
528+
messages := []string{}
529+
if !kcp.DeletionTimestamp.IsZero() {
530+
messages = append(messages, "Control plane metadata.deletionTimestamp is set")
531+
}
532+
533+
if !v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
534+
messages = append(messages, "Control plane certificates are not available")
535+
}
536+
537+
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
538+
switch etcdMembersHealthy {
539+
case 0:
540+
messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required", etcdQuorum))
541+
case 1:
542+
messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required", etcdQuorum))
543+
default:
544+
messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required", etcdMembersHealthy, etcdQuorum))
545+
}
546+
}
547+
548+
if k8sControlPlaneHealthy < 1 {
549+
messages = append(messages, "There are no healthy control plane instances, at least 1 required")
550+
}
551+
552+
v1beta2conditions.Set(kcp, metav1.Condition{
553+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
554+
Status: metav1.ConditionFalse,
555+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
556+
Message: strings.Join(messages, ";"),
557+
})
558+
}
559+
426560
func aggregateStaleMachines(machines collections.Machines) string {
427561
if len(machines) == 0 {
428562
return ""

0 commit comments

Comments
 (0)