Skip to content

Commit 2bb2099

Browse files
authored
Merge pull request #11383 from fabriziopandini/add-kcp-available-condition
✨ Add v1beta2 available condition to KCP
2 parents bf61b1e + 5c3b51f commit 2bb2099

File tree

6 files changed

+892
-32
lines changed

6 files changed

+892
-32
lines changed

controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go

+21-3
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,32 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
2020

2121
// KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version.
2222
const (
23-
// KubeadmControlPlaneAvailableV1Beta2Condition is True if the control plane can be reached, EtcdClusterHealthy is true,
24-
// and CertificatesAvailable is true.
23+
// KubeadmControlPlaneAvailableV1Beta2Condition is true if KubeadmControlPlane is not deleted, `CertificatesAvailable` is true,
24+
// at least one Machine has healthy control plane components, and etcd has enough operational members to meet quorum requirements.
25+
// More specifically, considering how kubeadm lays out components:
26+
// - Kubernetes API server, scheduler and controller manager health is inferred by the status of
27+
// the corresponding Pods hosted on each machine.
28+
// - In case of managed etcd, also a healthy etcd Pod and a healthy etcd member must exist on the same
29+
// machine with the healthy Kubernetes API server, scheduler and controller manager, otherwise the k8s control
30+
// plane cannot be considered operational (if etcd is not operational on a machine, most likely also API server,
31+
// scheduler and controller manager on the same machine will be impacted).
32+
// - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped.
2533
KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition
34+
35+
// KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the
36+
// etcd cluster hosted on KubeadmControlPlane controlled machines.
37+
KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason
38+
39+
// KubeadmControlPlaneAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is available.
40+
KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason
41+
42+
// KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is not available.
43+
KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason
2644
)
2745

2846
// KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version.
2947
const (
30-
// KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept
48+
// KubeadmControlPlaneInitializedV1Beta2Condition is true when the control plane is functional enough to accept
3149
// requests. This information is usually used as a signal for starting all the provisioning operations that
3250
// depend on a functional API server, but do not require a full HA control plane to exist.
3351
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized"

controlplane/kubeadm/internal/control_plane.go

+11
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
3131
"sigs.k8s.io/cluster-api/controllers/external"
3232
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
33+
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
3334
"sigs.k8s.io/cluster-api/util/collections"
3435
"sigs.k8s.io/cluster-api/util/failuredomains"
3536
"sigs.k8s.io/cluster-api/util/patch"
@@ -58,6 +59,16 @@ type ControlPlane struct {
5859
KubeadmConfigs map[string]*bootstrapv1.KubeadmConfig
5960
InfraResources map[string]*unstructured.Unstructured
6061

62+
// EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; also additional info below
63+
// comes from the same func.
64+
// NOTE: This info is computed based on what KCP was able to collect during inspection (e.g. if on a 3 CP
65+
// control plane one etcd member is down, this info is based on the answers collected from two members only).
66+
// NOTE: This info is specifically designed for computing KCP's Available condition.
67+
EtcdMembers []*etcd.Member
68+
EtcdMembersAgreeOnMemberList bool
69+
EtcdMembersAgreeOnClusterID bool
70+
EtcdMembersAndMachinesAreMatching bool
71+
6172
managementCluster ManagementCluster
6273
workloadCluster WorkloadCluster
6374

controlplane/kubeadm/internal/controllers/status.go

+137-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3232
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
3333
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
34+
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
3435
"sigs.k8s.io/cluster-api/util/collections"
3536
"sigs.k8s.io/cluster-api/util/conditions"
3637
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
@@ -168,7 +169,7 @@ func (r *KubeadmControlPlaneReconciler) updateV1Beta2Status(ctx context.Context,
168169
setMachinesUpToDateCondition(ctx, controlPlane.KCP, controlPlane.Machines)
169170
setRemediatingCondition(ctx, controlPlane.KCP, controlPlane.MachinesToBeRemediatedByKCP(), controlPlane.UnhealthyMachines())
170171
setDeletingCondition(ctx, controlPlane.KCP, controlPlane.DeletingReason, controlPlane.DeletingMessage)
171-
// TODO: Available
172+
setAvailableCondition(ctx, controlPlane.KCP, controlPlane.IsEtcdManaged(), controlPlane.EtcdMembers, controlPlane.EtcdMembersAgreeOnMemberList, controlPlane.EtcdMembersAgreeOnClusterID, controlPlane.EtcdMembersAndMachinesAreMatching, controlPlane.Machines)
172173
}
173174

174175
func setReplicas(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, machines collections.Machines) {
@@ -441,6 +442,141 @@ func setDeletingCondition(_ context.Context, kcp *controlplanev1.KubeadmControlP
441442
})
442443
}
443444

445+
func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList, etcdMembersAgreeOnClusterID, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) {
446+
if !kcp.Status.Initialized {
447+
v1beta2conditions.Set(kcp, metav1.Condition{
448+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
449+
Status: metav1.ConditionFalse,
450+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
451+
Message: "Control plane not yet initialized",
452+
})
453+
return
454+
}
455+
456+
if etcdIsManaged {
457+
if etcdMembers == nil {
458+
v1beta2conditions.Set(kcp, metav1.Condition{
459+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
460+
Status: metav1.ConditionUnknown,
461+
Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason,
462+
Message: "Failed to get etcd members",
463+
})
464+
return
465+
}
466+
467+
if !etcdMembersAgreeOnMemberList {
468+
v1beta2conditions.Set(kcp, metav1.Condition{
469+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
470+
Status: metav1.ConditionFalse,
471+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
472+
Message: "At least one etcd member reports a list of etcd members different than the list reported by other members",
473+
})
474+
return
475+
}
476+
477+
if !etcdMembersAgreeOnClusterID {
478+
v1beta2conditions.Set(kcp, metav1.Condition{
479+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
480+
Status: metav1.ConditionFalse,
481+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
482+
Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members",
483+
})
484+
return
485+
}
486+
487+
if !etcdMembersAndMachinesAreMatching {
488+
v1beta2conditions.Set(kcp, metav1.Condition{
489+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
490+
Status: metav1.ConditionFalse,
491+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
492+
Message: "The list of etcd members does not match the list of Machines and Nodes",
493+
})
494+
return
495+
}
496+
}
497+
498+
// Determine control plane availability looking at machines conditions, which at this stage are
499+
// already surfacing status from etcd member and all control plane pods hosted on every machine.
500+
// Note: we intentionally use the number of etcd members to determine the etcd quorum because
501+
// etcd members might not match with machines, e.g. while provisioning a new machine.
502+
etcdQuorum := (len(etcdMembers) / 2.0) + 1
503+
k8sControlPlaneHealthy := 0
504+
etcdMembersHealthy := 0
505+
for _, machine := range machines {
506+
// if external etcd, only look at the status of the K8s control plane components on this machine.
507+
if !etcdIsManaged {
508+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
509+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
510+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
511+
k8sControlPlaneHealthy++
512+
}
513+
continue
514+
}
515+
516+
// Otherwise, etcd is managed.
517+
// In this case, when looking at the k8s control plane we should consider how kubeadm layouts control plane components,
518+
// and more specifically:
519+
// - API server on one machine only connect to the local etcd member
520+
// - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
521+
// As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy.
522+
523+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
524+
etcdMembersHealthy++
525+
}
526+
527+
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
528+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
529+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) &&
530+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) &&
531+
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
532+
k8sControlPlaneHealthy++
533+
}
534+
}
535+
536+
if kcp.DeletionTimestamp.IsZero() &&
537+
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
538+
k8sControlPlaneHealthy >= 1 &&
539+
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
540+
v1beta2conditions.Set(kcp, metav1.Condition{
541+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
542+
Status: metav1.ConditionTrue,
543+
Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason,
544+
})
545+
return
546+
}
547+
548+
messages := []string{}
549+
if !kcp.DeletionTimestamp.IsZero() {
550+
messages = append(messages, "Control plane metadata.deletionTimestamp is set")
551+
}
552+
553+
if !v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
554+
messages = append(messages, "Control plane certificates are not available")
555+
}
556+
557+
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
558+
switch etcdMembersHealthy {
559+
case 0:
560+
messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
561+
case 1:
562+
messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
563+
default:
564+
messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", etcdMembersHealthy, etcdQuorum))
565+
}
566+
}
567+
568+
if k8sControlPlaneHealthy < 1 {
569+
messages = append(messages, "There are no Machines with healthy control plane components, at least 1 required")
570+
}
571+
572+
v1beta2conditions.Set(kcp, metav1.Condition{
573+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
574+
Status: metav1.ConditionFalse,
575+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
576+
Message: strings.Join(messages, ";"),
577+
})
578+
}
579+
444580
func aggregateStaleMachines(machines collections.Machines) string {
445581
if len(machines) == 0 {
446582
return ""

0 commit comments

Comments
 (0)