Skip to content

Commit b12ad96

Browse files
authored
Merge pull request #11425 from fabriziopandini/refine-v1beta2-kcp-available-condition
🌱 Refine v1beta2 Available condition in KCP
2 parents dc1051f + 58bc198 commit b12ad96

File tree

2 files changed

+255
-24
lines changed

2 files changed

+255
-24
lines changed

controlplane/kubeadm/internal/controllers/status.go

+98-10
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,20 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
474474

475475
if etcdIsManaged {
476476
if etcdMembers == nil {
477+
// If the control plane was only just initialized, allow some more time before reporting a failure to get etcd members.
478+
// Note: Two minutes is the time after which we assume that not getting the list of etcd members is an actual problem.
479+
if c := v1beta2conditions.Get(kcp, controlplanev1.KubeadmControlPlaneInitializedV1Beta2Condition); c != nil &&
480+
c.Status == metav1.ConditionTrue &&
481+
time.Since(c.LastTransitionTime.Time) < 2*time.Minute {
482+
v1beta2conditions.Set(kcp, metav1.Condition{
483+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
484+
Status: metav1.ConditionFalse,
485+
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
486+
Message: "Waiting for etcd to report the list of members",
487+
})
488+
return
489+
}
490+
477491
v1beta2conditions.Set(kcp, metav1.Condition{
478492
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
479493
Status: metav1.ConditionUnknown,
@@ -520,14 +534,21 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
520534
// etcd members might not match with machines, e.g. while provisioning a new machine.
521535
etcdQuorum := (len(etcdMembers) / 2.0) + 1
522536
k8sControlPlaneHealthy := 0
537+
k8sControlPlaneNotHealthy := 0
523538
etcdMembersHealthy := 0
539+
etcdMembersNotHealthy := 0
524540
for _, machine := range machines {
525541
// if external etcd, only look at the status of the K8s control plane components on this machine.
526542
if !etcdIsManaged {
527543
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
528544
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
529545
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
530546
k8sControlPlaneHealthy++
547+
} else if shouldSurfaceWhenAvailableTrue(machine,
548+
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition,
549+
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
550+
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) {
551+
k8sControlPlaneNotHealthy++
531552
}
532553
continue
533554
}
@@ -541,6 +562,9 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
541562

542563
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
543564
etcdMembersHealthy++
565+
} else if shouldSurfaceWhenAvailableTrue(machine,
566+
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) {
567+
etcdMembersNotHealthy++
544568
}
545569

546570
if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
@@ -549,53 +573,117 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
549573
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) &&
550574
v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
551575
k8sControlPlaneHealthy++
576+
} else if shouldSurfaceWhenAvailableTrue(machine,
577+
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition,
578+
controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition,
579+
controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition,
580+
controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
581+
controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
582+
k8sControlPlaneNotHealthy++
552583
}
553584
}
554585

555586
if kcp.DeletionTimestamp.IsZero() &&
556587
(!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) &&
557588
k8sControlPlaneHealthy >= 1 &&
558589
v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
590+
messages := []string{}
591+
592+
if etcdIsManaged && etcdMembersNotHealthy > 0 {
593+
switch len(etcdMembers) - etcdMembersNotHealthy {
594+
case 1:
595+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
596+
default:
597+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", len(etcdMembers)-etcdMembersNotHealthy, len(etcdMembers), etcdQuorum))
598+
}
599+
}
600+
601+
if k8sControlPlaneNotHealthy > 0 {
602+
switch len(machines) - k8sControlPlaneNotHealthy {
603+
case 1:
604+
messages = append(messages, fmt.Sprintf("* 1 of %d Machines has healthy control plane components, at least 1 required", len(machines)))
605+
default:
606+
messages = append(messages, fmt.Sprintf("* %d of %d Machines have healthy control plane components, at least 1 required", len(machines)-k8sControlPlaneNotHealthy, len(machines)))
607+
}
608+
}
609+
559610
v1beta2conditions.Set(kcp, metav1.Condition{
560-
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
561-
Status: metav1.ConditionTrue,
562-
Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason,
611+
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
612+
Status: metav1.ConditionTrue,
613+
Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason,
614+
Message: strings.Join(messages, "\n"),
563615
})
564616
return
565617
}
566618

567619
messages := []string{}
568620
if !kcp.DeletionTimestamp.IsZero() {
569-
messages = append(messages, "Control plane metadata.deletionTimestamp is set")
621+
messages = append(messages, "* Control plane metadata.deletionTimestamp is set")
570622
}
571623

572624
if !v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) {
573-
messages = append(messages, "Control plane certificates are not available")
625+
messages = append(messages, "* Control plane certificates are not available")
574626
}
575627

576628
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
577629
switch etcdMembersHealthy {
578630
case 0:
579-
messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
631+
messages = append(messages, fmt.Sprintf("* There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
580632
case 1:
581-
messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
633+
messages = append(messages, fmt.Sprintf("* 1 of %d etcd members is healthy, at least %d required for etcd quorum", len(etcdMembers), etcdQuorum))
582634
default:
583-
messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", etcdMembersHealthy, etcdQuorum))
635+
messages = append(messages, fmt.Sprintf("* %d of %d etcd members are healthy, at least %d required for etcd quorum", etcdMembersHealthy, len(etcdMembers), etcdQuorum))
584636
}
585637
}
586638

587639
if k8sControlPlaneHealthy < 1 {
588-
messages = append(messages, "There are no Machines with healthy control plane components, at least 1 required")
640+
messages = append(messages, "* There are no Machines with healthy control plane components, at least 1 required")
589641
}
590642

591643
v1beta2conditions.Set(kcp, metav1.Condition{
592644
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
593645
Status: metav1.ConditionFalse,
594646
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
595-
Message: strings.Join(messages, ";"),
647+
Message: strings.Join(messages, "\n"),
596648
})
597649
}
598650

651+
// shouldSurfaceWhenAvailableTrue defines when a control plane components/etcd issue should surface when
652+
// Available condition is true.
653+
// The main goal of this check is to avoid to surface false negatives/flakes, and thus it requires that
654+
// an issue exists for at least more than 10 seconds before surfacing it.
655+
func shouldSurfaceWhenAvailableTrue(machine *clusterv1.Machine, conditionTypes ...string) bool {
656+
// Get the min time when one of the conditions in input transitioned to false or unknown.
657+
var t *time.Time
658+
for _, conditionType := range conditionTypes {
659+
c := v1beta2conditions.Get(machine, conditionType)
660+
if c == nil {
661+
continue
662+
}
663+
if c.Status == metav1.ConditionTrue {
664+
continue
665+
}
666+
if t == nil {
667+
t = ptr.To(c.LastTransitionTime.Time)
668+
}
669+
t = ptr.To(minTime(*t, c.LastTransitionTime.Time))
670+
}
671+
672+
if t != nil {
673+
if time.Since(*t) > 10*time.Second {
674+
return true
675+
}
676+
}
677+
return false
678+
}
679+
680+
// minTime returns the earlier of t1 and t2; when neither is before the other, t1 is returned.
func minTime(t1, t2 time.Time) time.Time {
	if t2.Before(t1) {
		return t2
	}
	return t1
}
686+
599687
func aggregateStaleMachines(machines collections.Machines) string {
600688
if len(machines) == 0 {
601689
return ""

0 commit comments

Comments
 (0)