
Commit 463c545

Author: Sedef

Add Node related condition to Machine conditions

1 parent 3aac2fc · commit 463c545

8 files changed: +479 −87 lines

api/v1alpha3/condition_consts.go (+21 −3)
@@ -109,9 +109,6 @@ const (
 	// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
 	MachineHasFailureReason = "MachineHasFailure"
 
-	// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
-	NodeNotFoundReason = "NodeNotFound"
-
 	// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
 	NodeStartupTimeoutReason = "NodeStartupTimeout"
 
@@ -127,3 +124,24 @@ const (
 	// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
 	WaitingForRemediationReason = "WaitingForRemediation"
 )
+
+// Conditions and condition Reasons for the Machine's Node object
+const (
+	// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
+	// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
+	MachineNodeHealthyCondition ConditionType = "NodeHealthy"
+
+	// WaitingForNodeRefReason (Severity=Info) documents a machine.spec.providerId is not assigned yet.
+	WaitingForNodeRefReason = "WaitingForNodeRef"
+
+	// NodeProvisioningReason (Severity=Info) documents machine in the process of provisioning a node.
+	// NB. provisioning --> NodeRef == ""
+	NodeProvisioningReason = "NodeProvisioning"
+
+	// NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone.
+	// NB. provisioned --> NodeRef != ""
+	NodeNotFoundReason = "NodeNotFound"
+
+	// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
+	NodeConditionsFailedReason = "NodeConditionsFailed"
+)

api/v1alpha4/condition_consts.go (+21 −3)
@@ -116,9 +116,6 @@ const (
 	// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
 	MachineHasFailureReason = "MachineHasFailure"
 
-	// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
-	NodeNotFoundReason = "NodeNotFound"
-
 	// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
 	NodeStartupTimeoutReason = "NodeStartupTimeout"
 
@@ -134,3 +131,24 @@ const (
 	// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
 	WaitingForRemediationReason = "WaitingForRemediation"
 )
+
+// Conditions and condition Reasons for the Machine's Node object
+const (
+	// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
+	// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
+	MachineNodeHealthyCondition ConditionType = "NodeHealthy"
+
+	// WaitingForNodeRefReason (Severity=Info) documents a machine.spec.providerId is not assigned yet.
+	WaitingForNodeRefReason = "WaitingForNodeRef"
+
+	// NodeProvisioningReason (Severity=Info) documents machine in the process of provisioning a node.
+	// NB. provisioning --> NodeRef == ""
+	NodeProvisioningReason = "NodeProvisioning"
+
+	// NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone.
+	// NB. provisioned --> NodeRef != ""
+	NodeNotFoundReason = "NodeNotFound"
+
+	// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
+	NodeConditionsFailedReason = "NodeConditionsFailed"
+)
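
For orientation (not part of this commit's diff), below is a minimal sketch of how a consumer of the Machine API might read the new NodeHealthy condition, assuming the util/conditions helpers (IsTrue, Get) that the controllers in this commit already import; the nodeHealthSummary helper and its package are hypothetical.

package example // hypothetical consumer, not part of the commit

import (
	"fmt"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// nodeHealthSummary is a hypothetical helper that reports why a Machine's Node
// is (or is not) considered healthy, based on the NodeHealthy condition above.
func nodeHealthSummary(machine *clusterv1.Machine) string {
	if conditions.IsTrue(machine, clusterv1.MachineNodeHealthyCondition) {
		return "node is healthy"
	}
	// Get returns nil if the condition has not been set on the Machine yet.
	if c := conditions.Get(machine, clusterv1.MachineNodeHealthyCondition); c != nil {
		// Reason is one of the constants defined above, e.g. WaitingForNodeRef,
		// NodeProvisioning, NodeNotFound or NodeConditionsFailed.
		return fmt.Sprintf("node not healthy: %s (severity %s): %s", c.Reason, c.Severity, c.Message)
	}
	return "node health not reported yet"
}

The severity attached to each reason (Info, Warning, Error) is what lets such a consumer distinguish "not ready yet" from "node has gone missing".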

controllers/machine_controller.go (+12 −1)
@@ -263,7 +263,7 @@ func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cl
 	phases := []func(context.Context, *clusterv1.Cluster, *clusterv1.Machine) (ctrl.Result, error){
 		r.reconcileBootstrap,
 		r.reconcileInfrastructure,
-		r.reconcileNodeRef,
+		r.reconcileNode,
 	}
 
 	res := ctrl.Result{}
@@ -346,6 +346,16 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 	// Return early and don't remove the finalizer if we got an error or
 	// the external reconciliation deletion isn't ready.
 
+	patchHelper, err := patch.NewHelper(m, r.Client)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
+	if err := patchMachine(ctx, patchHelper, m); err != nil {
+		conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityInfo, "")
+		return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
+	}
+
 	if ok, err := r.reconcileDeleteInfrastructure(ctx, m); !ok || err != nil {
 		return ctrl.Result{}, err
 	}
@@ -368,6 +378,7 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 		})
 		if waitErr != nil {
 			log.Error(deleteNodeErr, "Timed out deleting node, moving on", "node", m.Status.NodeRef.Name)
+			conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityWarning, "")
 			r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's node: %v", deleteNodeErr)
 		}
 	}
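
For context (again, not part of the diff), the deletion path above follows a mark-then-patch pattern: the condition is set on the in-memory Machine, then persisted before any infrastructure is torn down. The sketch below assumes only the util/patch and util/conditions helpers imported by this controller; the real patchMachine helper may apply additional patch options.

package example // simplified sketch, not the controller's actual helper

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// markNodeHealthyDeleting records that the Machine's node is going away and
// persists the condition change, mirroring the flow added to reconcileDelete.
func markNodeHealthyDeleting(ctx context.Context, c client.Client, m *clusterv1.Machine) error {
	patchHelper, err := patch.NewHelper(m, c)
	if err != nil {
		return err
	}
	// Mark the condition before deleting infrastructure so consumers see
	// "Deleting" rather than a stale healthy state.
	conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
	return patchHelper.Patch(ctx, m)
}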

controllers/machine_controller_noderef.go (+82 −33)
@@ -19,13 +19,13 @@ package controllers
 import (
 	"context"
 	"fmt"
-	"time"
 
 	"github.com/pkg/errors"
-	apicorev1 "k8s.io/api/core/v1"
+	corev1 "k8s.io/api/core/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
 	"sigs.k8s.io/cluster-api/controllers/noderefutil"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/conditions"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
@@ -34,24 +34,14 @@ var (
 	ErrNodeNotFound = errors.New("cannot find node with matching ProviderID")
 )
 
-func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
-	log := ctrl.LoggerFrom(ctx, "cluster", cluster.Name)
-
-	// Check that the Machine hasn't been deleted or in the process.
-	if !machine.DeletionTimestamp.IsZero() {
-		return ctrl.Result{}, nil
-	}
-
-	// Check that the Machine doesn't already have a NodeRef.
-	if machine.Status.NodeRef != nil {
-		return ctrl.Result{}, nil
-	}
-
+func (r *MachineReconciler) reconcileNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
+	log := ctrl.LoggerFrom(ctx, "machine", machine.Name, "namespace", machine.Namespace)
 	log = log.WithValues("cluster", cluster.Name)
 
 	// Check that the Machine has a valid ProviderID.
 	if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
-		log.Info("Machine doesn't have a valid ProviderID yet")
+		log.Info("Cannot reconcile Machine's Node, no valid ProviderID yet")
+		conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "")
 		return ctrl.Result{}, nil
 	}
 
@@ -65,29 +55,93 @@ func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clust
 		return ctrl.Result{}, err
 	}
 
-	// Get the Node reference.
-	nodeRef, err := r.getNodeReference(ctx, remoteClient, providerID)
+	// Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy
+	node, err := r.getNode(ctx, remoteClient, providerID)
 	if err != nil {
 		if err == ErrNodeNotFound {
-			log.Info(fmt.Sprintf("Cannot assign NodeRef to Machine: %s, requeuing", ErrNodeNotFound.Error()))
-			return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
+			// While a NodeRef is set in the status, failing to get that node means the node is deleted.
+			// If Status.NodeRef is not set before, node still can be in the provisioning state.
+			if machine.Status.NodeRef != nil {
+				conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "")
+				return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
+			}
+			conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "")
+			return ctrl.Result{Requeue: true}, nil
 		}
-		log.Error(err, "Failed to assign NodeRef")
-		r.recorder.Event(machine, apicorev1.EventTypeWarning, "FailedSetNodeRef", err.Error())
+		log.Error(err, "Failed to retrieve Node by ProviderID")
+		r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
 		return ctrl.Result{}, err
 	}
 
 	// Set the Machine NodeRef.
-	machine.Status.NodeRef = nodeRef
-	log.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
-	r.recorder.Event(machine, apicorev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
+	if machine.Status.NodeRef == nil {
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Kind:       node.Kind,
+			APIVersion: node.APIVersion,
+			Name:       node.Name,
+			UID:        node.UID,
+		}
+		log.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
+		r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
+	}
+
+	// Do the remaining node health checks, then set the node health to true if all checks pass.
+	status, message := summarizeNodeConditions(node)
+	if status == corev1.ConditionFalse {
+		conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
+		return ctrl.Result{}, nil
+	}
+
+	conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)
 	return ctrl.Result{}, nil
 }
 
-func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reader, providerID *noderefutil.ProviderID) (*apicorev1.ObjectReference, error) {
+// summarizeNodeConditions summarizes a Node's conditions and returns the summary of condition statuses and concatenate failed condition messages:
+// if there is at least 1 semantically-negative condition, summarized status = False;
+// if there is at least 1 semantically-positive condition when there is 0 semantically negative condition, summarized status = True;
+// if all conditions are unknown, summarized status = Unknown.
+// (semantically true conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.)
+func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) {
+	totalNumOfConditionsChecked := 4
+	semanticallyFalseStatus := 0
+	unknownStatus := 0
+
+	message := ""
+	for _, condition := range node.Status.Conditions {
+		switch condition.Type {
+		case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure:
+			if condition.Status != corev1.ConditionFalse {
+				message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
+				if condition.Status == corev1.ConditionUnknown {
+					unknownStatus++
+					continue
+				}
+				semanticallyFalseStatus++
+			}
+		case corev1.NodeReady:
+			if condition.Status != corev1.ConditionTrue {
+				message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
+				if condition.Status == corev1.ConditionUnknown {
+					unknownStatus++
+					continue
+				}
+				semanticallyFalseStatus++
+			}
+		}
+	}
+	if semanticallyFalseStatus > 0 {
+		return corev1.ConditionFalse, message
+	}
+	if semanticallyFalseStatus+unknownStatus < totalNumOfConditionsChecked {
+		return corev1.ConditionTrue, message
+	}
+	return corev1.ConditionUnknown, message
+}
+
+func (r *MachineReconciler) getNode(ctx context.Context, c client.Reader, providerID *noderefutil.ProviderID) (*corev1.Node, error) {
 	log := ctrl.LoggerFrom(ctx, "providerID", providerID)
 
-	nodeList := apicorev1.NodeList{}
+	nodeList := corev1.NodeList{}
 	for {
 		if err := c.List(ctx, &nodeList, client.Continue(nodeList.Continue)); err != nil {
 			return nil, err
@@ -101,12 +155,7 @@ func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reade
 			}
 
 			if providerID.Equals(nodeProviderID) {
-				return &apicorev1.ObjectReference{
-					Kind:       node.Kind,
-					APIVersion: node.APIVersion,
-					Name:       node.Name,
-					UID:        node.UID,
-				}, nil
+				return &node, nil
 			}
 		}
 
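As background on the lookup above (not part of the commit), getNode matches a Machine to a Node by provider ID using the noderefutil package already imported in this file. A small sketch of that comparison follows; the example provider ID string and the machineBacksNode helper are illustrative only.

package example // illustrative sketch of the provider ID comparison

import (
	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/cluster-api/controllers/noderefutil"
)

// machineBacksNode reports whether a Node's reported provider ID matches the
// provider ID recorded on the Machine spec, the same check getNode performs
// for every Node in the list.
func machineBacksNode(machineProviderID string, node *corev1.Node) (bool, error) {
	machineID, err := noderefutil.NewProviderID(machineProviderID) // e.g. "aws:///us-east-1a/i-0123456789abcdef0" (illustrative)
	if err != nil {
		return false, err
	}
	nodeID, err := noderefutil.NewProviderID(node.Spec.ProviderID)
	if err != nil {
		return false, err
	}
	return machineID.Equals(nodeID), nil
}
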
controllers/machine_controller_noderef_test.go (+70 −7)
@@ -25,11 +25,10 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/client-go/tools/record"
-	"sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/client/fake"
-
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
 	"sigs.k8s.io/cluster-api/controllers/noderefutil"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 )
 
 func TestGetNodeReference(t *testing.T) {
@@ -106,21 +105,85 @@ func TestGetNodeReference(t *testing.T) {
 			providerID, err := noderefutil.NewProviderID(test.providerID)
 			gt.Expect(err).NotTo(HaveOccurred(), "Expected no error parsing provider id %q, got %v", test.providerID, err)
 
-			reference, err := r.getNodeReference(ctx, client, providerID)
+			node, err := r.getNode(ctx, client, providerID)
 			if test.err == nil {
 				g.Expect(err).To(BeNil())
 			} else {
 				gt.Expect(err).NotTo(BeNil())
 				gt.Expect(err).To(Equal(test.err), "Expected error %v, got %v", test.err, err)
 			}
 
-			if test.expected == nil && reference == nil {
+			if test.expected == nil && node == nil {
 				return
 			}
 
-			gt.Expect(reference.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", reference.Name, test.expected.Name)
-			gt.Expect(reference.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", reference.Namespace, test.expected.Namespace)
+			gt.Expect(node.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", node.Name, test.expected.Name)
+			gt.Expect(node.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", node.Namespace, test.expected.Namespace)
 		})
 
 	}
 }
+
+func TestSummarizeNodeConditions(t *testing.T) {
+	testCases := []struct {
+		name       string
+		conditions []corev1.NodeCondition
+		status     corev1.ConditionStatus
+	}{
+		{
+			name: "node is healthy",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionFalse},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
+			},
+			status: corev1.ConditionTrue,
+		},
+		{
+			name: "all conditions are unknown",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
+			},
+			status: corev1.ConditionUnknown,
+		},
+		{
+			name: "multiple semantically failed condition",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionTrue},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionTrue},
+			},
+			status: corev1.ConditionFalse,
+		},
+		{
+			name: "one positive condition when the rest is unknown",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
+			},
+			status: corev1.ConditionTrue,
+		},
+	}
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			g := NewWithT(t)
+			node := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-1",
+				},
+				Status: corev1.NodeStatus{
+					Conditions: test.conditions,
+				},
+			}
+			status, _ := summarizeNodeConditions(node)
+			g.Expect(status).To(Equal(test.status))
+		})
+	}
+}
