Commit 9506423

Sedef committed
Add Node related condition to Machine conditions
1 parent bf2030b · commit 9506423

7 files changed: +469 −111 lines

api/v1alpha3/condition_consts.go

+21 −3
@@ -109,9 +109,6 @@ const (
 	// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
 	MachineHasFailureReason = "MachineHasFailure"
 
-	// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
-	NodeNotFoundReason = "NodeNotFound"
-
 	// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
 	NodeStartupTimeoutReason = "NodeStartupTimeout"
@@ -127,3 +124,24 @@ const (
 	// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
 	WaitingForRemediationReason = "WaitingForRemediation"
 )
+
+// Conditions and condition reasons for the Machine's Node object.
+const (
+	// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
+	// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
+	MachineNodeHealthyCondition ConditionType = "NodeHealthy"
+
+	// WaitingForNodeRefReason (Severity=Info) documents that machine.spec.providerId is not yet assigned.
+	WaitingForNodeRefReason = "WaitingForNodeRef"
+
+	// NodeProvisioningReason (Severity=Info) documents a machine in the process of provisioning a node.
+	// NB. provisioning --> NodeRef == ""
+	NodeProvisioningReason = "NodeProvisioning"
+
+	// NodeNotFoundReason (Severity=Error) documents a machine's node that has previously been observed but is now gone.
+	// NB. provisioned --> NodeRef != ""
+	NodeNotFoundReason = "NodeNotFound"
+
+	// NodeConditionsFailedReason (Severity=Warning) documents a node not being in a healthy state due to at least one failed Kubelet condition.
+	NodeConditionsFailedReason = "NodeConditionsFailed"
+)
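
Downstream consumers don't need to re-inspect the Node; they can read the summarized condition off the Machine. A minimal sketch, assuming the getter helpers (IsTrue, GetReason) in sigs.k8s.io/cluster-api/util/conditions; describeNodeHealth is a hypothetical name:

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// describeNodeHealth is a hypothetical helper showing how the new condition
// and its reasons surface to a caller holding a Machine.
func describeNodeHealth(m *clusterv1.Machine) string {
	if conditions.IsTrue(m, clusterv1.MachineNodeHealthyCondition) {
		return "node is healthy"
	}
	// GetReason yields one of the reasons defined above, e.g.
	// WaitingForNodeRef, NodeProvisioning, NodeNotFound, NodeConditionsFailed.
	return conditions.GetReason(m, clusterv1.MachineNodeHealthyCondition)
}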

controllers/machine_controller.go

+12 −1
@@ -271,7 +271,7 @@ func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cl
 	phases := []func(context.Context, *clusterv1.Cluster, *clusterv1.Machine) (ctrl.Result, error){
 		r.reconcileBootstrap,
 		r.reconcileInfrastructure,
-		r.reconcileNodeRef,
+		r.reconcileNode,
 	}
 
 	res := ctrl.Result{}
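
For orientation, reconcile runs the phases slice as a small pipeline: every phase executes, errors are aggregated, and the soonest non-zero requeue wins. A rough sketch of that aggregation idiom under illustrative names (runPhases, soonerResult); the repo's actual helper may differ:

import (
	"context"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
	ctrl "sigs.k8s.io/controller-runtime"
)

// runPhases is a hypothetical stand-in for the aggregation done by reconcile:
// every phase runs, errors are aggregated, and the soonest requeue is kept.
func runPhases(ctx context.Context, phases []func(context.Context) (ctrl.Result, error)) (ctrl.Result, error) {
	res := ctrl.Result{}
	var errs []error
	for _, phase := range phases {
		phaseResult, err := phase(ctx)
		if err != nil {
			errs = append(errs, err)
			continue
		}
		res = soonerResult(res, phaseResult)
	}
	return res, kerrors.NewAggregate(errs)
}

// soonerResult picks the result that requeues soonest; a zero result loses
// to any non-zero one.
func soonerResult(a, b ctrl.Result) ctrl.Result {
	zero := ctrl.Result{}
	if a == zero {
		return b
	}
	if b == zero {
		return a
	}
	if a.Requeue { // an immediate requeue beats any delayed one
		return a
	}
	if b.Requeue {
		return b
	}
	if a.RequeueAfter < b.RequeueAfter {
		return a
	}
	return b
}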
@@ -355,6 +355,16 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 	// Return early and don't remove the finalizer if we got an error or
 	// the external reconciliation deletion isn't ready.
 
+	patchHelper, err := patch.NewHelper(m, r.Client)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
+	if err := patchMachine(ctx, patchHelper, m); err != nil {
+		conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityInfo, "")
+		return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
+	}
+
 	if ok, err := r.reconcileDeleteInfrastructure(ctx, m); !ok || err != nil {
 		return ctrl.Result{}, err
 	}
@@ -377,6 +387,7 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 		})
 		if waitErr != nil {
 			logger.Error(deleteNodeErr, "Timed out deleting node, moving on", "node", m.Status.NodeRef.Name)
+			conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityWarning, "")
 			r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's node: %v", deleteNodeErr)
 		}
 	}
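
The block added above follows the snapshot-then-patch idiom: patch.NewHelper records the Machine's current state, conditions are mutated in memory, and the later patch sends only the delta, which keeps condition writes from clobbering concurrent updates. A minimal sketch of the idiom, assuming util/patch from this repo (markNodeHealthyFalse is an illustrative name):

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// markNodeHealthyFalse is a hypothetical helper illustrating the idiom:
// snapshot, mutate in memory, then patch only the delta.
func markNodeHealthyFalse(ctx context.Context, c client.Client, m *clusterv1.Machine) error {
	helper, err := patch.NewHelper(m, c) // snapshot of m is taken here
	if err != nil {
		return err
	}
	conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition,
		clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
	return helper.Patch(ctx, m) // diffs against the snapshot and patches the API server
}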

controllers/machine_controller_noderef.go

+81 −32
@@ -19,13 +19,13 @@ package controllers
 import (
 	"context"
 	"fmt"
-	"time"
 
 	"github.com/pkg/errors"
-	apicorev1 "k8s.io/api/core/v1"
+	corev1 "k8s.io/api/core/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 	"sigs.k8s.io/cluster-api/controllers/noderefutil"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/conditions"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
@@ -34,23 +34,13 @@ var (
 	ErrNodeNotFound = errors.New("cannot find node with matching ProviderID")
 )
 
-func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
+func (r *MachineReconciler) reconcileNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (ctrl.Result, error) {
 	logger := r.Log.WithValues("machine", machine.Name, "namespace", machine.Namespace)
-	// Check that the Machine hasn't been deleted or in the process.
-	if !machine.DeletionTimestamp.IsZero() {
-		return ctrl.Result{}, nil
-	}
-
-	// Check that the Machine doesn't already have a NodeRef.
-	if machine.Status.NodeRef != nil {
-		return ctrl.Result{}, nil
-	}
-
-	logger = logger.WithValues("cluster", cluster.Name)
 
 	// Check that the Machine has a valid ProviderID.
 	if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
-		logger.Info("Machine doesn't have a valid ProviderID yet")
+		logger.Info("Cannot reconcile Machine's Node, no valid ProviderID yet")
+		conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "")
 		return ctrl.Result{}, nil
 	}
@@ -64,29 +54,93 @@ func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clust
 		return ctrl.Result{}, err
 	}
 
-	// Get the Node reference.
-	nodeRef, err := r.getNodeReference(remoteClient, providerID)
+	// Even if Status.NodeRef exists, continue to do the following checks to make sure the Node is healthy.
+	node, err := r.getNode(remoteClient, providerID)
 	if err != nil {
 		if err == ErrNodeNotFound {
-			logger.Info(fmt.Sprintf("Cannot assign NodeRef to Machine: %s, requeuing", ErrNodeNotFound.Error()))
-			return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
+			// While a NodeRef is set in the status, failing to get that node means the node is deleted.
+			// If Status.NodeRef was not set before, the node can still be in the provisioning state.
+			if machine.Status.NodeRef != nil {
+				conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "")
+				return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
+			}
+			conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "")
+			return ctrl.Result{Requeue: true}, nil
 		}
-		logger.Error(err, "Failed to assign NodeRef")
-		r.recorder.Event(machine, apicorev1.EventTypeWarning, "FailedSetNodeRef", err.Error())
+		logger.Error(err, "Failed to retrieve Node by ProviderID")
+		r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
 		return ctrl.Result{}, err
 	}
 
 	// Set the Machine NodeRef.
-	machine.Status.NodeRef = nodeRef
-	logger.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
-	r.recorder.Event(machine, apicorev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
+	if machine.Status.NodeRef == nil {
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Kind:       node.Kind,
+			APIVersion: node.APIVersion,
+			Name:       node.Name,
+			UID:        node.UID,
+		}
+		logger.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
+		r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
+	}
+
+	// Do the remaining node health checks, then set the node health to true if all checks pass.
+	status, message := summarizeNodeConditions(node)
+	if status == corev1.ConditionFalse {
+		conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
+		return ctrl.Result{}, nil
+	}
+
+	conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)
 	return ctrl.Result{}, nil
 }
 
-func (r *MachineReconciler) getNodeReference(c client.Reader, providerID *noderefutil.ProviderID) (*apicorev1.ObjectReference, error) {
+// summarizeNodeConditions summarizes a Node's conditions and returns the summarized condition status plus a concatenation of the failed conditions' messages:
+// if there is at least one semantically-negative condition, the summarized status is False;
+// if there is at least one semantically-positive condition and no semantically-negative ones, the summarized status is True;
+// if all conditions are Unknown, the summarized status is Unknown.
+// (Semantically-positive conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == False, or Ready == True.)
+func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) {
+	totalNumOfConditionsChecked := 4
+	semanticallyFalseStatus := 0
+	unknownStatus := 0
+
+	message := ""
+	for _, condition := range node.Status.Conditions {
+		switch condition.Type {
+		case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure:
+			if condition.Status != corev1.ConditionFalse {
+				message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
+				if condition.Status == corev1.ConditionUnknown {
+					unknownStatus++
+					continue
+				}
+				semanticallyFalseStatus++
+			}
+		case corev1.NodeReady:
+			if condition.Status != corev1.ConditionTrue {
+				message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
+				if condition.Status == corev1.ConditionUnknown {
+					unknownStatus++
+					continue
+				}
+				semanticallyFalseStatus++
+			}
+		}
+	}
+	if semanticallyFalseStatus > 0 {
+		return corev1.ConditionFalse, message
+	}
+	if semanticallyFalseStatus+unknownStatus < totalNumOfConditionsChecked {
+		return corev1.ConditionTrue, message
+	}
+	return corev1.ConditionUnknown, message
+}
+
+func (r *MachineReconciler) getNode(c client.Reader, providerID *noderefutil.ProviderID) (*corev1.Node, error) {
 	logger := r.Log.WithValues("providerID", providerID)
 
-	nodeList := apicorev1.NodeList{}
+	nodeList := corev1.NodeList{}
 	for {
 		if err := c.List(context.TODO(), &nodeList, client.Continue(nodeList.Continue)); err != nil {
 			return nil, err
@@ -100,12 +154,7 @@ func (r *MachineReconciler) getNodeReference(c client.Reader, providerID *nodere
 			}
 
 			if providerID.Equals(nodeProviderID) {
-				return &apicorev1.ObjectReference{
-					Kind:       node.Kind,
-					APIVersion: node.APIVersion,
-					Name:       node.Name,
-					UID:        node.UID,
-				}, nil
+				return &node, nil
 			}
 		}
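
getNode pages through the cluster's Nodes with a continue token rather than listing everything at once. The same pagination idiom in isolation, as a sketch with forEachNode and visit as illustrative names:

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// forEachNode is a hypothetical helper: it walks every Node page by page,
// following Continue tokens until the server has no more to return.
func forEachNode(ctx context.Context, c client.Reader, visit func(corev1.Node) bool) error {
	nodeList := corev1.NodeList{}
	for {
		if err := c.List(ctx, &nodeList, client.Continue(nodeList.Continue)); err != nil {
			return err
		}
		for _, node := range nodeList.Items {
			if !visit(node) { // the visitor can stop early, e.g. on a ProviderID match
				return nil
			}
		}
		if nodeList.Continue == "" { // last page reached
			return nil
		}
	}
}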

controllers/machine_controller_noderef_test.go

+70 −7
@@ -26,11 +26,10 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/client-go/tools/record"
-	"sigs.k8s.io/controller-runtime/pkg/client/fake"
-	"sigs.k8s.io/controller-runtime/pkg/log"
-
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 	"sigs.k8s.io/cluster-api/controllers/noderefutil"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/log"
 )
 
 func TestGetNodeReference(t *testing.T) {
@@ -108,21 +107,85 @@ func TestGetNodeReference(t *testing.T) {
 			providerID, err := noderefutil.NewProviderID(test.providerID)
 			gt.Expect(err).NotTo(HaveOccurred(), "Expected no error parsing provider id %q, got %v", test.providerID, err)
 
-			reference, err := r.getNodeReference(client, providerID)
+			node, err := r.getNode(client, providerID)
 			if test.err == nil {
 				g.Expect(err).To(BeNil())
 			} else {
 				gt.Expect(err).NotTo(BeNil())
 				gt.Expect(err).To(Equal(test.err), "Expected error %v, got %v", test.err, err)
 			}
 
-			if test.expected == nil && reference == nil {
+			if test.expected == nil && node == nil {
 				return
 			}
 
-			gt.Expect(reference.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", reference.Name, test.expected.Name)
-			gt.Expect(reference.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", reference.Namespace, test.expected.Namespace)
+			gt.Expect(node.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", node.Name, test.expected.Name)
+			gt.Expect(node.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", node.Namespace, test.expected.Namespace)
 		})
 
 	}
 }
+
+func TestSummarizeNodeConditions(t *testing.T) {
+	testCases := []struct {
+		name       string
+		conditions []corev1.NodeCondition
+		status     corev1.ConditionStatus
+	}{
+		{
+			name: "node is healthy",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionFalse},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
+			},
+			status: corev1.ConditionTrue,
+		},
+		{
+			name: "all conditions are unknown",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
+			},
+			status: corev1.ConditionUnknown,
+		},
+		{
+			name: "multiple semantically failed conditions",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionTrue},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionTrue},
+			},
+			status: corev1.ConditionFalse,
+		},
+		{
+			name: "one positive condition when the rest are unknown",
+			conditions: []corev1.NodeCondition{
+				{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
+				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
+				{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
+			},
+			status: corev1.ConditionTrue,
+		},
+	}
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			g := NewWithT(t)
+			node := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-1",
+				},
+				Status: corev1.NodeStatus{
+					Conditions: test.conditions,
+				},
+			}
+			status, _ := summarizeNodeConditions(node)
+			g.Expect(status).To(Equal(test.status))
+		})
+	}
+}
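
The table above pins down only the summarized status; the message return value is also load-bearing. A hypothetical extra case in the same style (using Gomega's ContainSubstring), showing that an Unknown condition is still reported in the message even when the summary is True:

func TestSummarizeNodeConditionsMessage(t *testing.T) {
	g := NewWithT(t)
	node := &corev1.Node{
		Status: corev1.NodeStatus{
			Conditions: []corev1.NodeCondition{
				{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
				{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
			},
		},
	}
	status, message := summarizeNodeConditions(node)
	// One semantically-positive condition and no negative ones: summary is True,
	// but the Unknown MemoryPressure condition still shows up in the message.
	g.Expect(status).To(Equal(corev1.ConditionTrue))
	g.Expect(message).To(ContainSubstring("MemoryPressure"))
}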
