Skip to content

Commit 42b6249

Browse files
author
Sedef
committed
Add Node related condition to Machine conditions
1 parent 57a5454 commit 42b6249

8 files changed

+555
-198
lines changed

api/v1alpha3/condition_consts.go

+21-3
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,6 @@ const (
109109
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
110110
MachineHasFailureReason = "MachineHasFailure"
111111

112-
// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
113-
NodeNotFoundReason = "NodeNotFound"
114-
115112
// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
116113
NodeStartupTimeoutReason = "NodeStartupTimeout"
117114

@@ -127,3 +124,24 @@ const (
127124
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
128125
WaitingForRemediationReason = "WaitingForRemediation"
129126
)
127+
128+
// Conditions and condition Reasons for the Machine's Node object
129+
const (
130+
// MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
131+
// If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
132+
MachineNodeHealthyCondition ConditionType = "NodeHealthy"
133+
134+
// WaitingForNodeRefReason (Severity=Info) documents a machine.spec.providerId is not assigned yet.
135+
WaitingForNodeRefReason = "WaitingForNodeRef"
136+
137+
// NodeProvisioningReason (Severity=Info) documents machine in the process of provisioning a node.
138+
// NB. provisioning --> NodeRef == ""
139+
NodeProvisioningReason = "NodeProvisioning"
140+
141+
// NodeNotFoundReason (Severity=Error) documents a machine's node has previously been observed but is now gone.
142+
// NB. provisioned --> NodeRef != ""
143+
NodeNotFoundReason = "NodeNotFound"
144+
145+
// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
146+
NodeConditionsFailedReason = "NodeConditionsFailed"
147+
)

controllers/machine_controller.go

+12-2
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cl
249249
phases := []func(context.Context, *clusterv1.Cluster, *clusterv1.Machine) (ctrl.Result, error){
250250
r.reconcileBootstrap,
251251
r.reconcileInfrastructure,
252-
r.reconcileNodeRef,
252+
r.reconcileNode,
253253
}
254254

255255
res := ctrl.Result{}
@@ -333,6 +333,16 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
333333
// Return early and don't remove the finalizer if we got an error or
334334
// the external reconciliation deletion isn't ready.
335335

336+
patchHelper, err := patch.NewHelper(m, r.Client)
337+
if err != nil {
338+
return ctrl.Result{}, err
339+
}
340+
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
341+
if err := patchMachine(ctx, patchHelper, m); err != nil {
342+
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityInfo, "")
343+
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
344+
}
345+
336346
if ok, err := r.reconcileDeleteInfrastructure(ctx, m); !ok || err != nil {
337347
return ctrl.Result{}, err
338348
}
@@ -345,7 +355,6 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
345355
// https://github.com/kubernetes-sigs/cluster-api/issues/2565
346356
if isDeleteNodeAllowed {
347357
logger.Info("Deleting node", "node", m.Status.NodeRef.Name)
348-
349358
var deleteNodeErr error
350359
waitErr := wait.PollImmediate(2*time.Second, 10*time.Second, func() (bool, error) {
351360
if deleteNodeErr = r.deleteNode(ctx, cluster, m.Status.NodeRef.Name); deleteNodeErr != nil && !apierrors.IsNotFound(deleteNodeErr) {
@@ -355,6 +364,7 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
355364
})
356365
if waitErr != nil {
357366
logger.Error(deleteNodeErr, "Timed out deleting node, moving on", "node", m.Status.NodeRef.Name)
367+
conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityWarning, "")
358368
r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's node: %v", deleteNodeErr)
359369
}
360370
}
+166
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"context"
21+
"fmt"
22+
23+
"github.com/pkg/errors"
24+
corev1 "k8s.io/api/core/v1"
25+
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
26+
"sigs.k8s.io/cluster-api/controllers/noderefutil"
27+
"sigs.k8s.io/cluster-api/util"
28+
"sigs.k8s.io/cluster-api/util/conditions"
29+
"sigs.k8s.io/controller-runtime/pkg/client"
30+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
31+
)
32+
33+
var (
34+
ErrNodeNotFound = errors.New("cannot find node with matching ProviderID")
35+
)
36+
37+
func (r *MachineReconciler) reconcileNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (reconcile.Result, error) {
38+
logger := r.Log.WithValues("machine", machine.Name, "namespace", machine.Namespace)
39+
40+
// Check that the Machine has a valid ProviderID.
41+
if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
42+
logger.Info("Cannot reconcile node, the machine doesn't have a valid ProviderID yet")
43+
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "")
44+
return reconcile.Result{}, nil
45+
}
46+
47+
providerID, err := noderefutil.NewProviderID(*machine.Spec.ProviderID)
48+
if err != nil {
49+
return reconcile.Result{}, err
50+
}
51+
52+
remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
53+
if err != nil {
54+
return reconcile.Result{}, err
55+
}
56+
57+
// Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy
58+
node, err := r.getNode(remoteClient, providerID)
59+
if err != nil {
60+
if err == ErrNodeNotFound {
61+
// While a NodeRef is set in the status, failing to get that node means the node is deleted.
62+
// If Status.NodeRef is not set before, node still can be in the provisioning state.
63+
if machine.Status.NodeRef != nil {
64+
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "")
65+
return reconcile.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
66+
}
67+
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "")
68+
return reconcile.Result{Requeue: true}, nil
69+
}
70+
logger.Error(err, "Failed to retrieve Node by ProviderID")
71+
r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
72+
return reconcile.Result{}, err
73+
}
74+
75+
// Set the Machine NodeRef.
76+
if machine.Status.NodeRef == nil {
77+
machine.Status.NodeRef = &corev1.ObjectReference{
78+
Kind: node.Kind,
79+
APIVersion: node.APIVersion,
80+
Name: node.Name,
81+
UID: node.UID,
82+
}
83+
logger.Info("Set Machine's NodeRef", "noderef", machine.Status.NodeRef.Name)
84+
r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
85+
}
86+
87+
// Do the remaining node health checks, then set the node health to true if all checks pass.
88+
status, message := summarizeNodeConditions(node)
89+
if status == corev1.ConditionFalse {
90+
conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
91+
return reconcile.Result{}, nil
92+
}
93+
94+
conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)
95+
return reconcile.Result{}, nil
96+
}
97+
98+
// summarizeNodeConditions summarizes a Node's conditions and returns the summary of condition statuses and concatenate failed condition messages:
99+
// if there is at least 1 semantically-negative condition, summarized status = False;
100+
// if there is at least 1 semantically-positive condition when there is 0 semantically negative condition, summarized status = True;
101+
// if all conditions are unknown, summarized status = Unknown.
102+
// (semantically true conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.)
103+
func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) {
104+
totalNumOfConditionsChecked := 4
105+
semanticallyFalseStatus := 0
106+
unknownStatus := 0
107+
108+
message := ""
109+
for _, condition := range node.Status.Conditions {
110+
switch condition.Type {
111+
case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure:
112+
if condition.Status != corev1.ConditionFalse {
113+
message += fmt.Sprintf("Condition %s is %s", condition.Type, condition.Status) + ". "
114+
if condition.Status == corev1.ConditionUnknown {
115+
unknownStatus++
116+
continue
117+
}
118+
semanticallyFalseStatus++
119+
}
120+
case corev1.NodeReady:
121+
if condition.Status != corev1.ConditionTrue {
122+
message += fmt.Sprintf("Condition %s is %s", condition.Type, condition.Status) + ". "
123+
if condition.Status == corev1.ConditionUnknown {
124+
unknownStatus++
125+
continue
126+
}
127+
semanticallyFalseStatus++
128+
}
129+
}
130+
}
131+
if semanticallyFalseStatus > 0 {
132+
return corev1.ConditionFalse, message
133+
}
134+
if semanticallyFalseStatus+unknownStatus < totalNumOfConditionsChecked {
135+
return corev1.ConditionTrue, message
136+
}
137+
return corev1.ConditionUnknown, message
138+
}
139+
140+
func (r *MachineReconciler) getNode(c client.Reader, providerID *noderefutil.ProviderID) (*corev1.Node, error) {
141+
logger := r.Log.WithValues("providerID", providerID)
142+
143+
nodeList := corev1.NodeList{}
144+
for {
145+
if err := c.List(context.TODO(), &nodeList, client.Continue(nodeList.Continue)); err != nil {
146+
return nil, err
147+
}
148+
149+
for _, node := range nodeList.Items {
150+
nodeProviderID, err := noderefutil.NewProviderID(node.Spec.ProviderID)
151+
if err != nil {
152+
logger.Error(err, "Failed to parse ProviderID", "node", node.Name)
153+
continue
154+
}
155+
156+
if providerID.Equals(nodeProviderID) {
157+
return &node, nil
158+
}
159+
}
160+
161+
if nodeList.Continue == "" {
162+
break
163+
}
164+
}
165+
return nil, ErrNodeNotFound
166+
}

controllers/machine_controller_noderef_test.go renamed to controllers/machine_controller_node_test.go

+70-7
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,10 @@ import (
2626
"k8s.io/apimachinery/pkg/runtime"
2727
"k8s.io/client-go/kubernetes/scheme"
2828
"k8s.io/client-go/tools/record"
29-
"sigs.k8s.io/controller-runtime/pkg/client/fake"
30-
"sigs.k8s.io/controller-runtime/pkg/log"
31-
3229
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
3330
"sigs.k8s.io/cluster-api/controllers/noderefutil"
31+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
32+
"sigs.k8s.io/controller-runtime/pkg/log"
3433
)
3534

3635
func TestGetNodeReference(t *testing.T) {
@@ -108,21 +107,85 @@ func TestGetNodeReference(t *testing.T) {
108107
providerID, err := noderefutil.NewProviderID(test.providerID)
109108
gt.Expect(err).NotTo(HaveOccurred(), "Expected no error parsing provider id %q, got %v", test.providerID, err)
110109

111-
reference, err := r.getNodeReference(client, providerID)
110+
node, err := r.getNode(client, providerID)
112111
if test.err == nil {
113112
g.Expect(err).To(BeNil())
114113
} else {
115114
gt.Expect(err).NotTo(BeNil())
116115
gt.Expect(err).To(Equal(test.err), "Expected error %v, got %v", test.err, err)
117116
}
118117

119-
if test.expected == nil && reference == nil {
118+
if test.expected == nil && node == nil {
120119
return
121120
}
122121

123-
gt.Expect(reference.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", reference.Name, test.expected.Name)
124-
gt.Expect(reference.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", reference.Namespace, test.expected.Namespace)
122+
gt.Expect(node.Name).To(Equal(test.expected.Name), "Expected NodeRef's name to be %v, got %v", node.Name, test.expected.Name)
123+
gt.Expect(node.Namespace).To(Equal(test.expected.Namespace), "Expected NodeRef's namespace to be %v, got %v", node.Namespace, test.expected.Namespace)
125124
})
126125

127126
}
128127
}
128+
129+
func TestSummarizeNodeConditions(t *testing.T) {
130+
testCases := []struct {
131+
name string
132+
conditions []corev1.NodeCondition
133+
status corev1.ConditionStatus
134+
}{
135+
{
136+
name: "node is healthy",
137+
conditions: []corev1.NodeCondition{
138+
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
139+
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionFalse},
140+
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
141+
{Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
142+
},
143+
status: corev1.ConditionTrue,
144+
},
145+
{
146+
name: "all conditions are unknown",
147+
conditions: []corev1.NodeCondition{
148+
{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
149+
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
150+
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
151+
{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
152+
},
153+
status: corev1.ConditionUnknown,
154+
},
155+
{
156+
name: "multiple semantically failed condition",
157+
conditions: []corev1.NodeCondition{
158+
{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
159+
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionTrue},
160+
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionTrue},
161+
{Type: corev1.NodePIDPressure, Status: corev1.ConditionTrue},
162+
},
163+
status: corev1.ConditionFalse,
164+
},
165+
{
166+
name: "one positive condition when the rest is unknown",
167+
conditions: []corev1.NodeCondition{
168+
{Type: corev1.NodeReady, Status: corev1.ConditionTrue},
169+
{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
170+
{Type: corev1.NodeDiskPressure, Status: corev1.ConditionUnknown},
171+
{Type: corev1.NodePIDPressure, Status: corev1.ConditionUnknown},
172+
},
173+
status: corev1.ConditionTrue,
174+
},
175+
}
176+
for _, test := range testCases {
177+
t.Run(test.name, func(t *testing.T) {
178+
g := NewWithT(t)
179+
node := &corev1.Node{
180+
ObjectMeta: metav1.ObjectMeta{
181+
Name: "node-1",
182+
},
183+
Status: corev1.NodeStatus{
184+
Conditions: test.conditions,
185+
},
186+
}
187+
status, _ := summarizeNodeConditions(node)
188+
g.Expect(status).To(Equal(test.status))
189+
})
190+
}
191+
}

0 commit comments

Comments
 (0)