@@ -19,13 +19,13 @@ package controllers
19
19
import (
20
20
"context"
21
21
"fmt"
22
- "time"
23
22
24
23
"github.com/pkg/errors"
25
- apicorev1 "k8s.io/api/core/v1"
24
+ corev1 "k8s.io/api/core/v1"
26
25
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha4"
27
26
"sigs.k8s.io/cluster-api/controllers/noderefutil"
28
27
"sigs.k8s.io/cluster-api/util"
28
+ "sigs.k8s.io/cluster-api/util/conditions"
29
29
ctrl "sigs.k8s.io/controller-runtime"
30
30
"sigs.k8s.io/controller-runtime/pkg/client"
31
31
)
@@ -34,24 +34,14 @@ var (
34
34
ErrNodeNotFound = errors .New ("cannot find node with matching ProviderID" )
35
35
)
36
36
37
- func (r * MachineReconciler ) reconcileNodeRef (ctx context.Context , cluster * clusterv1.Cluster , machine * clusterv1.Machine ) (ctrl.Result , error ) {
38
- log := ctrl .LoggerFrom (ctx , "cluster" , cluster .Name )
39
-
40
- // Check that the Machine hasn't been deleted or in the process.
41
- if ! machine .DeletionTimestamp .IsZero () {
42
- return ctrl.Result {}, nil
43
- }
44
-
45
- // Check that the Machine doesn't already have a NodeRef.
46
- if machine .Status .NodeRef != nil {
47
- return ctrl.Result {}, nil
48
- }
49
-
37
+ func (r * MachineReconciler ) reconcileNode (ctx context.Context , cluster * clusterv1.Cluster , machine * clusterv1.Machine ) (ctrl.Result , error ) {
38
+ log := ctrl .LoggerFrom (ctx , "machine" , machine .Name , "namespace" , machine .Namespace )
50
39
log = log .WithValues ("cluster" , cluster .Name )
51
40
52
41
// Check that the Machine has a valid ProviderID.
53
42
if machine .Spec .ProviderID == nil || * machine .Spec .ProviderID == "" {
54
- log .Info ("Machine doesn't have a valid ProviderID yet" )
43
+ log .Info ("Cannot reconcile Machine's Node, no valid ProviderID yet" )
44
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .WaitingForNodeRefReason , clusterv1 .ConditionSeverityInfo , "" )
55
45
return ctrl.Result {}, nil
56
46
}
57
47
@@ -65,29 +55,93 @@ func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clust
65
55
return ctrl.Result {}, err
66
56
}
67
57
68
- // Get the Node reference.
69
- nodeRef , err := r .getNodeReference (ctx , remoteClient , providerID )
58
+ // Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy
59
+ node , err := r .getNode (ctx , remoteClient , providerID )
70
60
if err != nil {
71
61
if err == ErrNodeNotFound {
72
- log .Info (fmt .Sprintf ("Cannot assign NodeRef to Machine: %s, requeuing" , ErrNodeNotFound .Error ()))
73
- return ctrl.Result {RequeueAfter : 20 * time .Second }, nil
62
+ // While a NodeRef is set in the status, failing to get that node means the node is deleted.
63
+ // If Status.NodeRef is not set before, node still can be in the provisioning state.
64
+ if machine .Status .NodeRef != nil {
65
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .NodeNotFoundReason , clusterv1 .ConditionSeverityError , "" )
66
+ return ctrl.Result {}, errors .Wrapf (err , "no matching Node for Machine %q in namespace %q" , machine .Name , machine .Namespace )
67
+ }
68
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .NodeProvisioningReason , clusterv1 .ConditionSeverityWarning , "" )
69
+ return ctrl.Result {Requeue : true }, nil
74
70
}
75
- log .Error (err , "Failed to assign NodeRef " )
76
- r .recorder .Event (machine , apicorev1 .EventTypeWarning , "FailedSetNodeRef " , err .Error ())
71
+ log .Error (err , "Failed to retrieve Node by ProviderID " )
72
+ r .recorder .Event (machine , corev1 .EventTypeWarning , "Failed to retrieve Node by ProviderID " , err .Error ())
77
73
return ctrl.Result {}, err
78
74
}
79
75
80
76
// Set the Machine NodeRef.
81
- machine .Status .NodeRef = nodeRef
82
- log .Info ("Set Machine's NodeRef" , "noderef" , machine .Status .NodeRef .Name )
83
- r .recorder .Event (machine , apicorev1 .EventTypeNormal , "SuccessfulSetNodeRef" , machine .Status .NodeRef .Name )
77
+ if machine .Status .NodeRef == nil {
78
+ machine .Status .NodeRef = & corev1.ObjectReference {
79
+ Kind : node .Kind ,
80
+ APIVersion : node .APIVersion ,
81
+ Name : node .Name ,
82
+ UID : node .UID ,
83
+ }
84
+ log .Info ("Set Machine's NodeRef" , "noderef" , machine .Status .NodeRef .Name )
85
+ r .recorder .Event (machine , corev1 .EventTypeNormal , "SuccessfulSetNodeRef" , machine .Status .NodeRef .Name )
86
+ }
87
+
88
+ // Do the remaining node health checks, then set the node health to true if all checks pass.
89
+ status , message := summarizeNodeConditions (node )
90
+ if status == corev1 .ConditionFalse {
91
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .NodeConditionsFailedReason , clusterv1 .ConditionSeverityWarning , message )
92
+ return ctrl.Result {}, nil
93
+ }
94
+
95
+ conditions .MarkTrue (machine , clusterv1 .MachineNodeHealthyCondition )
84
96
return ctrl.Result {}, nil
85
97
}
86
98
87
- func (r * MachineReconciler ) getNodeReference (ctx context.Context , c client.Reader , providerID * noderefutil.ProviderID ) (* apicorev1.ObjectReference , error ) {
99
+ // summarizeNodeConditions summarizes a Node's conditions and returns the summary of condition statuses and concatenate failed condition messages:
100
+ // if there is at least 1 semantically-negative condition, summarized status = False;
101
+ // if there is at least 1 semantically-positive condition when there is 0 semantically negative condition, summarized status = True;
102
+ // if all conditions are unknown, summarized status = Unknown.
103
+ // (semantically true conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.)
104
+ func summarizeNodeConditions (node * corev1.Node ) (corev1.ConditionStatus , string ) {
105
+ totalNumOfConditionsChecked := 4
106
+ semanticallyFalseStatus := 0
107
+ unknownStatus := 0
108
+
109
+ message := ""
110
+ for _ , condition := range node .Status .Conditions {
111
+ switch condition .Type {
112
+ case corev1 .NodeMemoryPressure , corev1 .NodeDiskPressure , corev1 .NodePIDPressure :
113
+ if condition .Status != corev1 .ConditionFalse {
114
+ message += fmt .Sprintf ("Node condition %s is %s" , condition .Type , condition .Status ) + ". "
115
+ if condition .Status == corev1 .ConditionUnknown {
116
+ unknownStatus ++
117
+ continue
118
+ }
119
+ semanticallyFalseStatus ++
120
+ }
121
+ case corev1 .NodeReady :
122
+ if condition .Status != corev1 .ConditionTrue {
123
+ message += fmt .Sprintf ("Node condition %s is %s" , condition .Type , condition .Status ) + ". "
124
+ if condition .Status == corev1 .ConditionUnknown {
125
+ unknownStatus ++
126
+ continue
127
+ }
128
+ semanticallyFalseStatus ++
129
+ }
130
+ }
131
+ }
132
+ if semanticallyFalseStatus > 0 {
133
+ return corev1 .ConditionFalse , message
134
+ }
135
+ if semanticallyFalseStatus + unknownStatus < totalNumOfConditionsChecked {
136
+ return corev1 .ConditionTrue , message
137
+ }
138
+ return corev1 .ConditionUnknown , message
139
+ }
140
+
141
+ func (r * MachineReconciler ) getNode (ctx context.Context , c client.Reader , providerID * noderefutil.ProviderID ) (* corev1.Node , error ) {
88
142
log := ctrl .LoggerFrom (ctx , "providerID" , providerID )
89
143
90
- nodeList := apicorev1 .NodeList {}
144
+ nodeList := corev1 .NodeList {}
91
145
for {
92
146
if err := c .List (ctx , & nodeList , client .Continue (nodeList .Continue )); err != nil {
93
147
return nil , err
@@ -101,12 +155,7 @@ func (r *MachineReconciler) getNodeReference(ctx context.Context, c client.Reade
101
155
}
102
156
103
157
if providerID .Equals (nodeProviderID ) {
104
- return & apicorev1.ObjectReference {
105
- Kind : node .Kind ,
106
- APIVersion : node .APIVersion ,
107
- Name : node .Name ,
108
- UID : node .UID ,
109
- }, nil
158
+ return & node , nil
110
159
}
111
160
}
112
161
0 commit comments