@@ -19,13 +19,13 @@ package controllers
19
19
import (
20
20
"context"
21
21
"fmt"
22
- "time"
23
22
24
23
"github.com/pkg/errors"
25
- apicorev1 "k8s.io/api/core/v1"
24
+ corev1 "k8s.io/api/core/v1"
26
25
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
27
26
"sigs.k8s.io/cluster-api/controllers/noderefutil"
28
27
"sigs.k8s.io/cluster-api/util"
28
+ "sigs.k8s.io/cluster-api/util/conditions"
29
29
ctrl "sigs.k8s.io/controller-runtime"
30
30
"sigs.k8s.io/controller-runtime/pkg/client"
31
31
)
@@ -34,23 +34,13 @@ var (
34
34
ErrNodeNotFound = errors .New ("cannot find node with matching ProviderID" )
35
35
)
36
36
37
- func (r * MachineReconciler ) reconcileNodeRef (ctx context.Context , cluster * clusterv1.Cluster , machine * clusterv1.Machine ) (ctrl.Result , error ) {
37
+ func (r * MachineReconciler ) reconcileNode (ctx context.Context , cluster * clusterv1.Cluster , machine * clusterv1.Machine ) (ctrl.Result , error ) {
38
38
logger := r .Log .WithValues ("machine" , machine .Name , "namespace" , machine .Namespace )
39
- // Check that the Machine hasn't been deleted or in the process.
40
- if ! machine .DeletionTimestamp .IsZero () {
41
- return ctrl.Result {}, nil
42
- }
43
-
44
- // Check that the Machine doesn't already have a NodeRef.
45
- if machine .Status .NodeRef != nil {
46
- return ctrl.Result {}, nil
47
- }
48
-
49
- logger = logger .WithValues ("cluster" , cluster .Name )
50
39
51
40
// Check that the Machine has a valid ProviderID.
52
41
if machine .Spec .ProviderID == nil || * machine .Spec .ProviderID == "" {
53
- logger .Info ("Machine doesn't have a valid ProviderID yet" )
42
+ logger .Info ("Cannot reconcile Machine's Node, no valid ProviderID yet" )
43
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .WaitingForNodeRefReason , clusterv1 .ConditionSeverityInfo , "" )
54
44
return ctrl.Result {}, nil
55
45
}
56
46
@@ -64,29 +54,93 @@ func (r *MachineReconciler) reconcileNodeRef(ctx context.Context, cluster *clust
64
54
return ctrl.Result {}, err
65
55
}
66
56
67
- // Get the Node reference.
68
- nodeRef , err := r .getNodeReference (remoteClient , providerID )
57
+ // Even if Status.NodeRef exists, continue to do the following checks to make sure Node is healthy
58
+ node , err := r .getNode (remoteClient , providerID )
69
59
if err != nil {
70
60
if err == ErrNodeNotFound {
71
- logger .Info (fmt .Sprintf ("Cannot assign NodeRef to Machine: %s, requeuing" , ErrNodeNotFound .Error ()))
72
- return ctrl.Result {RequeueAfter : 20 * time .Second }, nil
61
+ // While a NodeRef is set in the status, failing to get that node means the node is deleted.
62
+ // If Status.NodeRef is not set before, node still can be in the provisioning state.
63
+ if machine .Status .NodeRef != nil {
64
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .NodeNotFoundReason , clusterv1 .ConditionSeverityError , "" )
65
+ return ctrl.Result {}, errors .Wrapf (err , "no matching Node for Machine %q in namespace %q" , machine .Name , machine .Namespace )
66
+ }
67
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .NodeProvisioningReason , clusterv1 .ConditionSeverityWarning , "" )
68
+ return ctrl.Result {Requeue : true }, nil
73
69
}
74
- logger .Error (err , "Failed to assign NodeRef " )
75
- r .recorder .Event (machine , apicorev1 .EventTypeWarning , "FailedSetNodeRef " , err .Error ())
70
+ logger .Error (err , "Failed to retrieve Node by ProviderID " )
71
+ r .recorder .Event (machine , corev1 .EventTypeWarning , "Failed to retrieve Node by ProviderID " , err .Error ())
76
72
return ctrl.Result {}, err
77
73
}
78
74
79
75
// Set the Machine NodeRef.
80
- machine .Status .NodeRef = nodeRef
81
- logger .Info ("Set Machine's NodeRef" , "noderef" , machine .Status .NodeRef .Name )
82
- r .recorder .Event (machine , apicorev1 .EventTypeNormal , "SuccessfulSetNodeRef" , machine .Status .NodeRef .Name )
76
+ if machine .Status .NodeRef == nil {
77
+ machine .Status .NodeRef = & corev1.ObjectReference {
78
+ Kind : node .Kind ,
79
+ APIVersion : node .APIVersion ,
80
+ Name : node .Name ,
81
+ UID : node .UID ,
82
+ }
83
+ logger .Info ("Set Machine's NodeRef" , "noderef" , machine .Status .NodeRef .Name )
84
+ r .recorder .Event (machine , corev1 .EventTypeNormal , "SuccessfulSetNodeRef" , machine .Status .NodeRef .Name )
85
+ }
86
+
87
+ // Do the remaining node health checks, then set the node health to true if all checks pass.
88
+ status , message := summarizeNodeConditions (node )
89
+ if status == corev1 .ConditionFalse {
90
+ conditions .MarkFalse (machine , clusterv1 .MachineNodeHealthyCondition , clusterv1 .NodeConditionsFailedReason , clusterv1 .ConditionSeverityWarning , message )
91
+ return ctrl.Result {}, nil
92
+ }
93
+
94
+ conditions .MarkTrue (machine , clusterv1 .MachineNodeHealthyCondition )
83
95
return ctrl.Result {}, nil
84
96
}
85
97
86
- func (r * MachineReconciler ) getNodeReference (c client.Reader , providerID * noderefutil.ProviderID ) (* apicorev1.ObjectReference , error ) {
98
+ // summarizeNodeConditions summarizes a Node's conditions and returns the summary of condition statuses and concatenate failed condition messages:
99
+ // if there is at least 1 semantically-negative condition, summarized status = False;
100
+ // if there is at least 1 semantically-positive condition when there is 0 semantically negative condition, summarized status = True;
101
+ // if all conditions are unknown, summarized status = Unknown.
102
+ // (semantically true conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == false or Ready == true.)
103
+ func summarizeNodeConditions (node * corev1.Node ) (corev1.ConditionStatus , string ) {
104
+ totalNumOfConditionsChecked := 4
105
+ semanticallyFalseStatus := 0
106
+ unknownStatus := 0
107
+
108
+ message := ""
109
+ for _ , condition := range node .Status .Conditions {
110
+ switch condition .Type {
111
+ case corev1 .NodeMemoryPressure , corev1 .NodeDiskPressure , corev1 .NodePIDPressure :
112
+ if condition .Status != corev1 .ConditionFalse {
113
+ message += fmt .Sprintf ("Node condition %s is %s" , condition .Type , condition .Status ) + ". "
114
+ if condition .Status == corev1 .ConditionUnknown {
115
+ unknownStatus ++
116
+ continue
117
+ }
118
+ semanticallyFalseStatus ++
119
+ }
120
+ case corev1 .NodeReady :
121
+ if condition .Status != corev1 .ConditionTrue {
122
+ message += fmt .Sprintf ("Node condition %s is %s" , condition .Type , condition .Status ) + ". "
123
+ if condition .Status == corev1 .ConditionUnknown {
124
+ unknownStatus ++
125
+ continue
126
+ }
127
+ semanticallyFalseStatus ++
128
+ }
129
+ }
130
+ }
131
+ if semanticallyFalseStatus > 0 {
132
+ return corev1 .ConditionFalse , message
133
+ }
134
+ if semanticallyFalseStatus + unknownStatus < totalNumOfConditionsChecked {
135
+ return corev1 .ConditionTrue , message
136
+ }
137
+ return corev1 .ConditionUnknown , message
138
+ }
139
+
140
+ func (r * MachineReconciler ) getNode (c client.Reader , providerID * noderefutil.ProviderID ) (* corev1.Node , error ) {
87
141
logger := r .Log .WithValues ("providerID" , providerID )
88
142
89
- nodeList := apicorev1 .NodeList {}
143
+ nodeList := corev1 .NodeList {}
90
144
for {
91
145
if err := c .List (context .TODO (), & nodeList , client .Continue (nodeList .Continue )); err != nil {
92
146
return nil , err
@@ -100,12 +154,7 @@ func (r *MachineReconciler) getNodeReference(c client.Reader, providerID *nodere
100
154
}
101
155
102
156
if providerID .Equals (nodeProviderID ) {
103
- return & apicorev1.ObjectReference {
104
- Kind : node .Kind ,
105
- APIVersion : node .APIVersion ,
106
- Name : node .Name ,
107
- UID : node .UID ,
108
- }, nil
157
+ return & node , nil
109
158
}
110
159
}
111
160
0 commit comments