@@ -18,6 +18,7 @@ package appwrapper
18
18
19
19
import (
20
20
"context"
21
+ "sync"
21
22
22
23
v1 "k8s.io/api/core/v1"
23
24
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -38,8 +39,11 @@ type NodeHealthMonitor struct {
38
39
Config * config.AppWrapperConfig
39
40
}
40
41
41
- // unhealthyNodes is a mapping from Node names to a set of resources that Autopilot has labeled as unhealthy on that Node
42
- var unhealthyNodes = make (map [string ]sets.Set [string ])
42
+ var (
43
+ // unhealthyNodes maps each Node name to the set of resources that Autopilot has labeled as unhealthy on that Node; Reconcile removes a Node's entry once no resources are flagged, so only Nodes with at least one flagged resource appear here.
44
+ unhealthyNodes = make (map [string ]sets.Set [string ])
45
+ unhealthyNodesMutex sync.RWMutex // guards unhealthyNodes; Reconcile holds the write lock while it mutates the map
46
+ )
43
47
44
48
// permission to watch nodes
45
49
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
@@ -55,8 +59,6 @@ func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ct
55
59
return ctrl.Result {}, nil
56
60
}
57
61
58
- log .FromContext (ctx ).V (2 ).Info ("Reconcilling" , "node" , req .NamespacedName )
59
-
60
62
flaggedResources := make (sets.Set [string ])
61
63
for key , value := range node .GetLabels () {
62
64
for resource , apLabels := range r .Config .Autopilot .ResourceUnhealthyConfig {
@@ -66,22 +68,27 @@ func (r *NodeHealthMonitor) Reconcile(ctx context.Context, req ctrl.Request) (ct
66
68
}
67
69
}
68
70
69
- hadEntries := len (unhealthyNodes ) > 0
70
-
71
- if len (flaggedResources ) == 0 {
72
- delete (unhealthyNodes , node .GetName ())
73
- } else {
71
+ nodeChanged := false
72
+ unhealthyNodesMutex .Lock () // BEGIN CRITICAL SECTION
73
+ if priorEntry , ok := unhealthyNodes [node .GetName ()]; ok {
74
+ if len (flaggedResources ) == 0 {
75
+ delete (unhealthyNodes , node .GetName ())
76
+ nodeChanged = true
77
+ } else if ! priorEntry .Equal (flaggedResources ) {
78
+ unhealthyNodes [node .GetName ()] = flaggedResources
79
+ nodeChanged = true
80
+ }
81
+ } else if len (flaggedResources ) > 0 {
74
82
unhealthyNodes [node .GetName ()] = flaggedResources
83
+ nodeChanged = true
75
84
}
85
+ numUnhealthy := len (unhealthyNodes )
86
+ unhealthyNodesMutex .Unlock () // END CRITICAL SECTION
76
87
77
- if len (unhealthyNodes ) == 0 {
78
- if hadEntries {
79
- log .FromContext (ctx ).Info ("All nodes now healthy" )
80
- } else {
81
- log .FromContext (ctx ).V (2 ).Info ("All nodes now healthy" )
82
- }
83
- } else {
84
- log .FromContext (ctx ).Info ("Some nodes unhealthy" , "number" , len (unhealthyNodes ), "details" , unhealthyNodes )
88
+ if nodeChanged {
89
+ // This unsynchronized read of unhealthyNodes for logging purposes is safe because this method
90
+ // is the only writer to the map and the controller runtime is configured to not allow concurrent execution of this method.
91
+ log .FromContext (ctx ).Info ("Updated node health information" , "Number Unhealthy Nodes" , numUnhealthy , "Unhealthy Resource Details" , unhealthyNodes )
85
92
}
86
93
87
94
return ctrl.Result {}, nil
0 commit comments