@@ -54,6 +54,7 @@ const (
	healthCheckPollInterval       = 10 * time.Second
	healthCheckRequestTimeout     = 5 * time.Second
	healthCheckUnhealthyThreshold = 10
+	initialCacheSyncTimeout       = 5 * time.Minute
	clusterCacheControllerName    = "cluster-cache-tracker"
)

@@ -64,9 +65,15 @@ type ClusterCacheTracker struct {
	client client.Client
	scheme *runtime.Scheme

-	lock             sync.RWMutex
+	// clusterAccessorsLock is used to synchronize access to the clusterAccessors map.
+	clusterAccessorsLock sync.RWMutex
+	// clusterAccessors is the map of clusterAccessor by cluster.
	clusterAccessors map[client.ObjectKey]*clusterAccessor
-	indexes          []Index
+	// clusterLock is a per-cluster lock used to serialize per-cluster actions
+	// like creating a client or adding watches.
+	clusterLock *keyedMutex
+
+	indexes []Index

	// controllerPodMetadata is the Pod metadata of the controller using this ClusterCacheTracker.
	// This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set.
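Note: the `keyedMutex` type behind the new `clusterLock` field is not part of this diff. Purely to illustrate the semantics its call sites assume (`newKeyedMutex()` below and `TryLock(cluster)` further down), a minimal per-key try-lock could look like the following sketch; the tree's actual implementation may differ.

```go
// Minimal sketch of a per-key try-lock matching the call sites in this diff.
// This is an assumption for illustration, not the tree's actual keyedMutex.
package remote

import (
	"sync"

	"sigs.k8s.io/controller-runtime/pkg/client"
)

// keyedMutex allows locking on a key. Only one caller can hold the lock for a
// given key at a time; a second caller is rejected instead of blocked.
type keyedMutex struct {
	locksMtx sync.Mutex
	locks    map[client.ObjectKey]struct{}
}

func newKeyedMutex() *keyedMutex {
	return &keyedMutex{locks: map[client.ObjectKey]struct{}{}}
}

// unlock releases a previously acquired key.
type unlock func()

// TryLock locks the key if it is not already locked.
// It returns an unlock func and true on success, nil and false otherwise.
func (k *keyedMutex) TryLock(key client.ObjectKey) (unlock, bool) {
	k.locksMtx.Lock()
	defer k.locksMtx.Unlock()

	// Reject instead of blocking when the key is already held.
	if _, held := k.locks[key]; held {
		return nil, false
	}
	k.locks[key] = struct{}{}

	return func() {
		k.locksMtx.Lock()
		defer k.locksMtx.Unlock()
		delete(k.locks, key)
	}, true
}
```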
@@ -129,16 +136,14 @@ func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOpt
		client:           manager.GetClient(),
		scheme:           manager.GetScheme(),
		clusterAccessors: make(map[client.ObjectKey]*clusterAccessor),
+		clusterLock:      newKeyedMutex(),
		indexes:          options.Indexes,
	}, nil
}

// GetClient returns a cached client for the given cluster.
func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) {
-	t.lock.Lock()
-	defer t.lock.Unlock()
-
-	accessor, err := t.getClusterAccessorLH(ctx, cluster, t.indexes...)
+	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}
@@ -148,10 +153,7 @@ func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.Obje

// GetRESTConfig returns a cached REST config for the given cluster.
func (t *ClusterCacheTracker) GetRESTConfig(ctc context.Context, cluster client.ObjectKey) (*rest.Config, error) {
-	t.lock.Lock()
-	defer t.lock.Unlock()
-
-	accessor, err := t.getClusterAccessorLH(ctc, cluster, t.indexes...)
+	accessor, err := t.getClusterAccessor(ctc, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}
@@ -169,28 +171,63 @@ type clusterAccessor struct {

// clusterAccessorExists returns true if a clusterAccessor exists for cluster.
func (t *ClusterCacheTracker) clusterAccessorExists(cluster client.ObjectKey) bool {
-	t.lock.RLock()
-	defer t.lock.RUnlock()
+	t.clusterAccessorsLock.RLock()
+	defer t.clusterAccessorsLock.RUnlock()

	_, exists := t.clusterAccessors[cluster]
	return exists
}

-// getClusterAccessorLH first tries to return an already-created clusterAccessor for cluster, falling back to creating a
-// new clusterAccessor if needed. Note, this method requires t.lock to already be held (LH=lock held).
-func (t *ClusterCacheTracker) getClusterAccessorLH(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
-	a := t.clusterAccessors[cluster]
+// getClusterAccessor returns a clusterAccessor for cluster.
+// It first tries to return an already-created clusterAccessor.
+// It then falls back to creating a new clusterAccessor if needed.
+// If another goroutine is already trying to create a clusterAccessor
+// for the same cluster, an error is returned.
+func (t *ClusterCacheTracker) getClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
+	log := ctrl.LoggerFrom(ctx, "cluster", klog.KRef(cluster.Namespace, cluster.Name))
+
+	loadExistingAccessor := func() *clusterAccessor {
+		t.clusterAccessorsLock.RLock()
+		defer t.clusterAccessorsLock.RUnlock()
+		return t.clusterAccessors[cluster]
+	}
+	storeAccessor := func(a *clusterAccessor) {
+		t.clusterAccessorsLock.Lock()
+		defer t.clusterAccessorsLock.Unlock()
+		t.clusterAccessors[cluster] = a
+	}
+
+	// If the clusterAccessor already exists, return early.
+	a := loadExistingAccessor()
	if a != nil {
		return a, nil
	}

-	a, err := t.newClusterAccessor(ctx, cluster, indexes...)
-	if err != nil {
-		return nil, errors.Wrap(err, "error creating client and cache for remote cluster")
+	// The clusterAccessor doesn't exist yet, so we might have to initialize one.
+	// Lock on the cluster to ensure only one clusterAccessor is initialized
+	// for the cluster at a time.
+	// Return an error if another goroutine is already trying to create a clusterAccessor.
+	unlockCluster, ok := t.clusterLock.TryLock(cluster)
+	if !ok {
+		return nil, errors.Errorf("error creating new cluster accessor: another goroutine is already trying to create the cluster accessor for this cluster")
	}
+	defer unlockCluster()

-	t.clusterAccessors[cluster] = a
+	// While we were waiting for the cluster lock, a different goroutine might have
+	// initialized the clusterAccessor for this cluster successfully. If so, return it.
+	a = loadExistingAccessor()
+	if a != nil {
+		return a, nil
+	}

+	// We are the goroutine that has to initialize the clusterAccessor.
+	log.V(4).Info("Creating new cluster accessor")
+	a, err := t.newClusterAccessor(ctx, cluster, indexes...)
+	if err != nil {
+		return nil, errors.Wrap(err, "error creating new cluster accessor")
+	}
+	log.V(4).Info("Storing new cluster accessor")
+	storeAccessor(a)
	return a, nil
}

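The flow above is a non-blocking variant of double-checked locking: a read-locked map lookup, a per-cluster TryLock, then a second lookup before the expensive accessor creation. The second lookup matters because another goroutine may have created and stored the accessor between our first lookup and our acquisition of the cluster lock. The following self-contained sketch (all names invented for the example) shows the same shape:

```go
// Illustrative sketch of the double-checked locking pattern used by
// getClusterAccessor: check the map, try-lock the key, re-check, then create.
package main

import (
	"fmt"
	"sync"
)

type tracker struct {
	mu    sync.RWMutex
	cache map[string]string
	busy  sync.Map // per-key "lock"; LoadOrStore acts like TryLock
}

func (t *tracker) get(key string) (string, error) {
	// Fast path: return an existing entry under the read lock.
	t.mu.RLock()
	v, ok := t.cache[key]
	t.mu.RUnlock()
	if ok {
		return v, nil
	}

	// Try to become the goroutine that initializes this key.
	if _, loaded := t.busy.LoadOrStore(key, struct{}{}); loaded {
		return "", fmt.Errorf("another goroutine is already initializing %q", key)
	}
	defer t.busy.Delete(key)

	// Re-check: someone may have stored the entry while we raced for the key.
	t.mu.RLock()
	v, ok = t.cache[key]
	t.mu.RUnlock()
	if ok {
		return v, nil
	}

	// Expensive initialization happens here, outside the map lock,
	// so lookups for other keys are never blocked.
	v = "accessor-for-" + key

	t.mu.Lock()
	t.cache[key] = v
	t.mu.Unlock()
	return v, nil
}

func main() {
	t := &tracker{cache: map[string]string{}}
	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			v, err := t.get("cluster-a")
			fmt.Println(v, err) // at most one goroutine creates; others may error and retry
		}()
	}
	wg.Wait()
}
```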
@@ -265,7 +302,12 @@ func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster cl

	// Start the cache!!!
	go cache.Start(cacheCtx) //nolint:errcheck
-	if !cache.WaitForCacheSync(cacheCtx) {
+
+	// Wait until the cache is initially synced
+	cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, initialCacheSyncTimeout)
+	defer cacheSyncCtxCancel()
+	if !cache.WaitForCacheSync(cacheSyncCtx) {
+		cache.Stop()
		return nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
	}

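Previously WaitForCacheSync was called with cacheCtx, which is only cancelled when the accessor is deleted, so a cache that never synced (for example an unreachable workload cluster) could block the caller indefinitely. The new code bounds the wait with initialCacheSyncTimeout and stops the cache on failure. A self-contained sketch of the same bounded-wait pattern, where the stub waitForSync stands in for cache.WaitForCacheSync:

```go
// Minimal sketch of a bounded wait: derive a child context with a timeout so
// a cache that never syncs cannot block forever. The stub cache is invented.
package main

import (
	"context"
	"fmt"
	"time"
)

const initialCacheSyncTimeout = 5 * time.Minute

// waitForSync blocks until the sync signal arrives or the context is done,
// returning false on timeout or cancellation.
func waitForSync(ctx context.Context, synced <-chan struct{}) bool {
	select {
	case <-ctx.Done():
		return false
	case <-synced:
		return true
	}
}

func main() {
	synced := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond) // simulate informers catching up
		close(synced)
	}()

	ctx, cancel := context.WithTimeout(context.Background(), initialCacheSyncTimeout)
	defer cancel()
	if !waitForSync(ctx, synced) {
		fmt.Println("failed waiting for cache to sync:", ctx.Err())
		return
	}
	fmt.Println("cache synced")
}
```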
@@ -337,8 +379,8 @@ func (t *ClusterCacheTracker) createClient(config *rest.Config, cluster client.O

// deleteAccessor stops a clusterAccessor's cache and removes the clusterAccessor from the tracker.
func (t *ClusterCacheTracker) deleteAccessor(_ context.Context, cluster client.ObjectKey) {
-	t.lock.Lock()
-	defer t.lock.Unlock()
+	t.clusterAccessorsLock.Lock()
+	defer t.clusterAccessorsLock.Unlock()

	a, exists := t.clusterAccessors[cluster]
	if !exists {
@@ -387,14 +429,18 @@ func (t *ClusterCacheTracker) Watch(ctx context.Context, input WatchInput) error
		return errors.New("input.Name is required")
	}

-	t.lock.Lock()
-	defer t.lock.Unlock()
-
-	a, err := t.getClusterAccessorLH(ctx, input.Cluster, t.indexes...)
+	a, err := t.getClusterAccessor(ctx, input.Cluster, t.indexes...)
	if err != nil {
		return err
	}

+	// We have to lock the cluster so that the watch is not created multiple times in parallel.
+	unlock, ok := t.clusterLock.TryLock(input.Cluster)
+	if !ok {
+		return errors.Errorf("failed to add watch: another goroutine is already trying to create the cluster accessor")
+	}
+	defer unlock()
+
	if a.watches.Has(input.Name) {
		t.log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
		return nil
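With TryLock, Watch now fails fast instead of blocking when another goroutine is creating the accessor for the same cluster; callers are expected to return the error and let the controller requeue with backoff. A hedged usage sketch, modeled on how cluster-api controllers typically call Tracker.Watch; treat the reconciler shape and field values as assumptions, not code from this PR:

```go
// Usage sketch: propagate Watch errors so controller-runtime requeues the
// reconcile instead of blocking while another goroutine holds the cluster lock.
package controllers

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/controllers/remote"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

// MachineReconciler is sketched with only the fields this example needs.
type MachineReconciler struct {
	Tracker    *remote.ClusterCacheTracker
	controller controller.Controller
}

func (r *MachineReconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.Cluster) error {
	// If the per-cluster lock is currently held elsewhere, Watch returns an
	// error; returning it lets the reconciler retry on the next requeue.
	return r.Tracker.Watch(ctx, remote.WatchInput{
		Name:    "machine-watchNodes",
		Cluster: util.ObjectKey(cluster),
		Watcher: r.controller,
		Kind:    &corev1.Node{},
		EventHandler: handler.EnqueueRequestsFromMapFunc(func(o client.Object) []reconcile.Request {
			// Mapping Node events back to Machines is elided in this sketch.
			return nil
		}),
	})
}
```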
@@ -505,7 +551,7 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
	// An error returned implies the health check has failed a sufficient number of
	// times for the cluster to be considered unhealthy
	// NB. we are ignoring ErrWaitTimeout because this error happens when the channel is close, that in this case
-	// happens when the cache is explicitly stopped.F
+	// happens when the cache is explicitly stopped.
	if err != nil && err != wait.ErrWaitTimeout {
		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
		t.deleteAccessor(ctx, in.cluster)