Skip to content

Commit 20f9b5f

Browse files
authored
Merge pull request #9028 from sbueringer/pr-fix-cct-informer-deadlock
🌱 ClusterCacheTracker: ensure Get/List calls are not getting stuck when apiserver is unreachable
2 parents 7399f5f + 8183c39 commit 20f9b5f

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed

controllers/remote/cluster_cache_tracker.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,13 @@ func (t *ClusterCacheTracker) createClient(ctx context.Context, config *rest.Con
457457
return nil, nil, nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
458458
}
459459

460+
// Wrap the cached client with a client that sets timeouts on all Get and List calls
461+
// If we don't set timeouts here Get and List calls can get stuck if they lazily create a new informer
462+
// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
463+
// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
464+
// It should be reasonable to have Get and List calls timeout within the duration configured in the restConfig.
465+
cachedClient = newClientWithTimeout(cachedClient, config.Timeout)
466+
460467
// Start cluster healthcheck!!!
461468
go t.healthCheckCluster(cacheCtx, &healthCheckInput{
462469
cluster: cluster,
@@ -656,3 +663,33 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
656663
t.deleteAccessor(ctx, in.cluster)
657664
}
658665
}
666+
667+
// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
668+
// If we don't set timeouts here Get and List calls can get stuck if they lazily create a new informer
669+
// and the informer than doesn't sync because the workload cluster apiserver is not reachable.
670+
// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
671+
func newClientWithTimeout(client client.Client, timeout time.Duration) client.Client {
672+
return clientWithTimeout{
673+
Client: client,
674+
timeout: timeout,
675+
}
676+
}
677+
678+
type clientWithTimeout struct {
679+
client.Client
680+
timeout time.Duration
681+
}
682+
683+
var _ client.Client = &clientWithTimeout{}
684+
685+
func (c clientWithTimeout) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
686+
ctx, cancel := context.WithTimeout(ctx, c.timeout)
687+
defer cancel()
688+
return c.Client.Get(ctx, key, obj, opts...)
689+
}
690+
691+
func (c clientWithTimeout) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
692+
ctx, cancel := context.WithTimeout(ctx, c.timeout)
693+
defer cancel()
694+
return c.Client.List(ctx, list, opts...)
695+
}

0 commit comments

Comments
 (0)