Retry initialization error conditions (#2979)

tmshort · web-flow · commit e908cfc6b645 · 2023-07-05T10:39:24.000-04:00
When the API server is flaky (e.g. during a cluster install), it is
possible for some of the OLM initialization to fail. When this happens,
OLM gets into a bad state (e.g. a monitoring goroutine terminates)
and can't recover without a restart.

There were at least two places I found where a retry mechanism is
needed to handle initialization errors. This was as far as I peeled the
onion. It's not an exponential backoff retry, but a 1 minute retry
interval should be sufficient (no other backoffs are exponential).

The ServerVersion check retries only once, after a one-minute wait. This
required fixing a unit test to take the retry into account.

Signed-off-by: Todd Short <todd.short@me.com>
diff --git a/pkg/lib/operatorstatus/monitor.go b/pkg/lib/operatorstatus/monitor.go
@@ -129,9 +129,18 @@ func (m *monitor) Run(stopCh <-chan struct{}) {
 	m.logger.Infof("initializing clusteroperator resource(s) for %s", m.names)
 
 	for _, name := range m.names {
-		if err := m.init(name); err != nil {
-			m.logger.Errorf("initialization error - %v", err)
-			break
+		for {
+			if err := m.init(name); err != nil {
+				m.logger.Errorf("initialization error - %v", err)
+			} else {
+				m.logger.Infof("initialized cluster resource - %s", name)
+				break
+			}
+			select {
+			case <-time.After(defaultProbeInterval):
+			case <-stopCh:
+				return
+			}
 		}
 	}
 
diff --git a/pkg/lib/queueinformer/queueinformer_operator.go b/pkg/lib/queueinformer/queueinformer_operator.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"sync"
+	"time"
 
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubestate"
 	"github.com/pkg/errors"
@@ -13,6 +14,10 @@ import (
 	"k8s.io/client-go/tools/cache"
 )
 
+const (
+	defaultServerVersionInterval = 1 * time.Minute
+)
+
 // ExtensibleOperator describes a Reconciler that can be extended with additional informers and queue informers
 type ExtensibleOperator interface {
 	// RegisterQueueInformer registers the given QueueInformer with the Operator.
@@ -194,6 +199,16 @@ func (o *operator) start(ctx context.Context) error {
 	go func() {
 		defer close(errs)
 		v, err := o.serverVersion.ServerVersion()
+		if err == nil {
+			o.logger.Infof("connection established. cluster-version: %v", v)
+			return
+		}
+		select {
+		case <-time.After(defaultServerVersionInterval):
+		case <-ctx.Done():
+			return
+		}
+		v, err = o.serverVersion.ServerVersion()
 		if err != nil {
 			select {
 			case errs <- errors.Wrap(err, "communicating with server failed"):
diff --git a/pkg/lib/queueinformer/queueinformer_operator_test.go b/pkg/lib/queueinformer/queueinformer_operator_test.go
@@ -78,7 +78,7 @@ func TestOperatorRunChannelClosure(t *testing.T) {
 
 			o.Run(ctx)
 
-			timeout := time.After(time.Second)
+			timeout := time.After(2 * defaultServerVersionInterval)
 			for n, ch := range map[string]<-chan struct{}{
 				"ready": o.Ready(),
 				"done":  o.Done(),