Skip to content

Commit e908cfc

Browse files
authored
Retry initialization error conditions (#2979)
When the API server is flaky (e.g. during a cluster install), some of the OLM initialization can fail. When this happens, OLM gets into a bad state (e.g. a monitoring goroutine terminates) and can't recover without a restart. I found at least two places where a retry mechanism is needed to handle initialization errors; this was as far as I peeled the onion. The retry does not use exponential backoff, but a 1-minute retry interval should be sufficient (none of the other backoffs are exponential). The ServerVersion check retries only once, after waiting one minute. This required fixing a unit test to take the retry into account. Signed-off-by: Todd Short <[email protected]>
1 parent 12217d1 commit e908cfc

File tree

3 files changed

+28
-4
lines changed

3 files changed

+28
-4
lines changed

pkg/lib/operatorstatus/monitor.go

+12-3
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,18 @@ func (m *monitor) Run(stopCh <-chan struct{}) {
129129
m.logger.Infof("initializing clusteroperator resource(s) for %s", m.names)
130130

131131
for _, name := range m.names {
132-
if err := m.init(name); err != nil {
133-
m.logger.Errorf("initialization error - %v", err)
134-
break
132+
for {
133+
if err := m.init(name); err != nil {
134+
m.logger.Errorf("initialization error - %v", err)
135+
} else {
136+
m.logger.Infof("initialized cluster resource - %s", name)
137+
break
138+
}
139+
select {
140+
case <-time.After(defaultProbeInterval):
141+
case <-stopCh:
142+
return
143+
}
135144
}
136145
}
137146

pkg/lib/queueinformer/queueinformer_operator.go

+15
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"sync"
7+
"time"
78

89
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubestate"
910
"github.com/pkg/errors"
@@ -13,6 +14,10 @@ import (
1314
"k8s.io/client-go/tools/cache"
1415
)
1516

17+
const (
18+
defaultServerVersionInterval = 1 * time.Minute
19+
)
20+
1621
// ExtensibleOperator describes a Reconciler that can be extended with additional informers and queue informers
1722
type ExtensibleOperator interface {
1823
// RegisterQueueInformer registers the given QueueInformer with the Operator.
@@ -194,6 +199,16 @@ func (o *operator) start(ctx context.Context) error {
194199
go func() {
195200
defer close(errs)
196201
v, err := o.serverVersion.ServerVersion()
202+
if err == nil {
203+
o.logger.Infof("connection established. cluster-version: %v", v)
204+
return
205+
}
206+
select {
207+
case <-time.After(defaultServerVersionInterval):
208+
case <-ctx.Done():
209+
return
210+
}
211+
v, err = o.serverVersion.ServerVersion()
197212
if err != nil {
198213
select {
199214
case errs <- errors.Wrap(err, "communicating with server failed"):

pkg/lib/queueinformer/queueinformer_operator_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func TestOperatorRunChannelClosure(t *testing.T) {
7878

7979
o.Run(ctx)
8080

81-
timeout := time.After(time.Second)
81+
timeout := time.After(2 * defaultServerVersionInterval)
8282
for n, ch := range map[string]<-chan struct{}{
8383
"ready": o.Ready(),
8484
"done": o.Done(),

0 commit comments

Comments
 (0)