Skip to content

Commit d97e3c8

Browse files
authored
Bug 1995913: Degraded status in the OCM controller (#486)
* Mark as degraded when the pulling of SCA certs failed * Degraded with exponential backoff * Update the error cases and add some logging * Update the backoff a little * Fix * Remove unnecessary block
1 parent db300fd commit d97e3c8

File tree

6 files changed

+106
-17
lines changed

6 files changed

+106
-17
lines changed

pkg/controller/operator.go

+16-5
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,11 @@ func (s *Operator) Run(ctx context.Context, controller *controllercmd.Controller
168168
statusReporter.AddSources(reportGatherer)
169169
go reportGatherer.Run(ctx)
170170

171-
runOCMController(ctx, configClient, kubeClient, configObserver, insightsClient)
171+
ocmController := initiateOCMController(ctx, gatherKubeConfig, kubeClient, configObserver, insightsClient)
172+
if ocmController != nil {
173+
statusReporter.AddSources(ocmController)
174+
go ocmController.Run()
175+
}
172176
klog.Warning("started")
173177

174178
<-ctx.Done()
@@ -205,20 +209,27 @@ func isRunning(ctx context.Context, kubeConfig *rest.Config) wait.ConditionFunc
205209
}
206210
}
207211

208-
// runOCMController checks the "InsightsOperatorPullingSCA" feature and if it's enabled then run the OCM controller
209-
func runOCMController(ctx context.Context, configClient *configv1client.ConfigV1Client,
210-
kubeClient *kubernetes.Clientset, configObserver *configobserver.Controller, insightsClient *insightsclient.Client) {
212+
// initiateOCMController checks the "InsightsOperatorPullingSCA" feature and, if it's enabled, creates and returns the OCM controller; otherwise it returns nil
213+
func initiateOCMController(ctx context.Context, kubeConfig *rest.Config,
214+
kubeClient *kubernetes.Clientset, configObserver *configobserver.Controller, insightsClient *insightsclient.Client) *ocm.Controller {
215+
configClient, err := configv1client.NewForConfig(kubeConfig)
216+
if err != nil {
217+
klog.Error(err)
218+
return nil
219+
}
211220
ocmEnabled, err := featureEnabled(ctx, configClient, "InsightsOperatorPullingSCA")
212221
if err != nil {
213222
klog.Errorf("Pulling of SCA certs from the OCM is disabled. Unable to get cluster FeatureGate: %v", err)
223+
return nil
214224
}
215225
if ocmEnabled {
216226
klog.Info("Pulling of SCA certs from the OCM is enabled.")
217227
// OCM controller periodically checks and pulls data from the OCM API
218228
// the data is exposed in the OpenShift API
219229
ocmController := ocm.New(ctx, kubeClient.CoreV1(), configObserver, insightsClient)
220-
go ocmController.Run()
230+
return ocmController
221231
}
232+
return nil
222233
}
223234

224235
// featureEnabled checks if the feature is enabled in the "cluster" FeatureGate

pkg/controller/status/status.go

+7
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ const (
3636
uploadFailuresCountThreshold = 5
3737
// GatherFailuresCountThreshold defines how many gatherings can fail in a row before we report Degraded
3838
GatherFailuresCountThreshold = 5
39+
// OCMAPIFailureCountThreshold defines how many unsuccessful responses from the OCM API in a row are tolerated
40+
// before the operator is marked as Degraded
41+
OCMAPIFailureCountThreshold = 5
3942
)
4043

4144
type Reported struct {
@@ -175,6 +178,10 @@ func (c *Controller) merge(existing *configv1.ClusterOperator) *configv1.Cluster
175178
klog.V(4).Info("Failed to download Insights report")
176179
downloadReason = summary.Reason
177180
downloadMessage = summary.Message
181+
} else if summary.Operation == controllerstatus.PullingSCACerts {
182+
klog.V(4).Infof("Failed to download the SCA certs within the threshold %d with exponential backoff. Marking as degraded.",
183+
OCMAPIFailureCountThreshold)
184+
degradingFailure = true
178185
}
179186

180187
if degradingFailure {

pkg/controllerstatus/controllerstatus.go

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ const (
2020
Uploading Operation = "Uploading"
2121
// GatheringReport specific for gathering the report from the cluster
2222
GatheringReport Operation = "GatheringReport"
23+
// PullingSCACerts is the operation specific to pulling the SCA certs data from the OCM API
24+
PullingSCACerts Operation = "PullingSCACerts"
2325
)
2426

2527
// Summary represents the status summary of an Operation

pkg/insights/insightsclient/insightsclient.go

+11-8
Original file line numberDiff line numberDiff line change
@@ -63,19 +63,19 @@ type Source struct {
6363
Contents io.Reader
6464
}
6565

66-
// InsightsError is helper error type to have HTTP error status code
67-
type InsightsError struct {
66+
// HttpError is a helper error type that carries the HTTP status code alongside the underlying error
67+
type HttpError struct {
6868
Err error
6969
StatusCode int
7070
}
7171

72-
func (e InsightsError) Error() string {
72+
func (e HttpError) Error() string {
7373
return e.Err.Error()
7474
}
7575

76-
func IsInsightsError(err error) bool {
76+
func IsHttpError(err error) bool {
7777
switch err.(type) {
78-
case InsightsError:
78+
case HttpError:
7979
return true
8080
default:
8181
return false
@@ -342,7 +342,7 @@ func (c Client) RecvReport(ctx context.Context, endpoint string) (*io.ReadCloser
342342
if len(body) > 1024 {
343343
body = body[:1024]
344344
}
345-
notFoundErr := InsightsError{
345+
notFoundErr := HttpError{
346346
StatusCode: resp.StatusCode,
347347
Err: fmt.Errorf("not found: %s (request=%s): %s", resp.Request.URL, requestID, string(body)),
348348
}
@@ -391,7 +391,7 @@ func (c Client) RecvSCACerts(ctx context.Context, endpoint string) ([]byte, erro
391391
return nil, fmt.Errorf("unable to retrieve SCA certs data from %s: %v", endpoint, err)
392392
}
393393

394-
if res.StatusCode >= 300 || res.StatusCode < 200 {
394+
if res.StatusCode > 399 || res.StatusCode < 200 {
395395
return nil, ocmErrorMessage(res.Request.URL, res)
396396
}
397397

@@ -416,7 +416,10 @@ func ocmErrorMessage(url *url.URL, r *http.Response) error {
416416
if r.StatusCode == http.StatusUnauthorized || r.StatusCode == http.StatusForbidden {
417417
return authorizer.Error{Err: err}
418418
}
419-
return err
419+
return HttpError{
420+
Err: err,
421+
StatusCode: r.StatusCode,
422+
}
420423
}
421424

422425
var (

pkg/insights/insightsreport/insightsreport.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,9 @@ func (c *Controller) PullSmartProxy() (bool, error) {
9393
} else if err == insightsclient.ErrWaitingForVersion {
9494
klog.Error(err)
9595
return false, err
96-
} else if insightsclient.IsInsightsError(err) {
96+
} else if insightsclient.IsHttpError(err) {
9797

98-
ie := err.(insightsclient.InsightsError)
98+
ie := err.(insightsclient.HttpError)
9999
klog.Errorf("Unexpected error retrieving the report: %s", ie)
100100
// if there's a 404 response then retry
101101
if ie.StatusCode == http.StatusNotFound {

pkg/ocm/ocm.go

+68-2
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,18 @@ package ocm
33
import (
44
"context"
55
"encoding/json"
6+
"fmt"
7+
"net/http"
68
"time"
79

810
"github.com/openshift/insights-operator/pkg/config"
11+
"github.com/openshift/insights-operator/pkg/controller/status"
12+
"github.com/openshift/insights-operator/pkg/controllerstatus"
913
"github.com/openshift/insights-operator/pkg/insights/insightsclient"
1014
v1 "k8s.io/api/core/v1"
1115
"k8s.io/apimachinery/pkg/api/errors"
1216
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
17+
"k8s.io/apimachinery/pkg/util/wait"
1318
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
1419
"k8s.io/klog/v2"
1520
)
@@ -21,6 +26,7 @@ const (
2126

2227
// Controller holds all the required resources to be able to communicate with OCM API
2328
type Controller struct {
29+
controllerstatus.Simple
2430
coreClient corev1client.CoreV1Interface
2531
ctx context.Context
2632
configurator Configurator
@@ -45,6 +51,7 @@ type ScaResponse struct {
4551
func New(ctx context.Context, coreClient corev1client.CoreV1Interface, configurator Configurator,
4652
insightsClient *insightsclient.Client) *Controller {
4753
return &Controller{
54+
Simple: controllerstatus.Simple{Name: "ocmcontroller"},
4855
coreClient: coreClient,
4956
ctx: ctx,
5057
configurator: configurator,
@@ -79,9 +86,25 @@ func (c *Controller) Run() {
7986
}
8087

8188
func (c *Controller) requestDataAndCheckSecret(endpoint string) {
82-
data, err := c.client.RecvSCACerts(c.ctx, endpoint)
89+
data, err := c.requestSCAWithExpBackoff(endpoint)
8390
if err != nil {
84-
klog.Errorf("Failed to retrieve data: %v", err)
91+
// in case of any error other than 404 mark the operator as degraded
92+
c.Simple.UpdateStatus(controllerstatus.Summary{
93+
Operation: controllerstatus.PullingSCACerts,
94+
Reason: "FailedToPullSCACerts",
95+
Message: fmt.Sprintf("Failed to pull SCA certs from %s: %v", endpoint, err),
96+
})
97+
return
98+
}
99+
// handle the case of no data and no error, e.g. HTTP 404 (a non-HTTP request error also ends up here)
100+
if len(data) == 0 {
101+
msg := fmt.Sprintf("Received no SCA certs from the %s. Please check if it's enabled for your organization.", endpoint)
102+
klog.Info(msg)
103+
c.Simple.UpdateStatus(controllerstatus.Summary{
104+
Operation: controllerstatus.PullingSCACerts,
105+
Message: msg,
106+
Healthy: true,
107+
})
85108
return
86109
}
87110
var ocmRes ScaResponse
@@ -98,6 +121,11 @@ func (c *Controller) requestDataAndCheckSecret(endpoint string) {
98121
return
99122
}
100123
klog.Infof("%s secret successfully updated", secretName)
124+
c.Simple.UpdateStatus(controllerstatus.Summary{
125+
Operation: controllerstatus.PullingSCACerts,
126+
Message: fmt.Sprintf("SCA certs successfully updated in the %s secret", secretName),
127+
Healthy: true,
128+
})
101129
}
102130

103131
// checkSecret checks "etc-pki-entitlement" secret in the "openshift-config-managed" namespace.
@@ -156,3 +184,41 @@ func (c *Controller) updateSecret(s *v1.Secret, ocmData *ScaResponse) (*v1.Secre
156184
}
157185
return s, nil
158186
}
187+
188+
// requestSCAWithExpBackoff queries OCM API with exponential backoff and returns
189+
// an error only in case of an HTTP error other than 404 received from the OCM API.
190+
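// The returned data can still be empty when the OCM API responds with HTTP 404 or when the request fails with a non-HTTP error; the retry loop stops without reporting an error in both cases.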
191+
func (c *Controller) requestSCAWithExpBackoff(endpoint string) ([]byte, error) {
192+
bo := wait.Backoff{
193+
Duration: c.configurator.Config().OCMConfig.Interval / 32, // 15 min by default
194+
Factor: 2,
195+
Jitter: 0,
196+
Steps: status.OCMAPIFailureCountThreshold,
197+
Cap: c.configurator.Config().OCMConfig.Interval,
198+
}
199+
var data []byte
200+
err := wait.ExponentialBackoff(bo, func() (bool, error) {
201+
var err error
202+
data, err = c.client.RecvSCACerts(c.ctx, endpoint)
203+
if err != nil {
204+
// don't try again in case it's not an HTTP error - it could mean we're in disconnected env
205+
if !insightsclient.IsHttpError(err) {
206+
klog.Errorf("Failed to request the SCA certs: %v", err)
207+
return true, nil
208+
}
209+
httpErr := err.(insightsclient.HttpError)
210+
// don't try again in case of 404
211+
if httpErr.StatusCode == http.StatusNotFound {
212+
return true, nil
213+
}
214+
klog.Errorf("%v. Trying again in %s", httpErr, bo.Step())
215+
return false, nil
216+
}
217+
return true, nil
218+
})
219+
// exponential backoff timed out -> error
220+
if err != nil {
221+
return nil, fmt.Errorf("timed out waiting for the successful response from %s", endpoint)
222+
}
223+
return data, nil
224+
}

0 commit comments

Comments
 (0)