Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

create new permanent clusteroperator conditions for SCA & #621

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion manifests/08-prometheus_rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
description: 'Simple content access (SCA) is not enabled. Once enabled, Insights Operator can automatically import the SCA certificates from Red Hat OpenShift Cluster Manager making it easier to use the content provided by your Red Hat subscriptions when creating container images. See https://docs.openshift.com/container-platform/latest/cicd/builds/running-entitled-builds.html for more information.'
summary: Simple content access certificates are not available.
expr: |
max_over_time(cluster_operator_conditions{name="insights", condition="SCANotAvailable", reason="NotFound"}[5m]) == 1
max_over_time(cluster_operator_conditions{name="insights", condition="SCAAvailable", reason="NotFound"}[5m]) == 0
for: 5m
labels:
severity: info
Expand Down
10 changes: 5 additions & 5 deletions pkg/controller/periodic/periodic.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ type Controller struct {
configurator configobserver.Configurator
recorder recorder.FlushInterface
gatherers []gatherers.Interface
statuses map[string]*controllerstatus.Simple
statuses map[string]controllerstatus.StatusController
anonymizer *anonymization.Anonymizer
}

Expand All @@ -36,11 +36,11 @@ func New(
listGatherers []gatherers.Interface,
anonymizer *anonymization.Anonymizer,
) *Controller {
statuses := make(map[string]*controllerstatus.Simple)
statuses := make(map[string]controllerstatus.StatusController)

for _, gatherer := range listGatherers {
gathererName := gatherer.GetName()
statuses[gathererName] = &controllerstatus.Simple{Name: fmt.Sprintf("periodic-%s", gathererName)}
statuses[gathererName] = controllerstatus.New(fmt.Sprintf("periodic-%s", gathererName))
}

return &Controller{
Expand All @@ -52,13 +52,13 @@ func New(
}
}

func (c *Controller) Sources() []controllerstatus.Interface {
func (c *Controller) Sources() []controllerstatus.StatusController {
keys := make([]string, 0, len(c.statuses))
for key := range c.statuses {
keys = append(keys, key)
}
sort.Strings(keys)
sources := make([]controllerstatus.Interface, 0, len(keys))
sources := make([]controllerstatus.StatusController, 0, len(keys))
for _, key := range keys {
sources = append(sources, c.statuses[key])
}
Expand Down
23 changes: 17 additions & 6 deletions pkg/controller/status/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,10 @@ const (
InsightsUploadDegraded configv1.ClusterStatusConditionType = "UploadDegraded"
// InsightsDownloadDegraded defines the condition type (when set to True) when the Insights report can't be successfully downloaded
InsightsDownloadDegraded configv1.ClusterStatusConditionType = "InsightsDownloadDegraded"
// SCANotAvailable is a condition type providing info about unsuccessful SCA pull attempt from the OCM API
SCANotAvailable configv1.ClusterStatusConditionType = "SCANotAvailable"
// ClusterTransferFailed is a condition type providing info about unsuccessful pull attempt of the ClusterTransfer from the OCM API
// or unsuccessful pull-secret update
ClusterTransferFailed configv1.ClusterStatusConditionType = "ClusterTransferFailed"
// ClusterTransferAvailable is a condition type providing info about ClusterTransfer controller status
ClusterTransferAvailable configv1.ClusterStatusConditionType = "ClusterTransferAvailable"
// SCAAvailable is a condition type providing info about SCA controller status
SCAAvailable configv1.ClusterStatusConditionType = "SCAAvailable"
)

type conditionsMap map[configv1.ClusterStatusConditionType]configv1.ClusterOperatorStatusCondition
Expand All @@ -26,7 +25,7 @@ type conditions struct {
}

func newConditions(cos *configv1.ClusterOperatorStatus, time metav1.Time) *conditions {
entries := map[configv1.ClusterStatusConditionType]configv1.ClusterOperatorStatusCondition{
entries := map[configv1.ClusterStatusConditionType]configv1.ClusterOperatorStatusCondition{ // nolint: dupl
configv1.OperatorAvailable: {
Type: configv1.OperatorAvailable,
Status: configv1.ConditionUnknown,
Expand All @@ -45,6 +44,18 @@ func newConditions(cos *configv1.ClusterOperatorStatus, time metav1.Time) *condi
LastTransitionTime: time,
Reason: "",
},
SCAAvailable: {
Type: SCAAvailable,
Status: configv1.ConditionUnknown,
LastTransitionTime: time,
Reason: "",
},
ClusterTransferAvailable: {
Type: ClusterTransferAvailable,
Status: configv1.ConditionUnknown,
LastTransitionTime: time,
Reason: "",
},
}

for _, c := range cos.Conditions {
Expand Down
28 changes: 26 additions & 2 deletions pkg/controller/status/conditions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ func Test_conditions_setCondition(t *testing.T) {
}
}

func Test_newConditions(t *testing.T) {
func Test_newConditions(t *testing.T) { // nolint: funlen
time := metav1.Now()

type args struct {
Expand All @@ -396,7 +396,7 @@ func Test_newConditions(t *testing.T) {
time: time,
},
want: &conditions{
entryMap: map[configv1.ClusterStatusConditionType]configv1.ClusterOperatorStatusCondition{
entryMap: map[configv1.ClusterStatusConditionType]configv1.ClusterOperatorStatusCondition{ // nolint: dupl
configv1.OperatorAvailable: {
Type: configv1.OperatorAvailable,
Status: configv1.ConditionUnknown,
Expand All @@ -415,6 +415,18 @@ func Test_newConditions(t *testing.T) {
LastTransitionTime: time,
Reason: "",
},
SCAAvailable: {
Type: SCAAvailable,
Status: configv1.ConditionUnknown,
LastTransitionTime: time,
Reason: "",
},
ClusterTransferAvailable: {
Type: ClusterTransferAvailable,
Status: configv1.ConditionUnknown,
LastTransitionTime: time,
Reason: "",
},
},
},
},
Expand Down Expand Up @@ -455,6 +467,18 @@ func Test_newConditions(t *testing.T) {
Reason: "degraded reason",
Message: "degraded message",
},
SCAAvailable: {
Type: SCAAvailable,
Status: configv1.ConditionUnknown,
LastTransitionTime: time,
Reason: "",
},
ClusterTransferAvailable: {
Type: ClusterTransferAvailable,
Status: configv1.ConditionUnknown,
LastTransitionTime: time,
Reason: "",
},
},
},
},
Expand Down
85 changes: 54 additions & 31 deletions pkg/controller/status/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@ import (

"github.com/openshift/insights-operator/pkg/config/configobserver"
"github.com/openshift/insights-operator/pkg/controllerstatus"
"github.com/openshift/insights-operator/pkg/ocm"
"github.com/openshift/insights-operator/pkg/ocm/clustertransfer"
"github.com/openshift/insights-operator/pkg/ocm/sca"
)

const (
// Number of consecutive upload failures tolerated before the operator
// starts reporting InsightsUploadDegraded
uploadFailuresCountThreshold = 5
// OCMAPIFailureCountThreshold defines how many consecutive unsuccessful responses from the OCM API are tolerated
// before the operator is marked as Degraded
OCMAPIFailureCountThreshold = 5

insightsAvailableMessage = "Insights works as expected"
)
Expand All @@ -50,7 +50,7 @@ type Controller struct {
statusCh chan struct{}
configurator configobserver.Configurator

sources []controllerstatus.Interface
sources map[string]controllerstatus.StatusController
reported Reported
start time.Time

Expand All @@ -67,6 +67,7 @@ func NewController(client configv1client.ConfigV1Interface, configurator configo
configurator: configurator,
client: client,
namespace: namespace,
sources: make(map[string]controllerstatus.StatusController),
ctrlStatus: newControllerStatus(),
}
return c
Expand Down Expand Up @@ -108,20 +109,29 @@ func (c *Controller) SetLastReportedTime(at time.Time) {

// AddSources adds sources in a thread-safe way.
// A source is used to monitor parts of the operator.
func (c *Controller) AddSources(sources ...controllerstatus.Interface) {
func (c *Controller) AddSources(sources ...controllerstatus.StatusController) {
c.lock.Lock()
defer c.lock.Unlock()
c.sources = append(c.sources, sources...)
for i := range sources {
source := sources[i]
c.sources[source.Name()] = source
}
}

// Sources provides the sources in a thread-safe way.
// A source is used to monitor parts of the operator.
func (c *Controller) Sources() []controllerstatus.Interface {
// Sources returns a snapshot of the registered sources keyed by source name.
// The snapshot is a shallow copy taken while the lock is held, so callers may
// range over it after the lock has been released.
func (c *Controller) Sources() map[string]controllerstatus.StatusController {
	c.lock.Lock()
	defer c.lock.Unlock()

	// Handing out c.sources itself would let callers iterate the live map
	// without the lock while AddSources writes to it, which is a concurrent
	// map iteration and map write. Copy the entries instead.
	snapshot := make(map[string]controllerstatus.StatusController, len(c.sources))
	for name, source := range c.sources {
		snapshot[name] = source
	}
	return snapshot
}

// Source looks up a single registered source by its name while holding the
// controller lock. The result is nil when no source is stored under that name.
func (c *Controller) Source(name string) controllerstatus.StatusController {
	c.lock.Lock()
	source := c.sources[name]
	c.lock.Unlock()
	return source
}

func (c *Controller) merge(clusterOperator *configv1.ClusterOperator) *configv1.ClusterOperator {
// prime the object if it does not exist
if clusterOperator == nil {
Expand All @@ -144,7 +154,7 @@ func (c *Controller) merge(clusterOperator *configv1.ClusterOperator) *configv1.

// cluster operator conditions
cs := newConditions(&clusterOperator.Status, metav1.Time{Time: now})
updateControllerConditions(cs, c.ctrlStatus, isInitializing, lastTransition)
c.updateControllerConditions(cs, isInitializing, lastTransition)
updateControllerConditionsByStatus(cs, c.ctrlStatus, isInitializing)

// all status conditions from conditions to cluster operator
Expand Down Expand Up @@ -172,18 +182,18 @@ func (c *Controller) currentControllerStatus() (allReady bool, lastTransition ti

allReady = true

for i, source := range c.Sources() {
for name, source := range c.Sources() {
summary, ready := source.CurrentStatus()
if !ready {
klog.V(4).Infof("Source %d %T is not ready", i, source)
klog.V(4).Infof("Source %s %T is not ready", name, source)
allReady = false
continue
}
if summary.Healthy {
continue
}
if len(summary.Message) == 0 {
klog.Errorf("Programmer error: status source %d %T reported an empty message: %#v", i, source, summary)
klog.Errorf("Programmer error: status source %s %T reported an empty message: %#v", name, source, summary)
continue
}

Expand All @@ -206,18 +216,16 @@ func (c *Controller) currentControllerStatus() (allReady bool, lastTransition ti
// mark as degraded only in case of HTTP 500 and higher
if summary.Operation.HTTPStatusCode >= 500 {
klog.V(4).Infof("Failed to download the SCA certs within the threshold %d with exponential backoff. Marking as degraded.",
OCMAPIFailureCountThreshold)
ocm.OCMAPIFailureCountThreshold)
degradingFailure = true
}
c.ctrlStatus.setStatus(SCAPullStatus, summary.Reason, summary.Message)
} else if summary.Operation.Name == controllerstatus.PullingClusterTransfer.Name {
// mark as degraded only in case of HTTP 500 and higher
if summary.Operation.HTTPStatusCode >= 500 {
klog.V(4).Infof("Failed to pull the cluster transfer object within the threshold %d with exponential backoff. Marking as degraded.",
OCMAPIFailureCountThreshold)
ocm.OCMAPIFailureCountThreshold)
degradingFailure = true
}
c.ctrlStatus.setStatus(ClusterTransferStatus, summary.Reason, summary.Message)
}

if degradingFailure {
Expand Down Expand Up @@ -317,11 +325,11 @@ func (c *Controller) updateStatus(ctx context.Context, initial bool) error {
}

// update the cluster controller status conditions
func updateControllerConditions(cs *conditions, ctrlStatus *controllerStatus,
func (c *Controller) updateControllerConditions(cs *conditions,
isInitializing bool, lastTransition time.Time) {
if isInitializing {
// the disabled condition is optional, but set it now if we already know we're disabled
if ds := ctrlStatus.getStatus(DisabledStatus); ds != nil {
if ds := c.ctrlStatus.getStatus(DisabledStatus); ds != nil {
cs.setCondition(OperatorDisabled, configv1.ConditionTrue, ds.reason, ds.message, metav1.Now())
}
if !cs.hasCondition(configv1.OperatorDegraded) {
Expand All @@ -331,45 +339,60 @@ func updateControllerConditions(cs *conditions, ctrlStatus *controllerStatus,

// once we've initialized set Failing and Disabled as best we know
// handle when disabled
if ds := ctrlStatus.getStatus(DisabledStatus); ds != nil {
if ds := c.ctrlStatus.getStatus(DisabledStatus); ds != nil {
cs.setCondition(OperatorDisabled, configv1.ConditionTrue, ds.reason, ds.message, metav1.Now())
} else {
cs.setCondition(OperatorDisabled, configv1.ConditionFalse, "AsExpected", "", metav1.Now())
}

// handle when has errors
if es := ctrlStatus.getStatus(ErrorStatus); es != nil && !ctrlStatus.isDisabled() {
if es := c.ctrlStatus.getStatus(ErrorStatus); es != nil && !c.ctrlStatus.isDisabled() {
cs.setCondition(configv1.OperatorDegraded, configv1.ConditionTrue, es.reason, es.message, metav1.Time{Time: lastTransition})
} else {
cs.setCondition(configv1.OperatorDegraded, configv1.ConditionFalse, "AsExpected", insightsAvailableMessage, metav1.Now())
}

// handle when upload fails
if ur := ctrlStatus.getStatus(UploadStatus); ur != nil && !ctrlStatus.isDisabled() {
if ur := c.ctrlStatus.getStatus(UploadStatus); ur != nil && !c.ctrlStatus.isDisabled() {
cs.setCondition(InsightsUploadDegraded, configv1.ConditionTrue, ur.reason, ur.message, metav1.Time{Time: lastTransition})
} else {
cs.removeCondition(InsightsUploadDegraded)
}

// handle when download fails
if ds := ctrlStatus.getStatus(DownloadStatus); ds != nil && !ctrlStatus.isDisabled() {
if ds := c.ctrlStatus.getStatus(DownloadStatus); ds != nil && !c.ctrlStatus.isDisabled() {
cs.setCondition(InsightsDownloadDegraded, configv1.ConditionTrue, ds.reason, ds.message, metav1.Time{Time: lastTransition})
} else {
cs.removeCondition(InsightsDownloadDegraded)
}
c.updateControllerConditionByReason(cs, SCAAvailable, sca.ControllerName, sca.AvailableReason, isInitializing)
c.updateControllerConditionByReason(cs,
ClusterTransferAvailable,
clustertransfer.ControllerName,
clustertransfer.AvailableReason,
isInitializing)
}

// handle when the SCA pull from the OCM fails
if ss := ctrlStatus.getStatus(SCAPullStatus); ss != nil {
cs.setCondition(SCANotAvailable, configv1.ConditionTrue, ss.reason, ss.message, metav1.Time{Time: lastTransition})
} else {
cs.removeCondition(SCANotAvailable)
func (c *Controller) updateControllerConditionByReason(cs *conditions,
condition configv1.ClusterStatusConditionType,
controllerName, reason string,
isInitializing bool) {
controller := c.Source(controllerName)
if controller == nil {
return
}

// handle when the ClusterTransfer pull from the OCM fails
if ss := ctrlStatus.getStatus(ClusterTransferStatus); ss != nil {
cs.setCondition(ClusterTransferFailed, configv1.ConditionTrue, ss.reason, ss.message, metav1.Time{Time: lastTransition})
if isInitializing {
return
}
summary, ok := controller.CurrentStatus()
// no summary to read
if !ok {
return
}
if summary.Reason == reason {
cs.setCondition(condition, configv1.ConditionTrue, summary.Reason, summary.Message, metav1.Time{Time: summary.LastTransitionTime})
} else {
cs.removeCondition(ClusterTransferFailed)
cs.setCondition(condition, configv1.ConditionFalse, summary.Reason, summary.Message, metav1.Time{Time: summary.LastTransitionTime})
}
}

Expand Down
10 changes: 4 additions & 6 deletions pkg/controller/status/status.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package status

const (
DisabledStatus = "disabled"
UploadStatus = "upload"
DownloadStatus = "download"
ErrorStatus = "error"
SCAPullStatus = "scaPullStatus"
ClusterTransferStatus = "clusterTransferStatus"
DisabledStatus = "disabled"
UploadStatus = "upload"
DownloadStatus = "download"
ErrorStatus = "error"
)

type controllerStatus struct {
Expand Down
Loading