Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent OLM from creating InstallPlans when bundle unpack fails #2942

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/controller/bundle/bundle_unpacker.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (

const (
// TODO: This can be a spec field
// BundleUnpackTimeoutAnnotationKey allows setting a bundle unpack timeout per InstallPlan
// BundleUnpackTimeoutAnnotationKey allows setting a bundle unpack timeout per OperatorGroup
// and overrides the default specified by the --bundle-unpack-timeout flag
// The time duration should be in the same format as accepted by time.ParseDuration()
// e.g 1m30s
Expand Down
219 changes: 115 additions & 104 deletions pkg/controller/operators/catalog/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,13 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
return err
}

failForwardEnabled, err := resolver.IsFailForwardEnabled(o.lister.OperatorsV1().OperatorGroupLister().OperatorGroups(namespace))
ogLister := o.lister.OperatorsV1().OperatorGroupLister().OperatorGroups(namespace)
failForwardEnabled, err := resolver.IsFailForwardEnabled(ogLister)
if err != nil {
return err
}

unpackTimeout, err := bundle.OperatorGroupBundleUnpackTimeout(ogLister)
if err != nil {
return err
}
Expand Down Expand Up @@ -1028,6 +1034,80 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
return err
}

// Attempt to unpack bundles before installing
// Note: This should probably use the attenuated client to prevent users from resolving resources they otherwise don't have access to.
if len(bundleLookups) > 0 {
logger.Debug("unpacking bundles")

var unpacked bool
unpacked, steps, bundleLookups, err = o.unpackBundles(namespace, steps, bundleLookups, unpackTimeout)
if err != nil {
// If the error was fatal capture and fail
if olmerrors.IsFatal(err) {
_, updateErr := o.updateSubscriptionStatuses(
o.setSubsCond(subs, v1alpha1.SubscriptionCondition{
Type: v1alpha1.SubscriptionBundleUnpackFailed,
Reason: "ErrorPreventedUnpacking",
Message: err.Error(),
Status: corev1.ConditionTrue,
}))
if updateErr != nil {
logger.WithError(updateErr).Debug("failed to update subs conditions")
return updateErr
}
return nil
}
// Retry sync if non-fatal error
return fmt.Errorf("bundle unpacking failed with an error: %w", err)
}

// Check BundleLookup status conditions to see if the BundleLookupFailed condtion is true
// which means bundle lookup has failed and subscriptions need to be updated
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// which means bundle lookup has failed and subscriptions need to be updated
// which means bundle lookup has failed and subscriptions needs to be updated

// with a condition indicating the failure.
isFailed, cond := hasBundleLookupFailureCondition(bundleLookups)
if isFailed {
err := fmt.Errorf("bundle unpacking failed. Reason: %v, and Message: %v", cond.Reason, cond.Message)
logger.Infof("%v", err)

_, updateErr := o.updateSubscriptionStatuses(
o.setSubsCond(subs, v1alpha1.SubscriptionCondition{
Type: v1alpha1.SubscriptionBundleUnpackFailed,
Reason: "BundleUnpackFailed",
Message: err.Error(),
Status: corev1.ConditionTrue,
}))
if updateErr != nil {
logger.WithError(updateErr).Debug("failed to update subs conditions")
return updateErr
}
// Since this is likely requires intervention we do not want to
// requeue too often. We return no error here and rely on a
// periodic resync which will help to automatically resolve
// some issues such as unreachable bundle images caused by
// bad catalog updates.
return nil
}

// This means that the unpack job is still running (most likely) or
// there was some issue which we did not handle above.
if !unpacked {
_, updateErr := o.updateSubscriptionStatuses(
o.setSubsCond(subs, v1alpha1.SubscriptionCondition{
Type: v1alpha1.SubscriptionBundleUnpacking,
Reason: "UnpackingInProgress",
Status: corev1.ConditionTrue,
}))
if updateErr != nil {
logger.WithError(updateErr).Debug("failed to update subs conditions")
return updateErr
}

logger.Debug("unpacking is not complete yet, requeueing")
o.nsResolveQueue.AddAfter(namespace, 5*time.Second)
return nil
}
}

// create installplan if anything updated
if len(updatedSubs) > 0 {
logger.Debug("resolution caused subscription changes, creating installplan")
Expand Down Expand Up @@ -1062,8 +1142,17 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
logger.Debugf("no subscriptions were updated")
}

// Make sure that we no longer indicate unpacking progress
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies if I'm not understanding the order of operations correctly here, but does this condition not become true before calling ensureInstallPlan? Or in other words shouldn't these status updates come on line 1110?

Or else, are we saying that a bundle unpack isn't successful until an InstallPlan is successfully created with the unpacked bundles?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies if I'm not understanding the order of operations correctly here, but does this condition not become true before calling ensureInstallPlan? Or in other words shouldn't these status updates come on line 1110?

Your understanding is correct. Order is as follows:

  • We first create an unpack job within o.unpackBundles(...)
  • Based on the result from the above we either set SubscriptionBundleUnpackFailed (in case of errors) or SubscriptionBundleUnpacking (in case when we still waiting on the unpack job) to True. I believe SubscriptionBundleUnpacking will be the most common on the first run when syncing with a new bundle.
  • Once unpacking is done - we create an InstallPlan in o.ensureInstallPlan(...)
    • We just create an InstallPlan here, but syncing of the InstallPlan is a separate process - (see o.syncInstallPlans)
  • All being well - we make sure to remove SubscriptionBundleUnpackFailed on subs (if any exist) and we set SubscriptionBundleUnpacking to False.
  • Update subs

Or else, are we saying that a bundle unpack isn't successful until an InstallPlan is successfully created with the unpacked bundles?

Technically unpack can be successfull without successfull InstallPlan creation. But in terms of current implementation in this PR - you are right.

I considered removing SubscriptionBundleUnpackFailed and setting SubscriptionBundleUnpacking to False before o.ensureInstallPlan(...), but it makes syncing a bit more complex becaues we would have to do two updates to the subs. We would have to implement something like this:

  • Unpacking (first two steps from the list above)
  • Removing SubscriptionBundleUnpackFailed and setting SubscriptionBundleUnpacking to False
  • o.updateSubscriptionStatuses(subs) <--- First update
  • o.ensureInstallPlan(...) and o.setIPReference(...)
  • o.updateSubscriptionStatuses(subs) <--- Second update

Instead we just do:

  • Unpacking
  • o.ensureInstallPlan(...) and o.setIPReference(...)
  • Removing SubscriptionBundleUnpackFailed and setting SubscriptionBundleUnpacking to False
  • o.updateSubscriptionStatuses(subs) <--- The only update to subs at the end

So it is a bit of optimisation, but I'm happy to reconsider. Let me know if you see an issue with this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for answering, I appreciate the level of detail!
I'm fine with what you have here to keep API calls to a minimum. We probably don't need such a fine-grained level of tracing for the process anyway - I mostly wanted to make sure there was intention behind it. We can keep it the way you have it :)

subs = o.setSubsCond(subs, v1alpha1.SubscriptionCondition{
Type: v1alpha1.SubscriptionBundleUnpacking,
Status: corev1.ConditionFalse,
})

// Remove BundleUnpackFailed condition from subscriptions
o.removeSubsCond(subs, v1alpha1.SubscriptionBundleUnpackFailed)

// Remove resolutionfailed condition from subscriptions
subs = o.removeSubsCond(subs, v1alpha1.SubscriptionResolutionFailed)
o.removeSubsCond(subs, v1alpha1.SubscriptionResolutionFailed)
newSub := true
for _, updatedSub := range updatedSubs {
updatedSub.Status.RemoveConditions(v1alpha1.SubscriptionResolutionFailed)
Expand Down Expand Up @@ -1408,19 +1497,27 @@ type UnpackedBundleReference struct {
Properties string `json:"properties"`
}

func (o *Operator) unpackBundles(plan *v1alpha1.InstallPlan, unpackTimeout time.Duration) (bool, *v1alpha1.InstallPlan, error) {
out := plan.DeepCopy()
func (o *Operator) unpackBundles(namespace string, installPlanSteps []*v1alpha1.Step, bundleLookups []v1alpha1.BundleLookup, unpackTimeout time.Duration) (bool, []*v1alpha1.Step, []v1alpha1.BundleLookup, error) {
unpacked := true

outBundleLookups := make([]v1alpha1.BundleLookup, len(bundleLookups))
for i := range bundleLookups {
bundleLookups[i].DeepCopyInto(&outBundleLookups[i])
}
outInstallPlanSteps := make([]*v1alpha1.Step, len(installPlanSteps))
for i := range installPlanSteps {
outInstallPlanSteps[i] = installPlanSteps[i].DeepCopy()
}

var errs []error
for i := 0; i < len(out.Status.BundleLookups); i++ {
lookup := out.Status.BundleLookups[i]
for i := 0; i < len(outBundleLookups); i++ {
lookup := outBundleLookups[i]
res, err := o.bundleUnpacker.UnpackBundle(&lookup, unpackTimeout)
if err != nil {
errs = append(errs, err)
continue
}
out.Status.BundleLookups[i] = *res.BundleLookup
outBundleLookups[i] = *res.BundleLookup

// if the failed condition is present it means the bundle unpacking has failed
failedCondition := res.GetCondition(v1alpha1.BundleLookupFailed)
Expand All @@ -1442,11 +1539,11 @@ func (o *Operator) unpackBundles(plan *v1alpha1.InstallPlan, unpackTimeout time.
continue
}

// Ensure that bundle can be applied by the current version of OLM by converting to steps
steps, err := resolver.NewStepsFromBundle(res.Bundle(), out.GetNamespace(), res.Replaces, res.CatalogSourceRef.Name, res.CatalogSourceRef.Namespace)
// Ensure that bundle can be applied by the current version of OLM by converting to bundleSteps
bundleSteps, err := resolver.NewStepsFromBundle(res.Bundle(), namespace, res.Replaces, res.CatalogSourceRef.Name, res.CatalogSourceRef.Namespace)
if err != nil {
if fatal := olmerrors.IsFatal(err); fatal {
return false, nil, err
return false, nil, nil, err
}

errs = append(errs, fmt.Errorf("failed to turn bundle into steps: %v", err))
Expand All @@ -1455,7 +1552,7 @@ func (o *Operator) unpackBundles(plan *v1alpha1.InstallPlan, unpackTimeout time.
}

// step manifests are replaced with references to the configmap containing them
for i, s := range steps {
for i, s := range bundleSteps {
ref := UnpackedBundleReference{
Kind: "ConfigMap",
Namespace: res.CatalogSourceRef.Namespace,
Expand All @@ -1472,19 +1569,19 @@ func (o *Operator) unpackBundles(plan *v1alpha1.InstallPlan, unpackTimeout time.
continue
}
s.Resource.Manifest = string(r)
steps[i] = s
bundleSteps[i] = s
}
res.RemoveCondition(resolver.BundleLookupConditionPacked)
out.Status.BundleLookups[i] = *res.BundleLookup
out.Status.Plan = append(out.Status.Plan, steps...)
outBundleLookups[i] = *res.BundleLookup
outInstallPlanSteps = append(outInstallPlanSteps, bundleSteps...)
}

if err := utilerrors.NewAggregate(errs); err != nil {
o.logger.Debugf("failed to unpack bundles: %v", err)
return false, nil, err
return false, nil, nil, err
}

return unpacked, out, nil
return unpacked, outInstallPlanSteps, outBundleLookups, nil
}

// gcInstallPlans garbage collects installplans that are too old
Expand Down Expand Up @@ -1631,71 +1728,6 @@ func (o *Operator) syncInstallPlans(obj interface{}) (syncError error) {
}
}

ogLister := o.lister.OperatorsV1().OperatorGroupLister().OperatorGroups(plan.GetNamespace())
unpackTimeout, err := bundle.OperatorGroupBundleUnpackTimeout(ogLister)
if err != nil {
return err
}

// Attempt to unpack bundles before installing
// Note: This should probably use the attenuated client to prevent users from resolving resources they otherwise don't have access to.
if len(plan.Status.BundleLookups) > 0 {
unpacked, out, err := o.unpackBundles(plan, unpackTimeout)
if err != nil {
// If the error was fatal capture and fail
if fatal := olmerrors.IsFatal(err); fatal {
if err := o.transitionInstallPlanToFailed(plan, logger, v1alpha1.InstallPlanReasonInstallCheckFailed, err.Error()); err != nil {
// retry for failure to update status
syncError = err
return
}
}
// Retry sync if non-fatal error
syncError = fmt.Errorf("bundle unpacking failed: %v", err)
return
}

if !reflect.DeepEqual(plan.Status, out.Status) {
logger.Warnf("status not equal, updating...")
if _, err := o.client.OperatorsV1alpha1().InstallPlans(out.GetNamespace()).UpdateStatus(context.TODO(), out, metav1.UpdateOptions{}); err != nil {
syncError = fmt.Errorf("failed to update installplan bundle lookups: %v", err)
}

return
}

// Check BundleLookup status conditions to see if the BundleLookupPending condtion is false
// which means bundle lookup has failed and the InstallPlan should be failed as well
isFailed, cond := hasBundleLookupFailureCondition(plan)
if isFailed {
err := fmt.Errorf("bundle unpacking failed. Reason: %v, and Message: %v", cond.Reason, cond.Message)
// Mark the InstallPlan as failed for a fatal bundle unpack error
logger.Infof("%v", err)

if err := o.transitionInstallPlanToFailed(plan, logger, v1alpha1.InstallPlanReasonInstallCheckFailed, err.Error()); err != nil {
// retry for failure to update status
syncError = err
return
}

// Requeue subscription to propagate SubscriptionInstallPlanFailed condtion to subscription
o.requeueSubscriptionForInstallPlan(plan, logger)
return
}

// TODO: Remove in favor of job and configmap informer requeuing
if !unpacked {
err := o.ipQueueSet.RequeueAfter(plan.GetNamespace(), plan.GetName(), 5*time.Second)
if err != nil {
syncError = err
return
}
logger.Debug("install plan not yet populated from bundle image, requeueing")

return
}
}

outInstallPlan, syncError := transitionInstallPlanState(logger.Logger, o, *plan, o.now(), o.installPlanTimeout)

if syncError != nil {
Expand Down Expand Up @@ -1723,8 +1755,8 @@ func (o *Operator) syncInstallPlans(obj interface{}) (syncError error) {
return
}

func hasBundleLookupFailureCondition(plan *v1alpha1.InstallPlan) (bool, *v1alpha1.BundleLookupCondition) {
for _, bundleLookup := range plan.Status.BundleLookups {
func hasBundleLookupFailureCondition(bundleLookups []v1alpha1.BundleLookup) (bool, *v1alpha1.BundleLookupCondition) {
for _, bundleLookup := range bundleLookups {
for _, cond := range bundleLookup.Conditions {
if cond.Type == v1alpha1.BundleLookupFailed && cond.Status == corev1.ConditionTrue {
return true, &cond
Expand All @@ -1734,27 +1766,6 @@ func hasBundleLookupFailureCondition(plan *v1alpha1.InstallPlan) (bool, *v1alpha
return false, nil
}

func (o *Operator) transitionInstallPlanToFailed(plan *v1alpha1.InstallPlan, logger logrus.FieldLogger, reason v1alpha1.InstallPlanConditionReason, message string) error {
now := o.now()
out := plan.DeepCopy()
out.Status.SetCondition(v1alpha1.ConditionFailed(v1alpha1.InstallPlanInstalled,
reason, message, &now))
out.Status.Phase = v1alpha1.InstallPlanPhaseFailed

logger.Info("transitioning InstallPlan to failed")
_, err := o.client.OperatorsV1alpha1().InstallPlans(plan.GetNamespace()).UpdateStatus(context.TODO(), out, metav1.UpdateOptions{})
if err == nil {
return nil
}

updateErr := errors.New("error updating InstallPlan status: " + err.Error())
logger = logger.WithField("updateError", updateErr)
logger.Errorf("error transitioning InstallPlan to failed")

// retry sync with error to update InstallPlan status
return fmt.Errorf("installplan failed: %s and error updating InstallPlan status as failed: %s", message, updateErr)
}

func (o *Operator) requeueSubscriptionForInstallPlan(plan *v1alpha1.InstallPlan, logger *logrus.Entry) {
// Notify subscription loop of installplan changes
owners := ownerutil.GetOwnersByKind(plan, v1alpha1.SubscriptionKind)
Expand Down
Loading