Skip to content

Commit 9d6e88c

Browse files
awgreeneopenshift-merge-robot
authored andcommitted
Introduce support for Fail Forward Upgrades
This commit introduces OLM support for fail forward upgrades. Existing upgrade behavior is now defined as "Default" upgrade behavior. A new upgrade strategy referred to as UnsafeFailForward has been introduced in a tech preview state. When fail forward upgrades are enabled, CSVs will now be able to move from the failed state to the replacing state, effectively allowing operators in a namespace to fail forward from failed installs or upgrades. Additionally, it will be possible to recover from failed installPlans by updating the upgrade graph defined in the catalog. Fail forward upgrades are generally unsafe and unsupported as they could result in unexpected behavior or lost data due to the fact that critical version of an operator may not be visited. Signed-off-by: Alexander Greene <[email protected]>
1 parent 63c0981 commit 9d6e88c

11 files changed

+962
-33
lines changed

pkg/controller/operators/catalog/operator.go

+35-5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
"k8s.io/client-go/util/workqueue"
4343

4444
"github.com/operator-framework/api/pkg/operators/reference"
45+
operatorsv1 "github.com/operator-framework/api/pkg/operators/v1"
4546
"github.com/operator-framework/api/pkg/operators/v1alpha1"
4647
"github.com/operator-framework/operator-lifecycle-manager/pkg/api/client/clientset/versioned"
4748
"github.com/operator-framework/operator-lifecycle-manager/pkg/api/client/informers/externalversions"
@@ -882,6 +883,19 @@ func (o *Operator) syncCatalogSources(obj interface{}) (syncError error) {
882883
return
883884
}
884885

886+
func (o *Operator) isFailForwardEnabled(namespace string) (bool, error) {
887+
ogs, err := o.lister.OperatorsV1().OperatorGroupLister().OperatorGroups(namespace).List(labels.Everything())
888+
if err != nil {
889+
// Couldn't list operatorGroups, assuming default upgradeStrategy
890+
// so existing behavior is observed for failed CSVs.
891+
return false, nil
892+
}
893+
if len(ogs) != 1 {
894+
return false, fmt.Errorf("found %d operatorGroups in namespace %s, expected 1", len(ogs), namespace)
895+
}
896+
return ogs[0].UpgradeStrategy() == operatorsv1.UpgradeStrategyUnsafeFailForward, nil
897+
}
898+
885899
func (o *Operator) syncResolvingNamespace(obj interface{}) error {
886900
ns, ok := obj.(*corev1.Namespace)
887901
if !ok {
@@ -908,6 +922,11 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
908922
return err
909923
}
910924

925+
failForwardEnabled, err := o.isFailForwardEnabled(namespace)
926+
if err != nil {
927+
return err
928+
}
929+
911930
// TODO: parallel
912931
maxGeneration := 0
913932
subscriptionUpdated := false
@@ -924,15 +943,15 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
924943
}
925944

926945
// ensure the installplan reference is correct
927-
sub, changedIP, err := o.ensureSubscriptionInstallPlanState(logger, sub)
946+
sub, changedIP, err := o.ensureSubscriptionInstallPlanState(logger, sub, failForwardEnabled)
928947
if err != nil {
929948
logger.Debugf("error ensuring installplan state: %v", err)
930949
return err
931950
}
932951
subscriptionUpdated = subscriptionUpdated || changedIP
933952

934953
// record the current state of the desired corresponding CSV in the status. no-op if we don't know the csv yet.
935-
sub, changedCSV, err := o.ensureSubscriptionCSVState(logger, sub)
954+
sub, changedCSV, err := o.ensureSubscriptionCSVState(logger, sub, failForwardEnabled)
936955
if err != nil {
937956
logger.Debugf("error recording current state of CSV in status: %v", err)
938957
return err
@@ -958,7 +977,7 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
958977
logger.Debug("resolving subscriptions in namespace")
959978

960979
// resolve a set of steps to apply to a cluster, a set of subscriptions to create/update, and any errors
961-
steps, bundleLookups, updatedSubs, err := o.resolver.ResolveSteps(namespace)
980+
steps, bundleLookups, updatedSubs, err := o.resolver.ResolveSteps(namespace, failForwardEnabled)
962981
if err != nil {
963982
go o.recorder.Event(ns, corev1.EventTypeWarning, "ResolutionFailed", err.Error())
964983
// If the error is constraints not satisfiable, then simply project the
@@ -1080,7 +1099,7 @@ func (o *Operator) nothingToUpdate(logger *logrus.Entry, sub *v1alpha1.Subscript
10801099
return false
10811100
}
10821101

1083-
func (o *Operator) ensureSubscriptionInstallPlanState(logger *logrus.Entry, sub *v1alpha1.Subscription) (*v1alpha1.Subscription, bool, error) {
1102+
func (o *Operator) ensureSubscriptionInstallPlanState(logger *logrus.Entry, sub *v1alpha1.Subscription, failForwardEnabled bool) (*v1alpha1.Subscription, bool, error) {
10841103
if sub.Status.InstallPlanRef != nil || sub.Status.Install != nil {
10851104
return sub, false, nil
10861105
}
@@ -1110,13 +1129,16 @@ func (o *Operator) ensureSubscriptionInstallPlanState(logger *logrus.Entry, sub
11101129
out.Status.InstallPlanRef = ref
11111130
out.Status.Install = v1alpha1.NewInstallPlanReference(ref)
11121131
out.Status.State = v1alpha1.SubscriptionStateUpgradePending
1132+
if failForwardEnabled && ip.Status.Phase == v1alpha1.InstallPlanPhaseFailed {
1133+
out.Status.State = v1alpha1.SubscriptionStateFailed
1134+
}
11131135
out.Status.CurrentCSV = out.Spec.StartingCSV
11141136
out.Status.LastUpdated = o.now()
11151137

11161138
return out, true, nil
11171139
}
11181140

1119-
func (o *Operator) ensureSubscriptionCSVState(logger *logrus.Entry, sub *v1alpha1.Subscription) (*v1alpha1.Subscription, bool, error) {
1141+
func (o *Operator) ensureSubscriptionCSVState(logger *logrus.Entry, sub *v1alpha1.Subscription, failForwardEnabled bool) (*v1alpha1.Subscription, bool, error) {
11201142
if sub.Status.CurrentCSV == "" {
11211143
return sub, false, nil
11221144
}
@@ -1126,6 +1148,14 @@ func (o *Operator) ensureSubscriptionCSVState(logger *logrus.Entry, sub *v1alpha
11261148
if err != nil {
11271149
logger.WithError(err).WithField("currentCSV", sub.Status.CurrentCSV).Debug("error fetching csv listed in subscription status")
11281150
out.Status.State = v1alpha1.SubscriptionStateUpgradePending
1151+
if failForwardEnabled && sub.Status.InstallPlanRef != nil {
1152+
ip, err := o.client.OperatorsV1alpha1().InstallPlans(sub.GetNamespace()).Get(context.TODO(), sub.Status.InstallPlanRef.Name, metav1.GetOptions{})
1153+
if err != nil {
1154+
logger.WithError(err).WithField("currentCSV", sub.Status.CurrentCSV).Debug("error fetching installplan listed in subscription status")
1155+
} else if ip.Status.Phase == v1alpha1.InstallPlanPhaseFailed {
1156+
out.Status.State = v1alpha1.SubscriptionStateFailed
1157+
}
1158+
}
11291159
} else {
11301160
out.Status.State = v1alpha1.SubscriptionStateAtLatest
11311161
out.Status.InstalledCSV = sub.Status.CurrentCSV

pkg/controller/operators/catalog/operator_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1254,7 +1254,7 @@ func TestSyncResolvingNamespace(t *testing.T) {
12541254

12551255
o.sourcesLastUpdate.Set(tt.fields.sourcesLastUpdate.Time)
12561256
o.resolver = &fakes.FakeStepResolver{
1257-
ResolveStepsStub: func(string) ([]*v1alpha1.Step, []v1alpha1.BundleLookup, []*v1alpha1.Subscription, error) {
1257+
ResolveStepsStub: func(string, bool) ([]*v1alpha1.Step, []v1alpha1.BundleLookup, []*v1alpha1.Subscription, error) {
12581258
return nil, nil, nil, tt.fields.resolveErr
12591259
},
12601260
}

pkg/controller/operators/catalog/subscriptions_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -1031,7 +1031,7 @@ func TestSyncSubscriptions(t *testing.T) {
10311031

10321032
o.sourcesLastUpdate.Set(tt.fields.sourcesLastUpdate.Time)
10331033
o.resolver = &fakes.FakeStepResolver{
1034-
ResolveStepsStub: func(string) ([]*v1alpha1.Step, []v1alpha1.BundleLookup, []*v1alpha1.Subscription, error) {
1034+
ResolveStepsStub: func(string, bool) ([]*v1alpha1.Step, []v1alpha1.BundleLookup, []*v1alpha1.Subscription, error) {
10351035
return tt.fields.resolveSteps, tt.fields.bundleLookups, tt.fields.resolveSubs, tt.fields.resolveErr
10361036
},
10371037
}
@@ -1168,7 +1168,7 @@ func BenchmarkSyncResolvingNamespace(b *testing.B) {
11681168
},
11691169
},
11701170
resolver: &fakes.FakeStepResolver{
1171-
ResolveStepsStub: func(string) ([]*v1alpha1.Step, []v1alpha1.BundleLookup, []*v1alpha1.Subscription, error) {
1171+
ResolveStepsStub: func(string, bool) ([]*v1alpha1.Step, []v1alpha1.BundleLookup, []*v1alpha1.Subscription, error) {
11721172
steps := []*v1alpha1.Step{
11731173
{
11741174
Resolving: "csv.v.2",

pkg/controller/operators/olm/operator.go

+33-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
3939
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/internal/pruning"
4040
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/overrides"
41+
resolver "github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver"
4142
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/clients"
4243
csvutility "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/csv"
4344
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/event"
@@ -1996,6 +1997,15 @@ func (a *Operator) transitionCSVState(in v1alpha1.ClusterServiceVersion) (out *v
19961997
}
19971998

19981999
case v1alpha1.CSVPhaseFailed:
2000+
// Transition to the replacing phase if FailForward is enabled and a CSV exists that replaces the operator.
2001+
if operatorGroup.UpgradeStrategy() == operatorsv1.UpgradeStrategyUnsafeFailForward {
2002+
if replacement := a.isBeingReplaced(out, a.csvSet(out.GetNamespace(), v1alpha1.CSVPhaseAny)); replacement != nil {
2003+
msg := fmt.Sprintf("Fail Forward is enabled, allowing %s csv to be replaced by csv: %s", out.Status.Phase, replacement.GetName())
2004+
out.SetPhaseWithEvent(v1alpha1.CSVPhaseReplacing, v1alpha1.CSVReasonBeingReplaced, msg, a.now(), a.recorder)
2005+
metrics.CSVUpgradeCount.Inc()
2006+
return
2007+
}
2008+
}
19992009
installer, strategy := a.parseStrategiesAndUpdateStatus(out)
20002010
if strategy == nil {
20012011
return
@@ -2087,7 +2097,29 @@ func (a *Operator) transitionCSVState(in v1alpha1.ClusterServiceVersion) (out *v
20872097
}
20882098

20892099
// If there is a succeeded replacement, mark this for deletion
2090-
if next := a.isBeingReplaced(out, a.csvSet(out.GetNamespace(), v1alpha1.CSVPhaseAny)); next != nil {
2100+
next := a.isBeingReplaced(out, a.csvSet(out.GetNamespace(), v1alpha1.CSVPhaseAny))
2101+
// Get the newest CSV in the replacement chain if fail forward upgrades are enabled.
2102+
if operatorGroup.UpgradeStrategy() == operatorsv1.UpgradeStrategyUnsafeFailForward {
2103+
csvs, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(next.GetNamespace()).List(labels.Everything())
2104+
if err != nil {
2105+
syncError = err
2106+
return
2107+
}
2108+
2109+
lastCSVInChain, err := resolver.WalkReplacementChain(next, resolver.ReplacementMapping(csvs), resolver.WithUniqueCSVs())
2110+
if err != nil {
2111+
syncError = err
2112+
return
2113+
}
2114+
2115+
if lastCSVInChain == nil {
2116+
syncError = fmt.Errorf("fail forward upgrades enabled, unable to identify last CSV in replacement chain")
2117+
return
2118+
}
2119+
2120+
next = lastCSVInChain
2121+
}
2122+
if next != nil {
20912123
if next.Status.Phase == v1alpha1.CSVPhaseSucceeded {
20922124
out.SetPhaseWithEvent(v1alpha1.CSVPhaseDeleting, v1alpha1.CSVReasonReplaced, "has been replaced by a newer ClusterServiceVersion that has successfully installed.", now, a.recorder)
20932125
} else {

0 commit comments

Comments
 (0)