From cd87fe2dccab97b9435731ab724631e288650fd9 Mon Sep 17 00:00:00 2001 From: Artur Zych <5843875+azych@users.noreply.github.com> Date: Fri, 14 Feb 2025 13:53:37 +0100 Subject: [PATCH 1/2] Handle interrupted helm releases in applier Introduces a workaround for 'interrupted' helm releases which enter into a 'pending' (-install/uninstall/rollback) state. If that happens, for example because of immediate application exit with one of those operations being in flight, helm is not able to resolve it automatically which means we end up in a permanent reconcile error state. One of the workarounds for this that has been repeated in the community is to remove metadata of the pending release, which is the solution chosen here. For full context see: https://github.com/operator-framework/operator-controller/pull/1776 https://github.com/helm/helm/issues/5595 --- go.mod | 2 +- go.sum | 4 +- .../operator-controller/action/helm_test.go | 6 ++ internal/operator-controller/applier/helm.go | 26 +++++- .../operator-controller/applier/helm_test.go | 81 +++++++++++++++++++ .../clusterextension_controller_test.go | 5 ++ 6 files changed, 119 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 1b37df107..4a3064ebd 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.1 github.com/operator-framework/api v0.30.0 - github.com/operator-framework/helm-operator-plugins v0.8.0 + github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b github.com/operator-framework/operator-registry v1.50.0 github.com/prometheus/client_golang v1.21.1 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 2e5ddec2f..2dbb8619b 100644 --- a/go.sum +++ b/go.sum @@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o= github.com/operator-framework/api v0.30.0 h1:44hCmGnEnZk/Miol5o44dhSldNH0EToQUG7vZTl29kk= github.com/operator-framework/api v0.30.0/go.mod h1:FYxAPhjtlXSAty/fbn5YJnFagt6SpJZJgFNNbvDe5W0= -github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc= -github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA= +github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU= +github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys= github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4= github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw= github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4= diff --git a/internal/operator-controller/action/helm_test.go b/internal/operator-controller/action/helm_test.go index 3f1f02f9d..b301bcd4c 100644 --- a/internal/operator-controller/action/helm_test.go +++ b/internal/operator-controller/action/helm_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart" "helm.sh/helm/v3/pkg/release" "sigs.k8s.io/controller-runtime/pkg/client" @@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error { return args.Error(0) } +func (m *mockActionClient) Config() *action.Configuration { + args := m.Called() + return args.Get(0).(*action.Configuration) +} + var _ actionclient.ActionClientGetter = &mockActionClientGetter{} type mockActionClientGetter struct { diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 60a03477a..7ea4d4769 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -95,7 +95,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte labels: objectLabels, } - rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post) + rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post) if err != nil { return nil, "", err } @@ -164,9 +164,30 @@ func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*char return h.BundleToHelmChartFn(bundleFS, ext.Spec.Namespace, watchNamespace) } -func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) { +func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) { + logger := log.FromContext(ctx) currentRelease, err := cl.Get(ext.GetName()) + + // if a release is pending at this point, that means that a helm action + // (installation/upgrade) we were attempting was likely interrupted in-flight. + // Pending release would leave us in reconciliation error loop because helm + // wouldn't be able to progress automatically from it. + // + // one of the workarounds is to try and remove helm metadata relating to + // that pending release which should 'reset' its state communicated to helm + // and the next reconciliation should be able to successfully pick up from here + // for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476 + // and the discussion in https://github.com/operator-framework/operator-controller/pull/1776 + if err == nil && currentRelease.Info.Status.IsPending() { + if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil { + return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err) + } + // return error to try to detect proper state (installation/upgrade) at next reconciliation + return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version) + } + if errors.Is(err, driver.ErrReleaseNotFound) { + logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName()) desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error { i.DryRun = true i.DryRunOption = "server" @@ -186,6 +207,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE } desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error { + logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName()) upgrade.MaxHistory = maxHelmReleaseHistory upgrade.DryRun = true upgrade.DryRunOption = "server" diff --git a/internal/operator-controller/applier/helm_test.go b/internal/operator-controller/applier/helm_test.go index faaa41783..812b10ea4 100644 --- a/internal/operator-controller/applier/helm_test.go +++ b/internal/operator-controller/applier/helm_test.go @@ -13,6 +13,7 @@ import ( "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart" "helm.sh/helm/v3/pkg/release" + "helm.sh/helm/v3/pkg/storage" "helm.sh/helm/v3/pkg/storage/driver" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -50,6 +51,7 @@ type mockActionGetter struct { reconcileErr error desiredRel *release.Release currentRel *release.Release + config *action.Configuration } func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) { @@ -98,6 +100,10 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error { return mag.reconcileErr } +func (mag *mockActionGetter) Config() *action.Configuration { + return mag.config +} + var ( // required for unmockable call to convert.RegistryV1ToHelmChart validFS = fstest.MapFS{ @@ -179,6 +185,80 @@ func TestApply_Base(t *testing.T) { }) } +func TestApply_InterruptedRelease(t *testing.T) { + t.Run("fails removing an interrupted install release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} + testStorage := storage.Init(driver.NewMemory()) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartFn: convert.RegistryV1ToHelmChart, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removing interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) + + t.Run("fails removing an interrupted upgrade release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} + testStorage := storage.Init(driver.NewMemory()) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartFn: convert.RegistryV1ToHelmChart, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removing interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) + + t.Run("successfully removes an interrupted install release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} + testStorage := storage.Init(driver.NewMemory()) + err := testStorage.Create(testRel) + require.NoError(t, err) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartFn: convert.RegistryV1ToHelmChart, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removed interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) + + t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} + testStorage := storage.Init(driver.NewMemory()) + err := testStorage.Create(testRel) + require.NoError(t, err) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartFn: convert.RegistryV1ToHelmChart, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removed interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) +} + func TestApply_Installation(t *testing.T) { t.Run("fails during dry-run installation", func(t *testing.T) { mockAcg := &mockActionGetter{ @@ -340,6 +420,7 @@ func TestApply_Upgrade(t *testing.T) { t.Run("fails during dry-run upgrade", func(t *testing.T) { mockAcg := &mockActionGetter{ + currentRel: testCurrentRelease, dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"), } helmApplier := applier.Helm{ diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index be61891a0..1531c9318 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -12,6 +12,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart" "helm.sh/helm/v3/pkg/release" "helm.sh/helm/v3/pkg/storage/driver" @@ -1411,6 +1412,10 @@ func (mag *MockActionGetter) Reconcile(rel *release.Release) error { return nil } +func (mag *MockActionGetter) Config() *action.Configuration { + return nil +} + func TestGetInstalledBundleHistory(t *testing.T) { getter := controllers.DefaultInstalledBundleGetter{} From 4e0d40aca824cbc1f7d3dfa394e2f56c90cfec3a Mon Sep 17 00:00:00 2001 From: Artur Zych <5843875+azych@users.noreply.github.com> Date: Mon, 10 Mar 2025 15:52:02 +0100 Subject: [PATCH 2/2] move pending release handling out of getReleaseState --- internal/operator-controller/applier/helm.go | 45 +++++++++++-------- .../operator-controller/applier/helm_test.go | 18 ++++---- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 7ea4d4769..0e5971a71 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -36,6 +36,11 @@ const ( maxHelmReleaseHistory = 10 ) +var ( + errPendingRelease = errors.New("release is in a pending state") + errBuildHelmChartFuncNil = errors.New("BundleToHelmChartFn is nil") +) + // Preflight is a check that should be run before making any changes to the cluster type Preflight interface { // Install runs checks that should be successful prior @@ -96,6 +101,23 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte } rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post) + // if a release is pending, that means that a helm action + // (installation/upgrade) we were attempting was likely interrupted in-flight. + // Pending release would leave us in reconciliation error loop because helm + // wouldn't be able to progress automatically from it. + // + // one of the workarounds is to try and remove helm metadata relating to + // that pending release which should 'reset' its state communicated to helm + // and the next reconciliation should be able to successfully pick up from here + // for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476 + // and the discussion in https://github.com/operator-framework/operator-controller/pull/1776 + if errors.Is(err, errPendingRelease) { + if _, err := ac.Config().Releases.Delete(rel.Name, rel.Version); err != nil { + return nil, "", fmt.Errorf("failed removing pending release %q version %d metadata: %w", rel.Name, rel.Version, err) + } + // return an error to try to detect proper state (installation/upgrade) at next reconciliation + return nil, "", fmt.Errorf("removed pending release %q version %d metadata", rel.Name, rel.Version) + } if err != nil { return nil, "", err } @@ -155,7 +177,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.BundleToHelmChartFn == nil { - return nil, errors.New("BundleToHelmChartFn is nil") + return nil, errBuildHelmChartFuncNil } watchNamespace, err := GetWatchNamespace(ext) if err != nil { @@ -168,24 +190,6 @@ func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterfac logger := log.FromContext(ctx) currentRelease, err := cl.Get(ext.GetName()) - // if a release is pending at this point, that means that a helm action - // (installation/upgrade) we were attempting was likely interrupted in-flight. - // Pending release would leave us in reconciliation error loop because helm - // wouldn't be able to progress automatically from it. - // - // one of the workarounds is to try and remove helm metadata relating to - // that pending release which should 'reset' its state communicated to helm - // and the next reconciliation should be able to successfully pick up from here - // for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476 - // and the discussion in https://github.com/operator-framework/operator-controller/pull/1776 - if err == nil && currentRelease.Info.Status.IsPending() { - if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil { - return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err) - } - // return error to try to detect proper state (installation/upgrade) at next reconciliation - return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version) - } - if errors.Is(err, driver.ErrReleaseNotFound) { logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName()) desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error { @@ -205,6 +209,9 @@ func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterfac if err != nil { return nil, nil, StateError, err } + if currentRelease.Info.Status.IsPending() { + return currentRelease, nil, StateError, errPendingRelease + } desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error { logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName()) diff --git a/internal/operator-controller/applier/helm_test.go b/internal/operator-controller/applier/helm_test.go index 812b10ea4..1d9216bb8 100644 --- a/internal/operator-controller/applier/helm_test.go +++ b/internal/operator-controller/applier/helm_test.go @@ -185,8 +185,8 @@ func TestApply_Base(t *testing.T) { }) } -func TestApply_InterruptedRelease(t *testing.T) { - t.Run("fails removing an interrupted install release", func(t *testing.T) { +func TestApply_PendingRelease(t *testing.T) { + t.Run("fails removing a pending install release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} testStorage := storage.Init(driver.NewMemory()) @@ -198,12 +198,12 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removing interrupted release") + require.ErrorContains(t, err, "removing pending release") require.Nil(t, objs) require.Empty(t, state) }) - t.Run("fails removing an interrupted upgrade release", func(t *testing.T) { + t.Run("fails removing a pending upgrade release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} testStorage := storage.Init(driver.NewMemory()) @@ -215,12 +215,12 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removing interrupted release") + require.ErrorContains(t, err, "removing pending release") require.Nil(t, objs) require.Empty(t, state) }) - t.Run("successfully removes an interrupted install release", func(t *testing.T) { + t.Run("successfully removes a pending install release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} testStorage := storage.Init(driver.NewMemory()) err := testStorage.Create(testRel) @@ -234,12 +234,12 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removed interrupted release") + require.ErrorContains(t, err, "removed pending release") require.Nil(t, objs) require.Empty(t, state) }) - t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) { + t.Run("successfully removes an pending upgrade release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} testStorage := storage.Init(driver.NewMemory()) err := testStorage.Create(testRel) @@ -253,7 +253,7 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removed interrupted release") + require.ErrorContains(t, err, "removed pending release") require.Nil(t, objs) require.Empty(t, state) })