Skip to content

🐛 Handle interrupted helm releases in applier #1776

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ require (
github.com/opencontainers/go-digest v1.0.0
github.com/opencontainers/image-spec v1.1.1
github.com/operator-framework/api v0.30.0
github.com/operator-framework/helm-operator-plugins v0.8.0
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
github.com/operator-framework/operator-registry v1.50.0
github.com/prometheus/client_golang v1.21.1
github.com/sirupsen/logrus v1.9.3
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
github.com/operator-framework/api v0.30.0 h1:44hCmGnEnZk/Miol5o44dhSldNH0EToQUG7vZTl29kk=
github.com/operator-framework/api v0.30.0/go.mod h1:FYxAPhjtlXSAty/fbn5YJnFagt6SpJZJgFNNbvDe5W0=
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4=
Expand Down
6 changes: 6 additions & 0 deletions internal/operator-controller/action/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
return args.Error(0)
}

func (m *mockActionClient) Config() *action.Configuration {
args := m.Called()
return args.Get(0).(*action.Configuration)
}

var _ actionclient.ActionClientGetter = &mockActionClientGetter{}

type mockActionClientGetter struct {
Expand Down
35 changes: 32 additions & 3 deletions internal/operator-controller/applier/helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ const (
maxHelmReleaseHistory = 10
)

var (
errPendingRelease = errors.New("release is in a pending state")
errBuildHelmChartFuncNil = errors.New("BundleToHelmChartFn is nil")
)

// Preflight is a check that should be run before making any changes to the cluster
type Preflight interface {
// Install runs checks that should be successful prior
Expand Down Expand Up @@ -95,7 +100,24 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
labels: objectLabels,
}

rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
// if a release is pending, that means that a helm action
// (installation/upgrade) we were attempting was likely interrupted in-flight.
// Pending release would leave us in reconciliation error loop because helm
// wouldn't be able to progress automatically from it.
//
// one of the workarounds is to try and remove helm metadata relating to
// that pending release which should 'reset' its state communicated to helm
// and the next reconciliation should be able to successfully pick up from here
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
if errors.Is(err, errPendingRelease) {
if _, err := ac.Config().Releases.Delete(rel.Name, rel.Version); err != nil {
return nil, "", fmt.Errorf("failed removing pending release %q version %d metadata: %w", rel.Name, rel.Version, err)
}
// return an error to try to detect proper state (installation/upgrade) at next reconciliation
return nil, "", fmt.Errorf("removed pending release %q version %d metadata", rel.Name, rel.Version)
}
if err != nil {
return nil, "", err
}
Expand Down Expand Up @@ -155,7 +177,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte

func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) {
if h.BundleToHelmChartFn == nil {
return nil, errors.New("BundleToHelmChartFn is nil")
return nil, errBuildHelmChartFuncNil
}
watchNamespace, err := GetWatchNamespace(ext)
if err != nil {
Expand All @@ -164,9 +186,12 @@ func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*char
return h.BundleToHelmChartFn(bundleFS, ext.Spec.Namespace, watchNamespace)
}

func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
logger := log.FromContext(ctx)
currentRelease, err := cl.Get(ext.GetName())

if errors.Is(err, driver.ErrReleaseNotFound) {
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
i.DryRun = true
i.DryRunOption = "server"
Expand All @@ -184,8 +209,12 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
if err != nil {
return nil, nil, StateError, err
}
if currentRelease.Info.Status.IsPending() {
return currentRelease, nil, StateError, errPendingRelease
}

desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
upgrade.MaxHistory = maxHelmReleaseHistory
upgrade.DryRun = true
upgrade.DryRunOption = "server"
Expand Down
81 changes: 81 additions & 0 deletions internal/operator-controller/applier/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"helm.sh/helm/v3/pkg/storage"
"helm.sh/helm/v3/pkg/storage/driver"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -50,6 +51,7 @@ type mockActionGetter struct {
reconcileErr error
desiredRel *release.Release
currentRel *release.Release
config *action.Configuration
}

func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
Expand Down Expand Up @@ -98,6 +100,10 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
return mag.reconcileErr
}

func (mag *mockActionGetter) Config() *action.Configuration {
return mag.config
}

var (
// required for unmockable call to convert.RegistryV1ToHelmChart
validFS = fstest.MapFS{
Expand Down Expand Up @@ -179,6 +185,80 @@ func TestApply_Base(t *testing.T) {
})
}

func TestApply_PendingRelease(t *testing.T) {
t.Run("fails removing a pending install release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
testStorage := storage.Init(driver.NewMemory())

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier := applier.Helm{
ActionClientGetter: mockAcg,
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
}

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing pending release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("fails removing a pending upgrade release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
testStorage := storage.Init(driver.NewMemory())

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier := applier.Helm{
ActionClientGetter: mockAcg,
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
}

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removing pending release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("successfully removes a pending install release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
testStorage := storage.Init(driver.NewMemory())
err := testStorage.Create(testRel)
require.NoError(t, err)

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier := applier.Helm{
ActionClientGetter: mockAcg,
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
}

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removed pending release")
require.Nil(t, objs)
require.Empty(t, state)
})

t.Run("successfully removes an pending upgrade release", func(t *testing.T) {
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
testStorage := storage.Init(driver.NewMemory())
err := testStorage.Create(testRel)
require.NoError(t, err)

mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
helmApplier := applier.Helm{
ActionClientGetter: mockAcg,
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
}

objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
require.Error(t, err)
require.ErrorContains(t, err, "removed pending release")
require.Nil(t, objs)
require.Empty(t, state)
})
}

func TestApply_Installation(t *testing.T) {
t.Run("fails during dry-run installation", func(t *testing.T) {
mockAcg := &mockActionGetter{
Expand Down Expand Up @@ -340,6 +420,7 @@ func TestApply_Upgrade(t *testing.T) {

t.Run("fails during dry-run upgrade", func(t *testing.T) {
mockAcg := &mockActionGetter{
currentRel: testCurrentRelease,
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
}
helmApplier := applier.Helm{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
"helm.sh/helm/v3/pkg/storage/driver"
Expand Down Expand Up @@ -1411,6 +1412,10 @@ func (mag *MockActionGetter) Reconcile(rel *release.Release) error {
return nil
}

func (mag *MockActionGetter) Config() *action.Configuration {
return nil
}

func TestGetInstalledBundleHistory(t *testing.T) {
getter := controllers.DefaultInstalledBundleGetter{}

Expand Down
Loading