Skip to content

Commit fff2ea8

Browse files
committed
Handle interrupted helm releases in applier
Introduces a workaround for 'interrupted' helm releases which enter into a 'pending' (-install/uninstall/rollback) state. If that happens, for example because of immediate application exit with one of those operations being in flight, helm is not able to resolve it automatically which means we end up in a permanent reconcile error state. One of the workarounds for this that has been repeated in the community is to remove metadata of the pending release, which is the solution chosen here. For full context see: #1776 helm/helm#5595
1 parent e4e76d2 commit fff2ea8

File tree

6 files changed

+119
-5
lines changed

6 files changed

+119
-5
lines changed

Diff for: go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ require (
1818
github.com/opencontainers/go-digest v1.0.0
1919
github.com/opencontainers/image-spec v1.1.1
2020
github.com/operator-framework/api v0.30.0
21-
github.com/operator-framework/helm-operator-plugins v0.8.0
21+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b
2222
github.com/operator-framework/operator-registry v1.50.0
2323
github.com/prometheus/client_golang v1.21.1
2424
github.com/sirupsen/logrus v1.9.3

Diff for: go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -536,8 +536,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT
536536
github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o=
537537
github.com/operator-framework/api v0.30.0 h1:44hCmGnEnZk/Miol5o44dhSldNH0EToQUG7vZTl29kk=
538538
github.com/operator-framework/api v0.30.0/go.mod h1:FYxAPhjtlXSAty/fbn5YJnFagt6SpJZJgFNNbvDe5W0=
539-
github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc=
540-
github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA=
539+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU=
540+
github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys=
541541
github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4=
542542
github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw=
543543
github.com/operator-framework/operator-registry v1.50.0 h1:kMAwsKAEDjuSx5dGchMX+CD3SMHWwOAC/xyK3LQweB4=

Diff for: internal/operator-controller/action/helm_test.go

+6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/assert"
1010
"github.com/stretchr/testify/mock"
1111
"github.com/stretchr/testify/require"
12+
"helm.sh/helm/v3/pkg/action"
1213
"helm.sh/helm/v3/pkg/chart"
1314
"helm.sh/helm/v3/pkg/release"
1415
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error {
7071
return args.Error(0)
7172
}
7273

74+
func (m *mockActionClient) Config() *action.Configuration {
75+
args := m.Called()
76+
return args.Get(0).(*action.Configuration)
77+
}
78+
7379
var _ actionclient.ActionClientGetter = &mockActionClientGetter{}
7480

7581
type mockActionClientGetter struct {

Diff for: internal/operator-controller/applier/helm.go

+24-2
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
9595
labels: objectLabels,
9696
}
9797

98-
rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post)
98+
rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post)
9999
if err != nil {
100100
return nil, "", err
101101
}
@@ -164,9 +164,30 @@ func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*char
164164
return h.BundleToHelmChartFn(bundleFS, ext.Spec.Namespace, watchNamespace)
165165
}
166166

167-
func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
167+
func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) {
168+
logger := log.FromContext(ctx)
168169
currentRelease, err := cl.Get(ext.GetName())
170+
171+
// if a release is pending at this point, that means that a helm action
172+
// (installation/upgrade) we were attempting was likely interrupted in-flight.
173+
// Pending release would leave us in reconciliation error loop because helm
174+
// wouldn't be able to progress automatically from it.
175+
//
176+
// one of the workarounds is to try and remove helm metadata relating to
177+
// that pending release which should 'reset' its state communicated to helm
178+
// and the next reconciliation should be able to successfully pick up from here
179+
// for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476
180+
// and the discussion in https://github.com/operator-framework/operator-controller/pull/1776
181+
if err == nil && currentRelease.Info.Status.IsPending() {
182+
if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil {
183+
return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err)
184+
}
185+
// return error to try to detect proper state (installation/upgrade) at next reconciliation
186+
return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version)
187+
}
188+
169189
if errors.Is(err, driver.ErrReleaseNotFound) {
190+
logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName())
170191
desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error {
171192
i.DryRun = true
172193
i.DryRunOption = "server"
@@ -186,6 +207,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE
186207
}
187208

188209
desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error {
210+
logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName())
189211
upgrade.MaxHistory = maxHelmReleaseHistory
190212
upgrade.DryRun = true
191213
upgrade.DryRunOption = "server"

Diff for: internal/operator-controller/applier/helm_test.go

+81
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"helm.sh/helm/v3/pkg/action"
1414
"helm.sh/helm/v3/pkg/chart"
1515
"helm.sh/helm/v3/pkg/release"
16+
"helm.sh/helm/v3/pkg/storage"
1617
"helm.sh/helm/v3/pkg/storage/driver"
1718
corev1 "k8s.io/api/core/v1"
1819
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -50,6 +51,7 @@ type mockActionGetter struct {
5051
reconcileErr error
5152
desiredRel *release.Release
5253
currentRel *release.Release
54+
config *action.Configuration
5355
}
5456

5557
func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) {
@@ -98,6 +100,10 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error {
98100
return mag.reconcileErr
99101
}
100102

103+
func (mag *mockActionGetter) Config() *action.Configuration {
104+
return mag.config
105+
}
106+
101107
var (
102108
// required for unmockable call to convert.RegistryV1ToHelmChart
103109
validFS = fstest.MapFS{
@@ -179,6 +185,80 @@ func TestApply_Base(t *testing.T) {
179185
})
180186
}
181187

188+
func TestApply_InterruptedRelease(t *testing.T) {
189+
t.Run("fails removing an interrupted install release", func(t *testing.T) {
190+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
191+
testStorage := storage.Init(driver.NewMemory())
192+
193+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
194+
helmApplier := applier.Helm{
195+
ActionClientGetter: mockAcg,
196+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
197+
}
198+
199+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
200+
require.Error(t, err)
201+
require.ErrorContains(t, err, "removing interrupted release")
202+
require.Nil(t, objs)
203+
require.Empty(t, state)
204+
})
205+
206+
t.Run("fails removing an interrupted upgrade release", func(t *testing.T) {
207+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
208+
testStorage := storage.Init(driver.NewMemory())
209+
210+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
211+
helmApplier := applier.Helm{
212+
ActionClientGetter: mockAcg,
213+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
214+
}
215+
216+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
217+
require.Error(t, err)
218+
require.ErrorContains(t, err, "removing interrupted release")
219+
require.Nil(t, objs)
220+
require.Empty(t, state)
221+
})
222+
223+
t.Run("successfully removes an interrupted install release", func(t *testing.T) {
224+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}}
225+
testStorage := storage.Init(driver.NewMemory())
226+
err := testStorage.Create(testRel)
227+
require.NoError(t, err)
228+
229+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
230+
helmApplier := applier.Helm{
231+
ActionClientGetter: mockAcg,
232+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
233+
}
234+
235+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
236+
require.Error(t, err)
237+
require.ErrorContains(t, err, "removed interrupted release")
238+
require.Nil(t, objs)
239+
require.Empty(t, state)
240+
})
241+
242+
t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) {
243+
testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}}
244+
testStorage := storage.Init(driver.NewMemory())
245+
err := testStorage.Create(testRel)
246+
require.NoError(t, err)
247+
248+
mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}}
249+
helmApplier := applier.Helm{
250+
ActionClientGetter: mockAcg,
251+
BundleToHelmChartFn: convert.RegistryV1ToHelmChart,
252+
}
253+
254+
objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels)
255+
require.Error(t, err)
256+
require.ErrorContains(t, err, "removed interrupted release")
257+
require.Nil(t, objs)
258+
require.Empty(t, state)
259+
})
260+
}
261+
182262
func TestApply_Installation(t *testing.T) {
183263
t.Run("fails during dry-run installation", func(t *testing.T) {
184264
mockAcg := &mockActionGetter{
@@ -340,6 +420,7 @@ func TestApply_Upgrade(t *testing.T) {
340420

341421
t.Run("fails during dry-run upgrade", func(t *testing.T) {
342422
mockAcg := &mockActionGetter{
423+
currentRel: testCurrentRelease,
343424
dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"),
344425
}
345426
helmApplier := applier.Helm{

Diff for: internal/operator-controller/controllers/clusterextension_controller_test.go

+5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/google/go-cmp/cmp/cmpopts"
1313
"github.com/stretchr/testify/assert"
1414
"github.com/stretchr/testify/require"
15+
"helm.sh/helm/v3/pkg/action"
1516
"helm.sh/helm/v3/pkg/chart"
1617
"helm.sh/helm/v3/pkg/release"
1718
"helm.sh/helm/v3/pkg/storage/driver"
@@ -1411,6 +1412,10 @@ func (mag *MockActionGetter) Reconcile(rel *release.Release) error {
14111412
return nil
14121413
}
14131414

1415+
func (mag *MockActionGetter) Config() *action.Configuration {
1416+
return nil
1417+
}
1418+
14141419
func TestGetInstalledBundleHistory(t *testing.T) {
14151420
getter := controllers.DefaultInstalledBundleGetter{}
14161421

0 commit comments

Comments
 (0)