Skip to content

Commit c8472ac

Browse files
committed
(fix) registry pods do not come up again after node failure
[PR 3201](operator-framework#3201) attempted to solve for the issue by deleting the pods stuck in Terminating due to unreachable node. However, the logic to do that was included in `EnsureRegistryServer`, which only gets executed in polling in requested by the user. This PR fixes the issue by modifying the `RegistryReconciler` interface with the introduction of a new component for the interface: `RegistryCleaner`. This promotes the job of cleaning up the pods that are stuck (and any other resources that may need to be cleaned) to a first class status. The `RegistryCleaner` is then called as the first step in the Catalog Operator registry reconclier, so that the pods stuck are cleaned up before the rest of the reconciler logic is executed. The PR provides implementation of `RegistryCleaner` for the `GrpcReconciler`, `ConfigMapReconciler` and the `GrpcAddressRegistryReconciler` implementations of `RegistryReconciler` interface.
1 parent 3a8bc57 commit c8472ac

File tree

6 files changed

+158
-32
lines changed

6 files changed

+158
-32
lines changed

pkg/controller/operators/catalog/operator.go

+6
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,12 @@ func (o *Operator) syncRegistryServer(logger *logrus.Entry, in *v1alpha1.Catalog
992992
out.SetError(v1alpha1.CatalogSourceRegistryServerError, syncError)
993993
return
994994
}
995+
err := srcReconciler.CleanRegistryServer(logger, in)
996+
if err != nil {
997+
syncError = fmt.Errorf("could not clean dead resources: %s", err)
998+
out.SetError(v1alpha1.CatalogSourceRegistryServerError, syncError)
999+
return
1000+
}
9951001

9961002
healthy, err := srcReconciler.CheckRegistryServer(logger, in)
9971003
if err != nil {

pkg/controller/registry/reconciler/configmap.go

+39-8
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,20 @@ package reconciler
33

44
import (
55
"context"
6+
"errors"
67
"fmt"
78

89
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
910
hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
10-
"github.com/pkg/errors"
11+
pkgerrors "github.com/pkg/errors"
1112
"github.com/sirupsen/logrus"
1213
corev1 "k8s.io/api/core/v1"
1314
rbacv1 "k8s.io/api/rbac/v1"
1415
apierrors "k8s.io/apimachinery/pkg/api/errors"
1516
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1617
"k8s.io/apimachinery/pkg/labels"
1718
"k8s.io/apimachinery/pkg/util/intstr"
19+
"k8s.io/utils/ptr"
1820

1921
"github.com/operator-framework/api/pkg/operators/v1alpha1"
2022
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -192,6 +194,7 @@ type ConfigMapRegistryReconciler struct {
192194

193195
var _ RegistryEnsurer = &ConfigMapRegistryReconciler{}
194196
var _ RegistryChecker = &ConfigMapRegistryReconciler{}
197+
var _ RegistryCleaner = &ConfigMapRegistryReconciler{}
195198
var _ RegistryReconciler = &ConfigMapRegistryReconciler{}
196199

197200
func (c *ConfigMapRegistryReconciler) currentService(source configMapCatalogSourceDecorator) (*corev1.Service, error) {
@@ -327,27 +330,27 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry,
327330

328331
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
329332
if err := c.ensureServiceAccount(source, overwrite); err != nil {
330-
return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
333+
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
331334
}
332335
if err := c.ensureRole(source, overwrite); err != nil {
333-
return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
336+
return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
334337
}
335338
if err := c.ensureRoleBinding(source, overwrite); err != nil {
336-
return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
339+
return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
337340
}
338341
pod, err := source.Pod(image, defaultPodSecurityConfig)
339342
if err != nil {
340343
return err
341344
}
342345
if err := c.ensurePod(source, defaultPodSecurityConfig, overwritePod); err != nil {
343-
return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
346+
return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
344347
}
345348
service, err := source.Service()
346349
if err != nil {
347350
return err
348351
}
349352
if err := c.ensureService(source, overwrite); err != nil {
350-
return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
353+
return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
351354
}
352355

353356
if overwritePod {
@@ -420,15 +423,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
420423
}
421424
for _, p := range currentPods {
422425
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
423-
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
426+
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
424427
}
425428
}
426429
}
427430
_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
428431
if err == nil {
429432
return nil
430433
}
431-
return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
434+
return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
432435
}
433436

434437
func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -515,3 +518,31 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(logger *logrus.Entry,
515518
healthy = true
516519
return
517520
}
521+
522+
// CleanRegistryServer attempts to force delete registry client pods that are stale.
523+
func (c *ConfigMapRegistryReconciler) CleanRegistryServer(logger *logrus.Entry, catalogSource *v1alpha1.CatalogSource) error {
524+
source := configMapCatalogSourceDecorator{catalogSource, c.createPodAsUser}
525+
defaultPodSecurityConfig, err := getDefaultPodContextConfig(c.OpClient, catalogSource.GetNamespace())
526+
if err != nil {
527+
return err
528+
}
529+
currentPods, err := c.currentPods(source, c.Image, defaultPodSecurityConfig)
530+
if err != nil {
531+
return err
532+
}
533+
var forceDeleteErrs []error
534+
for _, pod := range currentPods {
535+
if isPodDead(pod) {
536+
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
537+
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
538+
GracePeriodSeconds: ptr.To[int64](0),
539+
}); err != nil && !apierrors.IsNotFound(err) {
540+
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
541+
}
542+
}
543+
}
544+
if len(forceDeleteErrs) > 0 {
545+
return errors.Join(forceDeleteErrs...)
546+
}
547+
return nil
548+
}

pkg/controller/registry/reconciler/grpc.go

+25-24
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"errors"
66
"fmt"
7-
"slices"
87
"strings"
98
"time"
109

@@ -346,32 +345,13 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
346345
}
347346

348347
func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCatalogSourceDecorator, serviceAccount *corev1.ServiceAccount, defaultPodSecurityConfig v1alpha1.SecurityConfig, overwrite bool) error {
349-
// currentPods refers to the current pod instances of the catalog source
350-
currentPods := c.currentPods(logger, source)
351-
352-
var forceDeleteErrs []error
353-
currentPods = slices.DeleteFunc(currentPods, func(pod *corev1.Pod) bool {
354-
if !isPodDead(pod) {
355-
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Debug("pod is alive")
356-
return false
357-
}
358-
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
359-
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
360-
GracePeriodSeconds: ptr.To[int64](0),
361-
}); err != nil && !apierrors.IsNotFound(err) {
362-
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
363-
}
364-
return true
365-
})
366-
if len(forceDeleteErrs) > 0 {
367-
return errors.Join(forceDeleteErrs...)
368-
}
369-
370-
if len(currentPods) > 0 {
348+
// currentLivePods refers to the currently live instances of the catalog source
349+
currentLivePods := c.currentPods(logger, source)
350+
if len(currentLivePods) > 0 {
371351
if !overwrite {
372352
return nil
373353
}
374-
for _, p := range currentPods {
354+
for _, p := range currentLivePods {
375355
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": p.GetName()}).Info("deleting current pod")
376356
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
377357
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
@@ -711,3 +691,24 @@ func podHashMatch(existing, new *corev1.Pod) bool {
711691

712692
return true
713693
}
694+
695+
// CleanRegistryServer attempts to force delete registry client pods that are stale.
696+
func (c *GrpcRegistryReconciler) CleanRegistryServer(logger *logrus.Entry, catalogSource *v1alpha1.CatalogSource) error {
697+
source := grpcCatalogSourceDecorator{CatalogSource: catalogSource, createPodAsUser: c.createPodAsUser, opmImage: c.opmImage, utilImage: c.utilImage}
698+
currentPods := c.currentPods(logger, source)
699+
var forceDeleteErrs []error
700+
for _, pod := range currentPods {
701+
if isPodDead(pod) {
702+
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
703+
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
704+
GracePeriodSeconds: ptr.To[int64](0),
705+
}); err != nil && !apierrors.IsNotFound(err) {
706+
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
707+
}
708+
}
709+
}
710+
if len(forceDeleteErrs) > 0 {
711+
return errors.Join(forceDeleteErrs...)
712+
}
713+
return nil
714+
}

pkg/controller/registry/reconciler/grpc_address.go

+5
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ type GrpcAddressRegistryReconciler struct {
1111

1212
var _ RegistryEnsurer = &GrpcAddressRegistryReconciler{}
1313
var _ RegistryChecker = &GrpcAddressRegistryReconciler{}
14+
var _ RegistryCleaner = &GrpcAddressRegistryReconciler{}
1415
var _ RegistryReconciler = &GrpcAddressRegistryReconciler{}
1516

1617
// EnsureRegistryServer ensures a registry server exists for the given CatalogSource.
@@ -30,3 +31,7 @@ func (g *GrpcAddressRegistryReconciler) CheckRegistryServer(logger *logrus.Entry
3031
healthy = true
3132
return
3233
}
34+
35+
func (g *GrpcAddressRegistryReconciler) CleanRegistryServer(logger *logrus.Entry, catalogSource *v1alpha1.CatalogSource) error {
36+
return nil
37+
}

pkg/controller/registry/reconciler/reconciler.go

+7
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,17 @@ type RegistryChecker interface {
4646
CheckRegistryServer(logger *logrus.Entry, catalogSource *operatorsv1alpha1.CatalogSource) (healthy bool, err error)
4747
}
4848

49+
// RegistryCleaner force deletes stale resources.
50+
type RegistryCleaner interface {
51+
// CleanRegistryServer attempts to force delete registry clients that are stale.
52+
CleanRegistryServer(logger *logrus.Entry, catalogSource *operatorsv1alpha1.CatalogSource) error
53+
}
54+
4955
// RegistryReconciler knows how to reconcile a registry.
5056
type RegistryReconciler interface {
5157
RegistryChecker
5258
RegistryEnsurer
59+
RegistryCleaner
5360
}
5461

5562
// RegistryReconcilerFactory describes factory methods for RegistryReconcilers.

pkg/fakes/fake_reconciler.go

+76
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)