Commit f243189
(fix) registry pods do not come up again after node failure (#3366)
[PR 3201](#3201) attempted to solve the issue by deleting pods stuck in `Terminating` because their node had become unreachable. However, that logic lived in `EnsureRegistryServer`, which only executes when the user has requested polling. This PR moves the dead-pod check out of `EnsureRegistryServer` and into `CheckRegistryServer` instead. This way, if any dead pods are detected during `CheckRegistryServer`, `healthy` is returned as `false`, which in turn triggers `EnsureRegistryServer`.
1 parent cd1364f commit f243189
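
The fix relies on how the catalog operator drives these two reconciler methods: the sync loop first asks CheckRegistryServer whether the source is healthy, and only falls through to EnsureRegistryServer when it is not. Below is a minimal sketch of that flow, assuming the exported RegistryReconciler interface; syncRegistryServer is a hypothetical caller name, not the exact OLM call site (the real wiring lives in the catalog operator, outside this diff).

package sketch

import (
	"github.com/operator-framework/api/pkg/operators/v1alpha1"
	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/reconciler"
	"github.com/sirupsen/logrus"
)

// syncRegistryServer is an illustrative name, not the actual OLM call site.
func syncRegistryServer(logger *logrus.Entry, rec reconciler.RegistryReconciler, catsrc *v1alpha1.CatalogSource) error {
	// After this commit, CheckRegistryServer force-deletes pods stuck in
	// Terminating and reports healthy=false when no live pod remains...
	healthy, err := rec.CheckRegistryServer(logger, catsrc)
	if err != nil {
		return err
	}
	if !healthy {
		// ...which triggers EnsureRegistryServer to recreate the registry
		// pod. Before this commit, the dead-pod cleanup lived only inside
		// EnsureRegistryServer, which runs only when polling is requested.
		return rec.EnsureRegistryServer(logger, catsrc)
	}
	return nil
}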

File tree

4 files changed: +128 -36 lines changed

pkg/controller/registry/reconciler/configmap.go (+40 -10)
pkg/controller/registry/reconciler/configmap_test.go (+52)
pkg/controller/registry/reconciler/grpc.go (+7 -26)
pkg/controller/registry/reconciler/grpc_test.go (+29)

pkg/controller/registry/reconciler/configmap.go (+40 -10)

@@ -3,18 +3,20 @@ package reconciler
 
 import (
 	"context"
+	"errors"
 	"fmt"
 
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
 	hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
-	"github.com/pkg/errors"
+	pkgerrors "github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
 
 	"github.com/operator-framework/api/pkg/operators/v1alpha1"
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -327,27 +329,27 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry,
 
 	//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
 	if err := c.ensureServiceAccount(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
+		return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
 	}
 	if err := c.ensureRole(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
+		return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
 	}
 	if err := c.ensureRoleBinding(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
+		return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
 	}
 	pod, err := source.Pod(image, defaultPodSecurityConfig)
 	if err != nil {
 		return err
 	}
 	if err := c.ensurePod(source, defaultPodSecurityConfig, overwritePod); err != nil {
-		return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
+		return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
 	}
 	service, err := source.Service()
 	if err != nil {
 		return err
 	}
 	if err := c.ensureService(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
+		return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
 	}
 
 	if overwritePod {
@@ -420,15 +422,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
 		}
 		for _, p := range currentPods {
 			if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
-				return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
+				return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
			}
		}
	}
	_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
	if err == nil {
		return nil
	}
-	return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
+	return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
 }
 
 func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -512,6 +514,34 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(logger *logrus.Entry,
 		return
 	}
 
-	healthy = true
-	return
+	podsAreLive, e := detectAndDeleteDeadPods(logger, c.OpClient, pods, source.GetNamespace())
+	if e != nil {
+		return false, fmt.Errorf("error deleting dead pods: %v", e)
+	}
+	return podsAreLive, nil
+}
+
+// detectAndDeleteDeadPods determines if there are registry client pods that are in the deleted state
+// but have not been removed by GC (eg the node goes down before GC can remove them), and attempts to
+// force delete the pods. If there are live registry pods remaining, it returns true, otherwise returns false.
+func detectAndDeleteDeadPods(logger *logrus.Entry, client operatorclient.ClientInterface, pods []*corev1.Pod, sourceNamespace string) (bool, error) {
+	var forceDeletionErrs []error
+	livePodFound := false
+	for _, pod := range pods {
+		if !isPodDead(pod) {
+			livePodFound = true
+			logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Debug("pod is alive")
+			continue
+		}
+		logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Info("force deleting dead pod")
+		if err := client.KubernetesInterface().CoreV1().Pods(sourceNamespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
+			GracePeriodSeconds: ptr.To[int64](0),
+		}); err != nil && !apierrors.IsNotFound(err) {
+			forceDeletionErrs = append(forceDeletionErrs, err)
+		}
+	}
+	if len(forceDeletionErrs) > 0 {
+		return false, errors.Join(forceDeletionErrs...)
+	}
+	return livePodFound, nil
 }
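
The new helper delegates the liveness decision to isPodDead, which predates this commit and is not shown in the diff. Judging from the conditions the test helper withPodDeletedButNotRemoved fabricates (see grpc_test.go below), a plausible sketch of that check:

package reconciler

import corev1 "k8s.io/api/core/v1"

// isPodDead (sketch): a pod counts as dead when it has been marked for
// deletion but cannot finish terminating, e.g. because its node went
// unreachable. The taint manager records this by adding a DisruptionTarget
// condition with reason DeletionByTaintManager, which is exactly what the
// test helper below fakes.
func isPodDead(pod *corev1.Pod) bool {
	if pod.DeletionTimestamp == nil {
		return false
	}
	for _, cond := range pod.Status.Conditions {
		if cond.Type == corev1.DisruptionTarget &&
			cond.Status == corev1.ConditionTrue &&
			cond.Reason == "DeletionByTaintManager" {
			return true
		}
	}
	return false
}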

pkg/controller/registry/reconciler/configmap_test.go (+52)

@@ -527,3 +527,55 @@ func TestConfigMapRegistryReconciler(t *testing.T) {
 		})
 	}
 }
+
+func TestConfigMapRegistryChecker(t *testing.T) {
+	validConfigMap := validConfigMap()
+	validCatalogSource := validConfigMapCatalogSource(validConfigMap)
+	type cluster struct {
+		k8sObjs []runtime.Object
+	}
+	type in struct {
+		cluster cluster
+		catsrc  *v1alpha1.CatalogSource
+	}
+	type out struct {
+		healthy bool
+		err     error
+	}
+	tests := []struct {
+		testName string
+		in       in
+		out      out
+	}{
+		{
+			testName: "ConfigMap/ExistingRegistry/DeadPod",
+			in: in{
+				cluster: cluster{
+					k8sObjs: append(withPodDeletedButNotRemoved(objectsForCatalogSource(t, validCatalogSource)), validConfigMap),
+				},
+				catsrc: validCatalogSource,
+			},
+			out: out{
+				healthy: false,
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.testName, func(t *testing.T) {
+			stopc := make(chan struct{})
+			defer close(stopc)
+
+			factory, _ := fakeReconcilerFactory(t, stopc, withK8sObjs(tt.in.cluster.k8sObjs...))
+			rec := factory.ReconcilerForSource(tt.in.catsrc)
+
+			healthy, err := rec.CheckRegistryServer(logrus.NewEntry(logrus.New()), tt.in.catsrc)
+
+			require.Equal(t, tt.out.err, err)
+			if tt.out.err != nil {
+				return
+			}
+
+			require.Equal(t, tt.out.healthy, healthy)
+		})
+	}
+}
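
The new checker test reuses the table-driven shape of the existing TestGrpcRegistryChecker below; it can be run in isolation with the standard invocation go test -run TestConfigMapRegistryChecker ./pkg/controller/registry/reconciler/.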

pkg/controller/registry/reconciler/grpc.go (+7 -26)

@@ -2,9 +2,7 @@ package reconciler
 
 import (
 	"context"
-	"errors"
 	"fmt"
-	"slices"
 	"strings"
 	"time"
 
@@ -24,7 +22,6 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/intstr"
-	"k8s.io/utils/ptr"
 )
 
 const (
@@ -348,25 +345,6 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
 func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCatalogSourceDecorator, serviceAccount *corev1.ServiceAccount, defaultPodSecurityConfig v1alpha1.SecurityConfig, overwrite bool) error {
 	// currentPods refers to the current pod instances of the catalog source
 	currentPods := c.currentPods(logger, source)
-
-	var forceDeleteErrs []error
-	currentPods = slices.DeleteFunc(currentPods, func(pod *corev1.Pod) bool {
-		if !isPodDead(pod) {
-			logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Debug("pod is alive")
-			return false
-		}
-		logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
-		if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
-			GracePeriodSeconds: ptr.To[int64](0),
-		}); err != nil && !apierrors.IsNotFound(err) {
-			forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
-		}
-		return true
-	})
-	if len(forceDeleteErrs) > 0 {
-		return errors.Join(forceDeleteErrs...)
-	}
-
 	if len(currentPods) > 0 {
 		if !overwrite {
 			return nil
@@ -628,16 +606,19 @@ func (c *GrpcRegistryReconciler) CheckRegistryServer(logger *logrus.Entry, catal
 	if err != nil {
 		return false, err
 	}
-	current, err := c.currentPodsWithCorrectImageAndSpec(logger, source, serviceAccount, registryPodSecurityConfig)
+	currentPods, err := c.currentPodsWithCorrectImageAndSpec(logger, source, serviceAccount, registryPodSecurityConfig)
 	if err != nil {
 		return false, err
 	}
-	if len(current) < 1 ||
+	if len(currentPods) < 1 ||
 		service == nil || c.currentServiceAccount(source) == nil {
 		return false, nil
 	}
-
-	return true, nil
+	podsAreLive, e := detectAndDeleteDeadPods(logger, c.OpClient, currentPods, source.GetNamespace())
+	if e != nil {
+		return false, fmt.Errorf("error deleting dead pods: %v", e)
+	}
+	return podsAreLive, nil
 }
 
 // promoteCatalog swaps the labels on the update pod so that the update pod is now reachable by the catalog service.
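
For reference, the zero-grace-period delete that detectAndDeleteDeadPods issues is the programmatic counterpart of force-deleting a stuck pod by hand. Below is a standalone client-go sketch, assuming a kubeconfig at the default location; the namespace ("olm") and pod name are placeholders, not values from this commit.

package main

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/utils/ptr"
)

func main() {
	// Load the default kubeconfig (~/.kube/config); a controller would use
	// in-cluster config instead.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// GracePeriodSeconds: 0 removes the API object immediately instead of
	// waiting for the (unreachable) kubelet to confirm termination -- the
	// same effect as `kubectl delete pod --grace-period=0 --force`.
	if err := client.CoreV1().Pods("olm").Delete(context.TODO(), "example-registry-pod", metav1.DeleteOptions{
		GracePeriodSeconds: ptr.To[int64](0),
	}); err != nil {
		panic(err)
	}
}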

pkg/controller/registry/reconciler/grpc_test.go (+29)

@@ -70,6 +70,23 @@ func grpcCatalogSourceWithName(name string) *v1alpha1.CatalogSource {
 	return catsrc
 }
 
+func withPodDeletedButNotRemoved(objs []runtime.Object) []runtime.Object {
+	var out []runtime.Object
+	for _, obj := range objs {
+		o := obj.DeepCopyObject()
+		if pod, ok := obj.(*corev1.Pod); ok {
+			pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
+			pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
+				Type:   corev1.DisruptionTarget,
+				Reason: "DeletionByTaintManager",
+				Status: corev1.ConditionTrue,
+			})
+			o = pod
+		}
+		out = append(out, o)
+	}
+	return out
+}
 func TestGrpcRegistryReconciler(t *testing.T) {
 	now := func() metav1.Time { return metav1.Date(2018, time.January, 26, 20, 40, 0, 0, time.UTC) }
 	blockOwnerDeletion := true
@@ -558,6 +575,18 @@ func TestGrpcRegistryChecker(t *testing.T) {
 				healthy: false,
 			},
 		},
+		{
+			testName: "Grpc/ExistingRegistry/Image/DeadPod",
+			in: in{
+				cluster: cluster{
+					k8sObjs: withPodDeletedButNotRemoved(objectsForCatalogSource(t, validGrpcCatalogSource("test-img", ""))),
+				},
+				catsrc: validGrpcCatalogSource("test-img", ""),
+			},
+			out: out{
+				healthy: false,
+			},
+		},
 		{
 			testName: "Grpc/ExistingRegistry/Image/OldPod/NotHealthy",
 			in: in{
