Skip to content

Commit 435bfa5

Browse files
committed
OCPBUGS-42150: (fix) registry pods do not come up again after node failure (#3366)
[PR 3201](operator-framework/operator-lifecycle-manager#3201) attempted to solve the issue by deleting the pods stuck in `Terminating` due to an unreachable node. However, the logic to do that was included in `EnsureRegistryServer`, which only gets executed if polling is requested by the user. This PR moves the logic of checking for dead pods out of `EnsureRegistryServer` and puts it in `CheckRegistryServer` instead. This way, if any dead pods are detected during `CheckRegistryServer`, the value of `healthy` is returned as `false`, which in turn triggers `EnsureRegistryServer`. Upstream-repository: operator-lifecycle-manager Upstream-commit: f2431893193e7112f78298ad7682ff3e1b179d8c
1 parent cf12580 commit 435bfa5

File tree

6 files changed

+241
-54
lines changed

6 files changed

+241
-54
lines changed

Diff for: staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap.go

+66-16
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,18 @@ package reconciler
33

44
import (
55
"context"
6+
"errors"
67
"fmt"
78

8-
"github.com/pkg/errors"
9+
pkgerrors "github.com/pkg/errors"
910
"github.com/sirupsen/logrus"
1011
corev1 "k8s.io/api/core/v1"
1112
rbacv1 "k8s.io/api/rbac/v1"
1213
apierrors "k8s.io/apimachinery/pkg/api/errors"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/labels"
1516
"k8s.io/apimachinery/pkg/util/intstr"
17+
"k8s.io/utils/ptr"
1618

1719
"github.com/operator-framework/api/pkg/operators/v1alpha1"
1820
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -284,19 +286,19 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(catalogSource *v1alph
284286

285287
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
286288
if err := c.ensureServiceAccount(source, overwrite); err != nil {
287-
return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
289+
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
288290
}
289291
if err := c.ensureRole(source, overwrite); err != nil {
290-
return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
292+
return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
291293
}
292294
if err := c.ensureRoleBinding(source, overwrite); err != nil {
293-
return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
295+
return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
294296
}
295297
if err := c.ensurePod(source, overwritePod); err != nil {
296-
return errors.Wrapf(err, "error ensuring pod: %s", source.Pod(image).GetName())
298+
return pkgerrors.Wrapf(err, "error ensuring pod: %s", source.Pod(image).GetName())
297299
}
298300
if err := c.ensureService(source, overwrite); err != nil {
299-
return errors.Wrapf(err, "error ensuring service: %s", source.Service().GetName())
301+
return pkgerrors.Wrapf(err, "error ensuring service: %s", source.Service().GetName())
300302
}
301303

302304
if overwritePod {
@@ -363,15 +365,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
363365
}
364366
for _, p := range currentPods {
365367
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
366-
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
368+
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
367369
}
368370
}
369371
}
370372
_, err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
371373
if err == nil {
372374
return nil
373375
}
374-
return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
376+
return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
375377
}
376378

377379
func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -390,16 +392,15 @@ func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourc
390392
}
391393

392394
// CheckRegistryServer returns true if the given CatalogSource is considered healthy; false otherwise.
393-
func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (healthy bool, err error) {
395+
func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (bool, error) {
394396
source := configMapCatalogSourceDecorator{catalogSource, c.createPodAsUser}
395-
396397
image := c.Image
397398
if source.Spec.SourceType == "grpc" {
398399
image = source.Spec.Image
399400
}
400401
if image == "" {
401-
err = fmt.Errorf("no image for registry")
402-
return
402+
err := fmt.Errorf("no image for registry")
403+
return false, err
403404
}
404405

405406
if source.Spec.SourceType == v1alpha1.SourceTypeConfigmap || source.Spec.SourceType == v1alpha1.SourceTypeInternal {
@@ -426,10 +427,59 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha
426427
c.currentRoleBinding(source) == nil ||
427428
c.currentService(source) == nil ||
428429
len(c.currentPods(source, c.Image)) < 1 {
429-
healthy = false
430-
return
430+
431+
return false, nil
431432
}
432433

433-
healthy = true
434-
return
434+
podsAreLive, e := detectAndDeleteDeadPods(c.OpClient, c.currentPods(source, c.Image), source.GetNamespace())
435+
if e != nil {
436+
return false, fmt.Errorf("error deleting dead pods: %v", e)
437+
}
438+
return podsAreLive, nil
439+
}
440+
441+
// detectAndDeleteDeadPods determines if there are registry client pods that are in the deleted state
442+
// but have not been removed by GC (eg the node goes down before GC can remove them), and attempts to
443+
// force delete the pods. If there are live registry pods remaining, it returns true, otherwise returns false.
444+
func detectAndDeleteDeadPods(client operatorclient.ClientInterface, pods []*corev1.Pod, sourceNamespace string) (bool, error) {
445+
var forceDeletionErrs []error
446+
livePodFound := false
447+
for _, pod := range pods {
448+
if !isPodDead(pod) {
449+
livePodFound = true
450+
continue
451+
}
452+
if err := client.KubernetesInterface().CoreV1().Pods(sourceNamespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
453+
GracePeriodSeconds: ptr.To[int64](0),
454+
}); err != nil && !apierrors.IsNotFound(err) {
455+
forceDeletionErrs = append(forceDeletionErrs, err)
456+
}
457+
}
458+
if len(forceDeletionErrs) > 0 {
459+
return false, errors.Join(forceDeletionErrs...)
460+
}
461+
return livePodFound, nil
462+
}
463+
464+
func isPodDead(pod *corev1.Pod) bool {
465+
for _, check := range []func(*corev1.Pod) bool{
466+
isPodDeletedByTaintManager,
467+
} {
468+
if check(pod) {
469+
return true
470+
}
471+
}
472+
return false
473+
}
474+
475+
func isPodDeletedByTaintManager(pod *corev1.Pod) bool {
476+
if pod.DeletionTimestamp == nil {
477+
return false
478+
}
479+
for _, condition := range pod.Status.Conditions {
480+
if condition.Type == corev1.DisruptionTarget && condition.Reason == "DeletionByTaintManager" && condition.Status == corev1.ConditionTrue {
481+
return true
482+
}
483+
}
484+
return false
435485
}

Diff for: staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap_test.go

+52
Original file line numberDiff line numberDiff line change
@@ -487,3 +487,55 @@ func TestConfigMapRegistryReconciler(t *testing.T) {
487487
})
488488
}
489489
}
490+
491+
func TestConfigMapRegistryChecker(t *testing.T) {
492+
validConfigMap := validConfigMap()
493+
validCatalogSource := validConfigMapCatalogSource(validConfigMap)
494+
type cluster struct {
495+
k8sObjs []runtime.Object
496+
}
497+
type in struct {
498+
cluster cluster
499+
catsrc *v1alpha1.CatalogSource
500+
}
501+
type out struct {
502+
healthy bool
503+
err error
504+
}
505+
tests := []struct {
506+
testName string
507+
in in
508+
out out
509+
}{
510+
{
511+
testName: "ConfigMap/ExistingRegistry/DeadPod",
512+
in: in{
513+
cluster: cluster{
514+
k8sObjs: append(withPodDeletedButNotRemoved(objectsForCatalogSource(validCatalogSource)), validConfigMap),
515+
},
516+
catsrc: validCatalogSource,
517+
},
518+
out: out{
519+
healthy: false,
520+
},
521+
},
522+
}
523+
for _, tt := range tests {
524+
t.Run(tt.testName, func(t *testing.T) {
525+
stopc := make(chan struct{})
526+
defer close(stopc)
527+
528+
factory, _ := fakeReconcilerFactory(t, stopc, withK8sObjs(tt.in.cluster.k8sObjs...))
529+
rec := factory.ReconcilerForSource(tt.in.catsrc)
530+
531+
healthy, err := rec.CheckRegistryServer(tt.in.catsrc)
532+
533+
require.Equal(t, tt.out.err, err)
534+
if tt.out.err != nil {
535+
return
536+
}
537+
538+
require.Equal(t, tt.out.healthy, healthy)
539+
})
540+
}
541+
}

Diff for: staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc.go

+14-11
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"time"
88

99
"github.com/pkg/errors"
10+
1011
"github.com/sirupsen/logrus"
1112
corev1 "k8s.io/api/core/v1"
1213
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -256,13 +257,13 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) bool {
256257
}
257258

258259
func (c *GrpcRegistryReconciler) ensurePod(source grpcCatalogSourceDecorator, saName string, overwrite bool) error {
259-
// currentLivePods refers to the currently live instances of the catalog source
260-
currentLivePods := c.currentPods(source)
261-
if len(currentLivePods) > 0 {
260+
// currentPods refers to the current pod instances of the catalog source
261+
currentPods := c.currentPods(source)
262+
if len(currentPods) > 0 {
262263
if !overwrite {
263264
return nil
264265
}
265-
for _, p := range currentLivePods {
266+
for _, p := range currentPods {
266267
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
267268
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
268269
}
@@ -448,18 +449,20 @@ func (c *GrpcRegistryReconciler) removePods(pods []*corev1.Pod, namespace string
448449
}
449450

450451
// CheckRegistryServer returns true if the given CatalogSource is considered healthy; false otherwise.
451-
func (c *GrpcRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (healthy bool, err error) {
452+
func (c *GrpcRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (bool, error) {
452453
source := grpcCatalogSourceDecorator{catalogSource, c.createPodAsUser}
453454
// Check on registry resources
454455
// TODO: add gRPC health check
455-
if len(c.currentPodsWithCorrectImageAndSpec(source, source.ServiceAccount().GetName())) < 1 ||
456+
currentPods := c.currentPodsWithCorrectImageAndSpec(source, c.currentServiceAccount(source).Name)
457+
if len(currentPods) < 1 ||
456458
c.currentService(source) == nil || c.currentServiceAccount(source) == nil {
457-
healthy = false
458-
return
459+
return false, nil
459460
}
460-
461-
healthy = true
462-
return
461+
podsAreLive, e := detectAndDeleteDeadPods(c.OpClient, currentPods, source.GetNamespace())
462+
if e != nil {
463+
return false, fmt.Errorf("error deleting dead pods: %v", e)
464+
}
465+
return podsAreLive, nil
463466
}
464467

465468
// promoteCatalog swaps the labels on the update pod so that the update pod is now reachable by the catalog service.

Diff for: staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc_test.go

+29
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,23 @@ func grpcCatalogSourceWithAnnotations(annotations map[string]string) *v1alpha1.C
6161
return catsrc
6262
}
6363

64+
func withPodDeletedButNotRemoved(objs []runtime.Object) []runtime.Object {
65+
var out []runtime.Object
66+
for _, obj := range objs {
67+
o := obj.DeepCopyObject()
68+
if pod, ok := obj.(*corev1.Pod); ok {
69+
pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
70+
pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
71+
Type: corev1.DisruptionTarget,
72+
Reason: "DeletionByTaintManager",
73+
Status: corev1.ConditionTrue,
74+
})
75+
o = pod
76+
}
77+
out = append(out, o)
78+
}
79+
return out
80+
}
6481
func TestGrpcRegistryReconciler(t *testing.T) {
6582
now := func() metav1.Time { return metav1.Date(2018, time.January, 26, 20, 40, 0, 0, time.UTC) }
6683
blockOwnerDeletion := true
@@ -508,6 +525,18 @@ func TestGrpcRegistryChecker(t *testing.T) {
508525
healthy: false,
509526
},
510527
},
528+
{
529+
testName: "Grpc/ExistingRegistry/Image/DeadPod",
530+
in: in{
531+
cluster: cluster{
532+
k8sObjs: withPodDeletedButNotRemoved(objectsForCatalogSource(validGrpcCatalogSource("test-img", ""))),
533+
},
534+
catsrc: validGrpcCatalogSource("test-img", ""),
535+
},
536+
out: out{
537+
healthy: false,
538+
},
539+
},
511540
{
512541
testName: "Grpc/ExistingRegistry/Image/OldPod/NotHealthy",
513542
in: in{

0 commit comments

Comments
 (0)