Skip to content

Commit a1c914c

Browse files
committedSep 18, 2024
OCPBUGS-42150: (fix) registry pods do not come up again after node failure (#3366)
[PR 3201](operator-framework/operator-lifecycle-manager#3201) attempted to solve for the issue by deleting the pods stuck in `Terminating` due to unreachable node. However, the logic to do that was included in `EnsureRegistryServer`, which only gets executed if polling in requested by the user. This PR moves the logic of checking for dead pods out of `EnsureRegistryServer`, and puts it in `CheckRegistryServer` instead. This way, if there are any dead pods detected during `CheckRegistryServer`, the value of `healthy` is returned `false`, which inturn triggers `EnsureRegistryServer`. Upstream-repository: operator-lifecycle-manager Upstream-commit: f2431893193e7112f78298ad7682ff3e1b179d8c

File tree

18 files changed

+615
-288
lines changed

18 files changed

+615
-288
lines changed
 

‎go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ require (
2828
k8s.io/client-go v0.27.8
2929
k8s.io/code-generator v0.27.8
3030
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
31-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
31+
k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3
3232
sigs.k8s.io/controller-runtime v0.15.0
3333
sigs.k8s.io/controller-tools v0.8.0
3434
)

‎go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -1484,8 +1484,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3
14841484
k8s.io/kubectl v0.27.8 h1:VipG0f9E1kGRGJYm2/kNv188RgDduvx1g2q1b20niHg=
14851485
k8s.io/kubectl v0.27.8/go.mod h1:ZufZqfI5V7oBuGFALJHoTxypO0fewOwbadr4saUkRKo=
14861486
k8s.io/utils v0.0.0-20200324210504-a9aa75ae1b89/go.mod h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew=
1487-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 h1:kmDqav+P+/5e1i9tFfHq1qcF3sOrDp+YEkVDAHu7Jwk=
1488-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
1487+
k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3 h1:b2FmK8YH+QEwq/Sy2uAEhmqL5nPfGYbJOcaqjeYYZoA=
1488+
k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
14891489
oras.land/oras-go v1.2.4 h1:djpBY2/2Cs1PV87GSJlxv4voajVOMZxqqtq9AB8YNvY=
14901490
oras.land/oras-go v1.2.4/go.mod h1:DYcGfb3YF1nKjcezfX2SNlDAeQFKSXmf+qrFmrh4324=
14911491
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=

‎staging/operator-lifecycle-manager/go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ require (
5252
k8s.io/klog/v2 v2.90.1
5353
k8s.io/kube-aggregator v0.25.3
5454
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
55-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
55+
k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3
5656
sigs.k8s.io/controller-runtime v0.15.0
5757
sigs.k8s.io/controller-tools v0.8.0
5858
sigs.k8s.io/kind v0.20.0

‎staging/operator-lifecycle-manager/go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -1347,8 +1347,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f h1:2kWPakN3i/k81b0gvD5C5F
13471347
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3ebc/QwanvYwhuMWF6yz2F8uwW8eg=
13481348
k8s.io/kubectl v0.27.7 h1:HTEDa4s/oWjB3t5ysdW1yKlcNl9bzigcqWBq0LIIe3k=
13491349
k8s.io/kubectl v0.27.7/go.mod h1:Xb1Ubc8uN1i2RvSN1HCgSHTtzgX0woihMk/gW7XbjJU=
1350-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 h1:kmDqav+P+/5e1i9tFfHq1qcF3sOrDp+YEkVDAHu7Jwk=
1351-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
1350+
k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3 h1:b2FmK8YH+QEwq/Sy2uAEhmqL5nPfGYbJOcaqjeYYZoA=
1351+
k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
13521352
oras.land/oras-go v1.2.3 h1:v8PJl+gEAntI1pJ/LCrDgsuk+1PKVavVEPsYIHFE5uY=
13531353
oras.land/oras-go v1.2.3/go.mod h1:M/uaPdYklze0Vf3AakfarnpoEckvw0ESbRdN8Z1vdJg=
13541354
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=

‎staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap.go

+66-16
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,18 @@ package reconciler
33

44
import (
55
"context"
6+
"errors"
67
"fmt"
78

8-
"github.com/pkg/errors"
9+
pkgerrors "github.com/pkg/errors"
910
"github.com/sirupsen/logrus"
1011
corev1 "k8s.io/api/core/v1"
1112
rbacv1 "k8s.io/api/rbac/v1"
1213
apierrors "k8s.io/apimachinery/pkg/api/errors"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/labels"
1516
"k8s.io/apimachinery/pkg/util/intstr"
17+
"k8s.io/utils/ptr"
1618

1719
"github.com/operator-framework/api/pkg/operators/v1alpha1"
1820
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -284,19 +286,19 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(catalogSource *v1alph
284286

285287
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
286288
if err := c.ensureServiceAccount(source, overwrite); err != nil {
287-
return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
289+
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
288290
}
289291
if err := c.ensureRole(source, overwrite); err != nil {
290-
return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
292+
return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
291293
}
292294
if err := c.ensureRoleBinding(source, overwrite); err != nil {
293-
return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
295+
return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
294296
}
295297
if err := c.ensurePod(source, overwritePod); err != nil {
296-
return errors.Wrapf(err, "error ensuring pod: %s", source.Pod(image).GetName())
298+
return pkgerrors.Wrapf(err, "error ensuring pod: %s", source.Pod(image).GetName())
297299
}
298300
if err := c.ensureService(source, overwrite); err != nil {
299-
return errors.Wrapf(err, "error ensuring service: %s", source.Service().GetName())
301+
return pkgerrors.Wrapf(err, "error ensuring service: %s", source.Service().GetName())
300302
}
301303

302304
if overwritePod {
@@ -363,15 +365,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
363365
}
364366
for _, p := range currentPods {
365367
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
366-
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
368+
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
367369
}
368370
}
369371
}
370372
_, err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
371373
if err == nil {
372374
return nil
373375
}
374-
return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
376+
return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
375377
}
376378

377379
func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -390,16 +392,15 @@ func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourc
390392
}
391393

392394
// CheckRegistryServer returns true if the given CatalogSource is considered healthy; false otherwise.
393-
func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (healthy bool, err error) {
395+
func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (bool, error) {
394396
source := configMapCatalogSourceDecorator{catalogSource, c.createPodAsUser}
395-
396397
image := c.Image
397398
if source.Spec.SourceType == "grpc" {
398399
image = source.Spec.Image
399400
}
400401
if image == "" {
401-
err = fmt.Errorf("no image for registry")
402-
return
402+
err := fmt.Errorf("no image for registry")
403+
return false, err
403404
}
404405

405406
if source.Spec.SourceType == v1alpha1.SourceTypeConfigmap || source.Spec.SourceType == v1alpha1.SourceTypeInternal {
@@ -426,10 +427,59 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha
426427
c.currentRoleBinding(source) == nil ||
427428
c.currentService(source) == nil ||
428429
len(c.currentPods(source, c.Image)) < 1 {
429-
healthy = false
430-
return
430+
431+
return false, nil
431432
}
432433

433-
healthy = true
434-
return
434+
podsAreLive, e := detectAndDeleteDeadPods(c.OpClient, c.currentPods(source, c.Image), source.GetNamespace())
435+
if e != nil {
436+
return false, fmt.Errorf("error deleting dead pods: %v", e)
437+
}
438+
return podsAreLive, nil
439+
}
440+
441+
// detectAndDeleteDeadPods determines if there are registry client pods that are in the deleted state
442+
// but have not been removed by GC (eg the node goes down before GC can remove them), and attempts to
443+
// force delete the pods. If there are live registry pods remaining, it returns true, otherwise returns false.
444+
func detectAndDeleteDeadPods(client operatorclient.ClientInterface, pods []*corev1.Pod, sourceNamespace string) (bool, error) {
445+
var forceDeletionErrs []error
446+
livePodFound := false
447+
for _, pod := range pods {
448+
if !isPodDead(pod) {
449+
livePodFound = true
450+
continue
451+
}
452+
if err := client.KubernetesInterface().CoreV1().Pods(sourceNamespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
453+
GracePeriodSeconds: ptr.To[int64](0),
454+
}); err != nil && !apierrors.IsNotFound(err) {
455+
forceDeletionErrs = append(forceDeletionErrs, err)
456+
}
457+
}
458+
if len(forceDeletionErrs) > 0 {
459+
return false, errors.Join(forceDeletionErrs...)
460+
}
461+
return livePodFound, nil
462+
}
463+
464+
func isPodDead(pod *corev1.Pod) bool {
465+
for _, check := range []func(*corev1.Pod) bool{
466+
isPodDeletedByTaintManager,
467+
} {
468+
if check(pod) {
469+
return true
470+
}
471+
}
472+
return false
473+
}
474+
475+
func isPodDeletedByTaintManager(pod *corev1.Pod) bool {
476+
if pod.DeletionTimestamp == nil {
477+
return false
478+
}
479+
for _, condition := range pod.Status.Conditions {
480+
if condition.Type == corev1.DisruptionTarget && condition.Reason == "DeletionByTaintManager" && condition.Status == corev1.ConditionTrue {
481+
return true
482+
}
483+
}
484+
return false
435485
}

‎staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap_test.go

+52
Original file line numberDiff line numberDiff line change
@@ -487,3 +487,55 @@ func TestConfigMapRegistryReconciler(t *testing.T) {
487487
})
488488
}
489489
}
490+
491+
func TestConfigMapRegistryChecker(t *testing.T) {
492+
validConfigMap := validConfigMap()
493+
validCatalogSource := validConfigMapCatalogSource(validConfigMap)
494+
type cluster struct {
495+
k8sObjs []runtime.Object
496+
}
497+
type in struct {
498+
cluster cluster
499+
catsrc *v1alpha1.CatalogSource
500+
}
501+
type out struct {
502+
healthy bool
503+
err error
504+
}
505+
tests := []struct {
506+
testName string
507+
in in
508+
out out
509+
}{
510+
{
511+
testName: "ConfigMap/ExistingRegistry/DeadPod",
512+
in: in{
513+
cluster: cluster{
514+
k8sObjs: append(withPodDeletedButNotRemoved(objectsForCatalogSource(validCatalogSource)), validConfigMap),
515+
},
516+
catsrc: validCatalogSource,
517+
},
518+
out: out{
519+
healthy: false,
520+
},
521+
},
522+
}
523+
for _, tt := range tests {
524+
t.Run(tt.testName, func(t *testing.T) {
525+
stopc := make(chan struct{})
526+
defer close(stopc)
527+
528+
factory, _ := fakeReconcilerFactory(t, stopc, withK8sObjs(tt.in.cluster.k8sObjs...))
529+
rec := factory.ReconcilerForSource(tt.in.catsrc)
530+
531+
healthy, err := rec.CheckRegistryServer(tt.in.catsrc)
532+
533+
require.Equal(t, tt.out.err, err)
534+
if tt.out.err != nil {
535+
return
536+
}
537+
538+
require.Equal(t, tt.out.healthy, healthy)
539+
})
540+
}
541+
}

‎staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc.go

+14-11
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"time"
88

99
"github.com/pkg/errors"
10+
1011
"github.com/sirupsen/logrus"
1112
corev1 "k8s.io/api/core/v1"
1213
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -256,13 +257,13 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) bool {
256257
}
257258

258259
func (c *GrpcRegistryReconciler) ensurePod(source grpcCatalogSourceDecorator, saName string, overwrite bool) error {
259-
// currentLivePods refers to the currently live instances of the catalog source
260-
currentLivePods := c.currentPods(source)
261-
if len(currentLivePods) > 0 {
260+
// currentPods refers to the current pod instances of the catalog source
261+
currentPods := c.currentPods(source)
262+
if len(currentPods) > 0 {
262263
if !overwrite {
263264
return nil
264265
}
265-
for _, p := range currentLivePods {
266+
for _, p := range currentPods {
266267
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
267268
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
268269
}
@@ -448,18 +449,20 @@ func (c *GrpcRegistryReconciler) removePods(pods []*corev1.Pod, namespace string
448449
}
449450

450451
// CheckRegistryServer returns true if the given CatalogSource is considered healthy; false otherwise.
451-
func (c *GrpcRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (healthy bool, err error) {
452+
func (c *GrpcRegistryReconciler) CheckRegistryServer(catalogSource *v1alpha1.CatalogSource) (bool, error) {
452453
source := grpcCatalogSourceDecorator{catalogSource, c.createPodAsUser}
453454
// Check on registry resources
454455
// TODO: add gRPC health check
455-
if len(c.currentPodsWithCorrectImageAndSpec(source, source.ServiceAccount().GetName())) < 1 ||
456+
currentPods := c.currentPodsWithCorrectImageAndSpec(source, source.ServiceAccount().Name)
457+
if len(currentPods) < 1 ||
456458
c.currentService(source) == nil || c.currentServiceAccount(source) == nil {
457-
healthy = false
458-
return
459+
return false, nil
459460
}
460-
461-
healthy = true
462-
return
461+
podsAreLive, e := detectAndDeleteDeadPods(c.OpClient, currentPods, source.GetNamespace())
462+
if e != nil {
463+
return false, fmt.Errorf("error deleting dead pods: %v", e)
464+
}
465+
return podsAreLive, nil
463466
}
464467

465468
// promoteCatalog swaps the labels on the update pod so that the update pod is now reachable by the catalog service.

‎staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc_test.go

+29
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,23 @@ func grpcCatalogSourceWithAnnotations(annotations map[string]string) *v1alpha1.C
6161
return catsrc
6262
}
6363

64+
func withPodDeletedButNotRemoved(objs []runtime.Object) []runtime.Object {
65+
var out []runtime.Object
66+
for _, obj := range objs {
67+
o := obj.DeepCopyObject()
68+
if pod, ok := obj.(*corev1.Pod); ok {
69+
pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
70+
pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
71+
Type: corev1.DisruptionTarget,
72+
Reason: "DeletionByTaintManager",
73+
Status: corev1.ConditionTrue,
74+
})
75+
o = pod
76+
}
77+
out = append(out, o)
78+
}
79+
return out
80+
}
6481
func TestGrpcRegistryReconciler(t *testing.T) {
6582
now := func() metav1.Time { return metav1.Date(2018, time.January, 26, 20, 40, 0, 0, time.UTC) }
6683
blockOwnerDeletion := true
@@ -508,6 +525,18 @@ func TestGrpcRegistryChecker(t *testing.T) {
508525
healthy: false,
509526
},
510527
},
528+
{
529+
testName: "Grpc/ExistingRegistry/Image/DeadPod",
530+
in: in{
531+
cluster: cluster{
532+
k8sObjs: withPodDeletedButNotRemoved(objectsForCatalogSource(validGrpcCatalogSource("test-img", ""))),
533+
},
534+
catsrc: validGrpcCatalogSource("test-img", ""),
535+
},
536+
out: out{
537+
healthy: false,
538+
},
539+
},
511540
{
512541
testName: "Grpc/ExistingRegistry/Image/OldPod/NotHealthy",
513542
in: in{

‎vendor/github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap.go

+66-16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc.go

+14-11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/integer/integer.go

+4-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/net/multi_listen.go

+195
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/pointer/pointer.go

+61-222
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/ptr/OWNERS

+10
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/ptr/README.md

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/ptr/ptr.go

+73
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/k8s.io/utils/trace/trace.go

+20-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎vendor/modules.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -2171,7 +2171,7 @@ k8s.io/kubectl/pkg/util/slice
21712171
k8s.io/kubectl/pkg/util/templates
21722172
k8s.io/kubectl/pkg/util/term
21732173
k8s.io/kubectl/pkg/validation
2174-
# k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
2174+
# k8s.io/utils v0.0.0-20240902221715-702e33fdd3c3
21752175
## explicit; go 1.18
21762176
k8s.io/utils/buffer
21772177
k8s.io/utils/clock
@@ -2184,6 +2184,7 @@ k8s.io/utils/lru
21842184
k8s.io/utils/net
21852185
k8s.io/utils/path
21862186
k8s.io/utils/pointer
2187+
k8s.io/utils/ptr
21872188
k8s.io/utils/strings/slices
21882189
k8s.io/utils/trace
21892190
# oras.land/oras-go v1.2.4

0 commit comments

Comments
 (0)
Please sign in to comment.