Skip to content

Commit 68c24cf

Browse files
authoredApr 15, 2024
catalog-operator: delete catalog pods stuck in Terminating state due to unreachable node (#3201)
Signed-off-by: Joe Lanford <[email protected]>
1 parent 8434d3c commit 68c24cf

File tree

2 files changed

+63
-18
lines changed

2 files changed

+63
-18
lines changed
 

Diff for: ‎Dockerfile

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ FROM quay.io/fedora/fedora:37-x86_64 as builder
22
LABEL stage=builder
33
WORKDIR /build
44

5-
# install dependencies and go 1.16
5+
# install dependencies and go 1.21
66

77
# copy just enough of the git repo to parse HEAD, used to record version in OLM binaries
88
RUN dnf update -y && dnf install -y bash make git mercurial jq wget && dnf upgrade -y
9-
RUN curl -sSL https://go.dev/dl/go1.20.linux-amd64.tar.gz | tar -xzf - -C /usr/local
9+
RUN curl -sSL https://go.dev/dl/go1.21.9.linux-amd64.tar.gz | tar -xzf - -C /usr/local
1010
ENV PATH=/usr/local/go/bin:$PATH
1111
COPY .git/HEAD .git/HEAD
1212
COPY .git/refs/heads/. .git/refs/heads

Diff for: ‎pkg/controller/registry/reconciler/grpc.go

+61-16
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,22 @@ package reconciler
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
7+
"slices"
68
"strings"
79
"time"
810

911
"github.com/google/go-cmp/cmp"
1012
"github.com/operator-framework/api/pkg/operators/v1alpha1"
11-
"github.com/pkg/errors"
13+
pkgerrors "github.com/pkg/errors"
1214
"github.com/sirupsen/logrus"
1315
corev1 "k8s.io/api/core/v1"
1416
apierrors "k8s.io/apimachinery/pkg/api/errors"
1517
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1618
"k8s.io/apimachinery/pkg/labels"
1719
"k8s.io/apimachinery/pkg/util/intstr"
20+
"k8s.io/utils/ptr"
1821

1922
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
2023
controllerclient "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/controller-runtime/client"
@@ -262,7 +265,7 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
262265
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
263266
sa, err := c.ensureSA(source)
264267
if err != nil && !apierrors.IsAlreadyExists(err) {
265-
return errors.Wrapf(err, "error ensuring service account: %s", source.GetName())
268+
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.GetName())
266269
}
267270

268271
sa, err = c.OpClient.GetServiceAccount(sa.GetNamespace(), sa.GetName())
@@ -285,20 +288,20 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
285288
return err
286289
}
287290
if err := c.ensurePod(logger, source, sa, overwritePod); err != nil {
288-
return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
291+
return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
289292
}
290293
if err := c.ensureUpdatePod(logger, sa, source); err != nil {
291294
if _, ok := err.(UpdateNotReadyErr); ok {
292295
return err
293296
}
294-
return errors.Wrapf(err, "error ensuring updated catalog source pod: %s", pod.GetName())
297+
return pkgerrors.Wrapf(err, "error ensuring updated catalog source pod: %s", pod.GetName())
295298
}
296299
service, err := source.Service()
297300
if err != nil {
298301
return err
299302
}
300303
if err := c.ensureService(source, overwrite); err != nil {
301-
return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
304+
return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
302305
}
303306

304307
if overwritePod {
@@ -338,16 +341,35 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
338341
}
339342

340343
func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCatalogSourceDecorator, serviceAccount *corev1.ServiceAccount, overwrite bool) error {
341-
// currentLivePods refers to the currently live instances of the catalog source
342-
currentLivePods := c.currentPods(logger, source)
343-
if len(currentLivePods) > 0 {
344+
// currentPods refers to the current pod instances of the catalog source
345+
currentPods := c.currentPods(logger, source)
346+
347+
var forceDeleteErrs []error
348+
currentPods = slices.DeleteFunc(currentPods, func(pod *corev1.Pod) bool {
349+
if !isPodDead(pod) {
350+
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Debug("pod is alive")
351+
return false
352+
}
353+
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
354+
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
355+
GracePeriodSeconds: ptr.To[int64](0),
356+
}); err != nil && !apierrors.IsNotFound(err) {
357+
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
358+
}
359+
return true
360+
})
361+
if len(forceDeleteErrs) > 0 {
362+
return errors.Join(forceDeleteErrs...)
363+
}
364+
365+
if len(currentPods) > 0 {
344366
if !overwrite {
345367
return nil
346368
}
347-
for _, p := range currentLivePods {
369+
for _, p := range currentPods {
348370
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": p.GetName()}).Info("deleting current pod")
349371
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
350-
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
372+
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
351373
}
352374
}
353375
}
@@ -358,7 +380,7 @@ func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCata
358380
logger.WithFields(logrus.Fields{"pod.namespace": desiredPod.GetNamespace(), "pod.name": desiredPod.GetName()}).Info("creating desired pod")
359381
_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Create(context.TODO(), desiredPod, metav1.CreateOptions{})
360382
if err != nil {
361-
return errors.Wrapf(err, "error creating new pod: %s", desiredPod.GetGenerateName())
383+
return pkgerrors.Wrapf(err, "error creating new pod: %s", desiredPod.GetGenerateName())
362384
}
363385

364386
return nil
@@ -378,7 +400,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
378400
logger.Infof("catalog update required at %s", time.Now().String())
379401
pod, err := c.createUpdatePod(source, serviceAccount)
380402
if err != nil {
381-
return errors.Wrapf(err, "creating update catalog source pod")
403+
return pkgerrors.Wrapf(err, "creating update catalog source pod")
382404
}
383405
source.SetLastUpdateTime()
384406
return UpdateNotReadyErr{catalogName: source.GetName(), podName: pod.GetName()}
@@ -410,7 +432,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
410432
for _, p := range currentLivePods {
411433
logger.WithFields(logrus.Fields{"live-pod.namespace": source.GetNamespace(), "live-pod.name": p.Name}).Info("deleting current live pods")
412434
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
413-
return errors.Wrapf(errors.Wrapf(err, "error deleting pod: %s", p.GetName()), "detected imageID change: error deleting old catalog source pod")
435+
return pkgerrors.Wrapf(pkgerrors.Wrapf(err, "error deleting pod: %s", p.GetName()), "detected imageID change: error deleting old catalog source pod")
414436
}
415437
}
416438
// done syncing
@@ -420,7 +442,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
420442
// delete update pod right away, since the digest match, to prevent long-lived duplicate catalog pods
421443
logger.WithFields(logrus.Fields{"update-pod.namespace": updatePod.Namespace, "update-pod.name": updatePod.Name}).Debug("catalog polling result: no update; removing duplicate update pod")
422444
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), updatePod.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
423-
return errors.Wrapf(errors.Wrapf(err, "error deleting pod: %s", updatePod.GetName()), "duplicate catalog polling pod")
445+
return pkgerrors.Wrapf(pkgerrors.Wrapf(err, "error deleting pod: %s", updatePod.GetName()), "duplicate catalog polling pod")
424446
}
425447
}
426448

@@ -523,6 +545,29 @@ func imageChanged(logger *logrus.Entry, updatePod *corev1.Pod, servingPods []*co
523545
return false
524546
}
525547

548+
func isPodDead(pod *corev1.Pod) bool {
549+
for _, check := range []func(*corev1.Pod) bool{
550+
isPodDeletedByTaintManager,
551+
} {
552+
if check(pod) {
553+
return true
554+
}
555+
}
556+
return false
557+
}
558+
559+
func isPodDeletedByTaintManager(pod *corev1.Pod) bool {
560+
if pod.DeletionTimestamp == nil {
561+
return false
562+
}
563+
for _, condition := range pod.Status.Conditions {
564+
if condition.Type == corev1.DisruptionTarget && condition.Reason == "DeletionByTaintManager" && condition.Status == corev1.ConditionTrue {
565+
return true
566+
}
567+
}
568+
return false
569+
}
570+
526571
// imageID returns the ImageID of the primary catalog source container or an empty string if the image ID isn't available yet.
527572
// Note: the pod must be running and the container in a ready status to return a valid ImageID.
528573
func imageID(pod *corev1.Pod) string {
@@ -545,7 +590,7 @@ func imageID(pod *corev1.Pod) string {
545590
func (c *GrpcRegistryReconciler) removePods(pods []*corev1.Pod, namespace string) error {
546591
for _, p := range pods {
547592
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(namespace).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
548-
return errors.Wrapf(err, "error deleting pod: %s", p.GetName())
593+
return pkgerrors.Wrapf(err, "error deleting pod: %s", p.GetName())
549594
}
550595
}
551596
return nil
@@ -623,7 +668,7 @@ func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
623668
logrus.WithField("UpdatePod", pod.GetName()).Infof("catalog polling result: update pod %s failed to start", pod.GetName())
624669
err := c.removePods([]*corev1.Pod{pod}, pod.GetNamespace())
625670
if err != nil {
626-
return true, errors.Wrapf(err, "error deleting failed catalog polling pod: %s", pod.GetName())
671+
return true, pkgerrors.Wrapf(err, "error deleting failed catalog polling pod: %s", pod.GetName())
627672
}
628673
return true, nil
629674
}

0 commit comments

Comments
 (0)
Please sign in to comment.