From f15cede96f377906f561484381a1c8f93aaa84fa Mon Sep 17 00:00:00 2001
From: Simon Krenger
Date: Fri, 6 Dec 2024 08:10:45 +0100
Subject: [PATCH] catalog-operator: Delete Pods that were evicted

This change adds another reason why a Pod could be detected as "dead",
namely when it was evicted by the kubelet. This can happen when there is
resource pressure on the Node. In that case the Pod's DisruptionTarget
condition carries the reason "TerminationByKubelet".

This addresses the issue described in
https://issues.redhat.com/browse/OCPBUGS-45490

Signed-off-by: Simon Krenger
---
 pkg/controller/registry/reconciler/grpc.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pkg/controller/registry/reconciler/grpc.go b/pkg/controller/registry/reconciler/grpc.go
index ec0bdad10c..a960a00729 100644
--- a/pkg/controller/registry/reconciler/grpc.go
+++ b/pkg/controller/registry/reconciler/grpc.go
@@ -531,6 +531,7 @@ func imageChanged(logger *logrus.Entry, updatePod *corev1.Pod, servingPods []*co
 func isPodDead(pod *corev1.Pod) bool {
 	for _, check := range []func(*corev1.Pod) bool{
 		isPodDeletedByTaintManager,
+		isPodTerminatedByKubelet,
 	} {
 		if check(pod) {
 			return true
@@ -551,6 +552,19 @@ func isPodDeletedByTaintManager(pod *corev1.Pod) bool {
 	return false
 }
 
+// This reason is set when the Pod was evicted due to resource pressure on the Node
+func isPodTerminatedByKubelet(pod *corev1.Pod) bool {
+	if pod.DeletionTimestamp == nil {
+		return false
+	}
+	for _, condition := range pod.Status.Conditions {
+		if condition.Type == corev1.DisruptionTarget && condition.Reason == "TerminationByKubelet" && condition.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
 // imageID returns the ImageID of the primary catalog source container or an empty string if the image ID isn't available yet.
 // Note: the pod must be running and the container in a ready status to return a valid ImageID.
 func imageID(pod *corev1.Pod) string {