@@ -3,18 +3,20 @@ package reconciler

import (
	"context"
+	"errors"
	"fmt"

	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
	hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
-	"github.com/pkg/errors"
+	pkgerrors "github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"

	"github.com/operator-framework/api/pkg/operators/v1alpha1"
	"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -327,27 +329,27 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry,

	//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
	if err := c.ensureServiceAccount(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
+		return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
	}
	if err := c.ensureRole(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
+		return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
	}
	if err := c.ensureRoleBinding(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
+		return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
	}
	pod, err := source.Pod(image, defaultPodSecurityConfig)
	if err != nil {
		return err
	}
	if err := c.ensurePod(source, defaultPodSecurityConfig, overwritePod); err != nil {
-		return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
+		return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
	}
	service, err := source.Service()
	if err != nil {
		return err
	}
	if err := c.ensureService(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
+		return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
	}

	if overwritePod {
@@ -420,15 +422,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
		}
		for _, p := range currentPods {
			if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
-				return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
+				return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
			}
		}
	}
	_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
	if err == nil {
		return nil
	}
-	return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
+	return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
}

func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -512,6 +514,34 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(logger *logrus.Entry,
		return
	}

-	healthy = true
-	return
+	podsAreLive, e := detectAndDeleteDeadPods(logger, c.OpClient, pods, source.GetNamespace())
+	if e != nil {
+		return false, fmt.Errorf("error deleting dead pods: %v", e)
+	}
+	return podsAreLive, nil
+}
+
+// detectAndDeleteDeadPods determines whether there are registry client pods that are in the deleted state
+// but have not been removed by GC (e.g. the node goes down before GC can remove them), and attempts to
+// force delete those pods. It returns true if any live registry pods remain, and false otherwise.
+func detectAndDeleteDeadPods(logger *logrus.Entry, client operatorclient.ClientInterface, pods []*corev1.Pod, sourceNamespace string) (bool, error) {
+	var forceDeletionErrs []error
+	livePodFound := false
+	for _, pod := range pods {
+		if !isPodDead(pod) {
+			livePodFound = true
+			logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Debug("pod is alive")
+			continue
+		}
+		logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Info("force deleting dead pod")
+		if err := client.KubernetesInterface().CoreV1().Pods(sourceNamespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
+			GracePeriodSeconds: ptr.To[int64](0),
+		}); err != nil && !apierrors.IsNotFound(err) {
+			forceDeletionErrs = append(forceDeletionErrs, err)
+		}
+	}
+	if len(forceDeletionErrs) > 0 {
+		return false, errors.Join(forceDeletionErrs...)
+	}
+	return livePodFound, nil
}
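detectAndDeleteDeadPods relies on an isPodDead helper whose implementation is outside this hunk. As a rough sketch of the kind of check it implies, the snippet below assumes a pod counts as dead once the API server has marked it for deletion yet it still shows up in the lister, which is the situation left behind when a node disappears before garbage collection runs. isPodDeadSketch and its logic are assumptions for illustration, not the PR's actual helper.

package reconciler

import (
	corev1 "k8s.io/api/core/v1"
)

// isPodDeadSketch is a hypothetical stand-in for the isPodDead helper referenced
// by detectAndDeleteDeadPods; the real check in the PR may differ.
func isPodDeadSketch(pod *corev1.Pod) bool {
	// A nil entry cannot be serving the registry, so treat it as dead.
	if pod == nil {
		return true
	}
	// Assumption: a pod that carries a deletion timestamp but is still listed was
	// likely stranded (for example, its node went away before GC could finish),
	// so the caller may safely force delete it.
	return pod.DeletionTimestamp != nil
}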