
Commit ac6e47c

DRA taint eviction: improve error handling
There was one error path that led to a "controller has shut down" log message. Other errors caused different log entries, or were so unlikely (event handler registration failure!) that they weren't checked at all. It's clearer to let Run return an error in all cases and then log the "controller has shut down" error at the call site. This also enables tests to mark themselves as failed, should that ever happen.
1 parent 473533a commit ac6e47c
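
In short: Run previously logged startup failures itself and returned nothing, whereas it now returns them and the caller decides how to report them. Below is a minimal, self-contained sketch of that call-site pattern with illustrative names (the real change, which logs via klog, is in the diffs that follow):

package main

import (
	"context"
	"fmt"
	"time"
)

// controller stands in for the device taint eviction controller: Run now
// returns an error for startup problems instead of only logging them.
type controller struct{}

// registerHandlers is a placeholder for the event handler registration and
// tracker setup that can fail during startup.
func (c *controller) registerHandlers() error {
	return nil
}

// Run starts the controller and blocks until the context is done.
// Startup failures are returned to the caller.
func (c *controller) Run(ctx context.Context) error {
	if err := c.registerHandlers(); err != nil {
		return fmt.Errorf("adding event handlers: %w", err)
	}
	<-ctx.Done()
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	done := make(chan struct{})
	c := &controller{}
	// The call site wraps Run in a goroutine and logs any error there,
	// analogous to what startDeviceTaintEvictionController does with klog.
	go func() {
		defer close(done)
		if err := c.Run(ctx); err != nil {
			fmt.Println("device taint processing failed and is now paused:", err)
		}
	}()
	<-done
}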

3 files changed: +28 -13 lines


cmd/kube-controller-manager/app/core.go (+5 -1)
@@ -254,7 +254,11 @@ func startDeviceTaintEvictionController(ctx context.Context, controllerContext C
 		controllerContext.InformerFactory.Resource().V1beta1().DeviceClasses(),
 		controllerName,
 	)
-	go deviceTaintEvictionController.Run(ctx)
+	go func() {
+		if err := deviceTaintEvictionController.Run(ctx); err != nil {
+			klog.FromContext(ctx).Error(err, "Device taint processing leading to Pod eviction failed and is now paused")
+		}
+	}()
 	return nil, true, nil
 }

pkg/controller/devicetainteviction/device_taint_eviction.go (+18 -7)
@@ -18,6 +18,7 @@ package devicetainteviction
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"math"
 	"slices"
@@ -319,7 +320,8 @@ func New(c clientset.Interface, podInformer coreinformers.PodInformer, claimInfo
 }
 
 // Run starts the controller which will run until the context is done.
-func (tc *Controller) Run(ctx context.Context) {
+// An error is returned for startup problems.
+func (tc *Controller) Run(ctx context.Context) error {
 	defer utilruntime.HandleCrash()
 	logger := klog.FromContext(ctx)
 	logger.Info("Starting", "controller", tc.name)
@@ -370,7 +372,7 @@ func (tc *Controller) Run(ctx context.Context) {
 	// mutex serializes event processing.
 	var mutex sync.Mutex
 
-	claimHandler, _ := tc.claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+	claimHandler, err := tc.claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			claim, ok := obj.(*resourceapi.ResourceClaim)
 			if !ok {
@@ -409,12 +411,15 @@ func (tc *Controller) Run(ctx context.Context) {
 			tc.handleClaimChange(claim, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("adding claim event handler:%w", err)
+	}
 	defer func() {
 		_ = tc.claimInformer.Informer().RemoveEventHandler(claimHandler)
 	}()
 	tc.haveSynced = append(tc.haveSynced, claimHandler.HasSynced)
 
-	podHandler, _ := tc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+	podHandler, err := tc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			pod, ok := obj.(*v1.Pod)
 			if !ok {
@@ -453,6 +458,9 @@ func (tc *Controller) Run(ctx context.Context) {
 			tc.handlePodChange(pod, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("adding pod event handler: %w", err)
+	}
 	defer func() {
 		_ = tc.podInformer.Informer().RemoveEventHandler(podHandler)
 	}()
@@ -467,8 +475,7 @@ func (tc *Controller) Run(ctx context.Context) {
 	}
 	sliceTracker, err := resourceslicetracker.StartTracker(ctx, opts)
 	if err != nil {
-		logger.Info("Failed to initialize ResourceSlice tracker; device taint processing leading to Pod eviction is now paused", "err", err)
-		return
+		return fmt.Errorf("initialize ResourceSlice tracker: %w", err)
 	}
 	tc.haveSynced = append(tc.haveSynced, sliceTracker.HasSynced)
 	defer sliceTracker.Stop()
@@ -478,11 +485,11 @@ func (tc *Controller) Run(ctx context.Context) {
 	// work which might be done as events get emitted for intermediate
 	// state.
 	if !cache.WaitForNamedCacheSyncWithContext(ctx, tc.haveSynced...) {
-		return
+		return errors.New("wait for cache sync timed out")
 	}
 	logger.V(1).Info("Underlying informers have synced")
 
-	_, _ = sliceTracker.AddEventHandler(cache.ResourceEventHandlerFuncs{
+	_, err = sliceTracker.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
 			slice, ok := obj.(*resourceapi.ResourceSlice)
 			if !ok {
@@ -519,12 +526,16 @@ func (tc *Controller) Run(ctx context.Context) {
 			tc.handleSliceChange(slice, nil)
 		},
 	})
+	if err != nil {
+		return fmt.Errorf("add slice event handler: %w", err)
+	}
 
 	// sliceTracker.AddEventHandler blocked while delivering events for all known
 	// ResourceSlices. Therefore our own state is up-to-date once we get here.
 	tc.hasSynced.Store(1)
 
 	<-ctx.Done()
+	return nil
 }
 
 func (tc *Controller) handleClaimChange(oldClaim, newClaim *resourceapi.ResourceClaim) {

pkg/controller/devicetainteviction/device_taint_eviction_test.go (+5 -5)
@@ -1339,7 +1339,7 @@ func TestEviction(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the controller should have synced it's informers.
@@ -1450,7 +1450,7 @@ func testCancelEviction(tCtx ktesting.TContext, deletePod bool) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the pod gets scheduled for eviction.
@@ -1543,7 +1543,7 @@ func TestParallelPodDeletion(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the pod gets deleted, in this test by us.
@@ -1622,7 +1622,7 @@ func TestRetry(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually the pod gets deleted.
@@ -1694,7 +1694,7 @@ func TestEvictionFailure(t *testing.T) {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		controller.Run(tCtx)
+		assert.NoError(tCtx, controller.Run(tCtx), "eviction controller failed")
 	}()
 
 	// Eventually deletion is attempted a few times.
