Skip to content

Commit dae0037

Browse files
author
OpenShift Bot
authored
Merge pull request #13550 from kargakis/retry-longer-on-pending-deployments
Merged by openshift-bot
2 parents 959be87 + df79f89 commit dae0037

File tree

3 files changed

+86
-29
lines changed

3 files changed

+86
-29
lines changed

pkg/deploy/controller/deployment/controller.go

+27-4
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,21 @@ import (
2020
"github.com/openshift/origin/pkg/util"
2121
)
2222

23+
// maxRetryCount is the maximum number of times the controller will retry errors.
24+
// The first requeue is after 5ms and subsequent requeues grow exponentially.
25+
// This effectively can extend up to 5^10ms which caps to 1000s:
26+
//
27+
// 5ms, 25ms, 125ms, 625ms, 3s, 16s, 78s, 390s, 1000s, 1000s
28+
//
29+
// The most common errors are:
30+
//
31+
// * failure to delete the deployer pods
32+
// * failure to update the replication controller
33+
// * pod may be missing from the cache once the deployment transitions to Pending.
34+
//
35+
// In most cases, we shouldn't need to retry up to maxRetryCount...
36+
const maxRetryCount = 10
37+
2338
// fatalError is an error which can't be retried.
2439
type fatalError string
2540

@@ -71,10 +86,10 @@ type DeploymentController struct {
7186
recorder record.EventRecorder
7287
}
7388

74-
// Handle processes deployment and either creates a deployer pod or responds
89+
// handle processes a deployment and either creates a deployer pod or responds
7590
// to a terminal deployment status. Since this controller started using caches,
7691
// the provided rc MUST be deep-copied beforehand (see work() in factory.go).
77-
func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) error {
92+
func (c *DeploymentController) handle(deployment *kapi.ReplicationController, willBeDropped bool) error {
7893
// Copy all the annotations from the deployment.
7994
updatedAnnotations := make(map[string]string)
8095
for key, value := range deployment.Annotations {
@@ -127,6 +142,7 @@ func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) er
127142
nextStatus = deployapi.DeploymentStatusPending
128143
glog.V(4).Infof("Created deployer pod %s for deployment %s", deploymentPod.Name, deployutil.LabelForDeployment(deployment))
129144

145+
// Most likely dead code since we never get an error different from 404 back from the cache.
130146
case deployerErr != nil:
131147
// If the pod already exists, it's possible that a previous CreatePod
132148
// succeeded but the deployment state update failed and now we're re-
@@ -163,6 +179,11 @@ func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) er
163179
// If the deployment is cancelled here then we deleted the deployer in a previous
164180
// resync of the deployment.
165181
if !deployutil.IsDeploymentCancelled(deployment) {
182+
// Retry more before setting the deployment to Failed if it's Pending - the pod might not have
183+
// appeared in the cache yet.
184+
if !willBeDropped && currentStatus == deployapi.DeploymentStatusPending {
185+
return deployerErr
186+
}
166187
updatedAnnotations[deployapi.DeploymentStatusReasonAnnotation] = deployapi.DeploymentFailedDeployerPodNoLongerExists
167188
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "Failed", fmt.Sprintf("Deployer pod %q has gone missing", deployerPodName))
168189
deployerErr = fmt.Errorf("Failing deployment %q because its deployer pod %q disappeared", deployutil.LabelForDeployment(deployment), deployerPodName)
@@ -423,13 +444,15 @@ func (c *DeploymentController) handleErr(err error, key interface{}, deployment
423444
return
424445
}
425446

426-
if c.queue.NumRequeues(key) < 2 {
447+
if c.queue.NumRequeues(key) < maxRetryCount {
427448
c.queue.AddRateLimited(key)
428449
return
429450
}
430451

452+
msg := fmt.Sprintf("About to stop retrying %q: %v", deployment.Name, err)
431453
if _, isActionableErr := err.(actionableError); isActionableErr {
432-
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "FailedRetry", fmt.Sprintf("About to stop retrying %s: %v", deployment.Name, err))
454+
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "FailedRetry", msg)
433455
}
456+
glog.V(2).Infof(msg)
434457
c.queue.Forget(key)
435458
}

pkg/deploy/controller/deployment/controller_test.go

+54-24
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func TestHandle_createPodOk(t *testing.T) {
122122

123123
controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)
124124

125-
if err := controller.Handle(deployment); err != nil {
125+
if err := controller.handle(deployment, false); err != nil {
126126
t.Fatalf("unexpected error: %v", err)
127127
}
128128

@@ -211,7 +211,7 @@ func TestHandle_createPodFail(t *testing.T) {
211211

212212
controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)
213213

214-
err := controller.Handle(deployment)
214+
err := controller.handle(deployment, false)
215215
if err == nil {
216216
t.Fatalf("expected an error")
217217
}
@@ -278,7 +278,7 @@ func TestHandle_deployerPodAlreadyExists(t *testing.T) {
278278

279279
controller := okDeploymentController(client, deployment, nil, true, test.podPhase)
280280

281-
if err := controller.Handle(deployment); err != nil {
281+
if err := controller.handle(deployment, false); err != nil {
282282
t.Errorf("%s: unexpected error: %v", test.name, err)
283283
continue
284284
}
@@ -317,7 +317,7 @@ func TestHandle_unrelatedPodAlreadyExists(t *testing.T) {
317317

318318
controller := okDeploymentController(client, deployment, nil, false, kapi.PodRunning)
319319

320-
if err := controller.Handle(deployment); err != nil {
320+
if err := controller.handle(deployment, false); err != nil {
321321
t.Fatalf("unexpected error: %v", err)
322322
}
323323

@@ -358,7 +358,7 @@ func TestHandle_unrelatedPodAlreadyExistsTestScaled(t *testing.T) {
358358

359359
controller := okDeploymentController(client, deployment, nil, false, kapi.PodRunning)
360360

361-
if err := controller.Handle(deployment); err != nil {
361+
if err := controller.handle(deployment, false); err != nil {
362362
t.Fatalf("unexpected error: %v", err)
363363
}
364364

@@ -416,7 +416,7 @@ func TestHandle_noop(t *testing.T) {
416416

417417
controller := okDeploymentController(client, deployment, nil, true, test.podPhase)
418418

419-
if err := controller.Handle(deployment); err != nil {
419+
if err := controller.handle(deployment, false); err != nil {
420420
t.Errorf("%s: unexpected error: %v", test.name, err)
421421
continue
422422
}
@@ -451,7 +451,7 @@ func TestHandle_failedTest(t *testing.T) {
451451

452452
controller := okDeploymentController(client, deployment, nil, true, kapi.PodFailed)
453453

454-
if err := controller.Handle(deployment); err != nil {
454+
if err := controller.handle(deployment, false); err != nil {
455455
t.Fatalf("unexpected error: %v", err)
456456
}
457457

@@ -492,7 +492,7 @@ func TestHandle_cleanupPodOk(t *testing.T) {
492492
controller := okDeploymentController(client, deployment, hookPods, true, kapi.PodSucceeded)
493493
hookPods = append(hookPods, deployment.Name)
494494

495-
if err := controller.Handle(deployment); err != nil {
495+
if err := controller.handle(deployment, false); err != nil {
496496
t.Fatalf("unexpected error: %v", err)
497497
}
498498

@@ -537,7 +537,7 @@ func TestHandle_cleanupPodOkTest(t *testing.T) {
537537
controller := okDeploymentController(client, deployment, hookPods, true, kapi.PodSucceeded)
538538
hookPods = append(hookPods, deployment.Name)
539539

540-
if err := controller.Handle(deployment); err != nil {
540+
if err := controller.handle(deployment, false); err != nil {
541541
t.Fatalf("unexpected error: %v", err)
542542
}
543543

@@ -581,7 +581,7 @@ func TestHandle_cleanupPodNoop(t *testing.T) {
581581
pod.Labels[deployapi.DeployerPodForDeploymentLabel] = "unrelated"
582582
controller.podStore.Indexer.Update(pod)
583583

584-
if err := controller.Handle(deployment); err != nil {
584+
if err := controller.handle(deployment, false); err != nil {
585585
t.Fatalf("unexpected error: %v", err)
586586
}
587587
}
@@ -609,7 +609,7 @@ func TestHandle_cleanupPodFail(t *testing.T) {
609609

610610
controller := okDeploymentController(client, deployment, nil, true, kapi.PodSucceeded)
611611

612-
err := controller.Handle(deployment)
612+
err := controller.handle(deployment, false)
613613
if err == nil {
614614
t.Fatal("expected an actionable error")
615615
}
@@ -640,7 +640,7 @@ func TestHandle_cancelNew(t *testing.T) {
640640

641641
controller := okDeploymentController(client, deployment, nil, true, kapi.PodRunning)
642642

643-
if err := controller.Handle(deployment); err != nil {
643+
if err := controller.handle(deployment, false); err != nil {
644644
t.Fatalf("unexpected error: %v", err)
645645
}
646646

@@ -676,7 +676,7 @@ func TestHandle_cleanupNewWithDeployers(t *testing.T) {
676676

677677
controller := okDeploymentController(client, deployment, nil, true, kapi.PodRunning)
678678

679-
if err := controller.Handle(deployment); err != nil {
679+
if err := controller.handle(deployment, false); err != nil {
680680
t.Fatalf("unexpected error: %v", err)
681681
}
682682

@@ -755,7 +755,7 @@ func TestHandle_cleanupPostNew(t *testing.T) {
755755

756756
controller := okDeploymentController(client, deployment, hookPods, true, test.podPhase)
757757

758-
if err := controller.Handle(deployment); err != nil {
758+
if err := controller.handle(deployment, false); err != nil {
759759
t.Errorf("%s: unexpected error: %v", test.name, err)
760760
continue
761761
}
@@ -767,15 +767,27 @@ func TestHandle_cleanupPostNew(t *testing.T) {
767767
}
768768

769769
// TestHandle_deployerPodDisappeared ensures that a pending/running deployment
770-
// is failed when its deployer pod vanishes.
770+
// is failed when its deployer pod vanishes. Ensure that pending deployments
771+
// wont fail instantly on a missing deployer pod because it may take some time
772+
// for it to appear in the pod cache.
771773
func TestHandle_deployerPodDisappeared(t *testing.T) {
772774
tests := []struct {
773-
name string
774-
phase deployapi.DeploymentStatus
775+
name string
776+
phase deployapi.DeploymentStatus
777+
willBeDropped bool
778+
shouldRetry bool
775779
}{
776780
{
777-
name: "pending",
778-
phase: deployapi.DeploymentStatusPending,
781+
name: "pending - retry",
782+
phase: deployapi.DeploymentStatusPending,
783+
willBeDropped: false,
784+
shouldRetry: true,
785+
},
786+
{
787+
name: "pending - fail",
788+
phase: deployapi.DeploymentStatusPending,
789+
willBeDropped: true,
790+
shouldRetry: false,
779791
},
780792
{
781793
name: "running",
@@ -801,21 +813,39 @@ func TestHandle_deployerPodDisappeared(t *testing.T) {
801813
continue
802814
}
803815
deployment.Annotations[deployapi.DeploymentStatusAnnotation] = string(test.phase)
816+
updatedDeployment = deployment
804817

805818
controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)
806819

807-
if err := controller.Handle(deployment); err != nil {
820+
err = controller.handle(deployment, test.willBeDropped)
821+
if !test.shouldRetry && err != nil {
808822
t.Errorf("%s: unexpected error: %v", test.name, err)
809823
continue
810824
}
825+
if test.shouldRetry && err == nil {
826+
t.Errorf("%s: expected an error so that the deployment can be retried, got none", test.name)
827+
continue
828+
}
811829

812-
if !updateCalled {
830+
if !test.shouldRetry && !updateCalled {
813831
t.Errorf("%s: expected update", test.name)
814832
continue
815833
}
816834

817-
if e, a := deployapi.DeploymentStatusFailed, deployutil.DeploymentStatusFor(updatedDeployment); e != a {
818-
t.Errorf("%s: expected deployment status %q, got %q", test.name, e, a)
835+
if test.shouldRetry && updateCalled {
836+
t.Errorf("%s: unexpected update", test.name)
837+
continue
838+
}
839+
840+
gotStatus := deployutil.DeploymentStatusFor(updatedDeployment)
841+
if !test.shouldRetry && deployapi.DeploymentStatusFailed != gotStatus {
842+
t.Errorf("%s: expected deployment status %q, got %q", test.name, deployapi.DeploymentStatusFailed, gotStatus)
843+
continue
844+
}
845+
846+
if test.shouldRetry && deployapi.DeploymentStatusPending != gotStatus {
847+
t.Errorf("%s: expected deployment status %q, got %q", test.name, deployapi.DeploymentStatusPending, gotStatus)
848+
continue
819849
}
820850
}
821851
}
@@ -921,7 +951,7 @@ func TestHandle_transitionFromDeployer(t *testing.T) {
921951

922952
controller := okDeploymentController(client, deployment, nil, true, test.podPhase)
923953

924-
if err := controller.Handle(deployment); err != nil {
954+
if err := controller.handle(deployment, false); err != nil {
925955
t.Errorf("%s: unexpected error: %v", test.name, err)
926956
continue
927957
}

pkg/deploy/controller/deployment/factory.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,11 @@ func (c *DeploymentController) work() bool {
194194
return false
195195
}
196196

197-
err = c.Handle(rc)
197+
// Resist missing deployer pods from the cache in case of a pending deployment.
198+
// Give some room for a possible rc update failure in case we decided to mark it
199+
// failed.
200+
willBeDropped := c.queue.NumRequeues(key) >= maxRetryCount-2
201+
err = c.handle(rc, willBeDropped)
198202
c.handleErr(err, key, rc)
199203

200204
return false

0 commit comments

Comments
 (0)