Skip to content

Commit a40423b

Browse files
pritidesaitekton-robot
authored and committed
wait for a given duration in case of imagePullBackOff
We have implemented imagePullBackOff as fail fast. The issue with this approach is that the node where the pod is scheduled often hits the registry rate limit. An image pull failure caused by the rate limit returns the same warning (reason: Failed and message: ImagePullBackOff). The pod can potentially recover after waiting long enough for the rate limit to expire. Kubernetes can then successfully pull the image and bring the pod up. This change introduces a default configuration to specify a cluster-level timeout that allows the imagePullBackOff to retry for a given duration. Once that duration has passed, a permanent failure is returned. #5987 #7184 Signed-off-by: Priti Desai <[email protected]> wait for a given duration in case of imagePullBackOff Signed-off-by: Priti Desai <[email protected]>
1 parent 9be03e2 commit a40423b

9 files changed

+247
-22
lines changed

config/config-defaults.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@ data:
8787
# no default-resolver-type is specified by default
8888
default-resolver-type:
8989
90+
# default-imagepullbackoff-timeout contains the default duration to wait
91+
# before requeuing the TaskRun to retry; specifying 0 here is equivalent to fail fast
92+
# possible values are Go duration strings such as 1m, 5m, 10s, 1h, etc.
93+
# default-imagepullbackoff-timeout: "5m"
94+
9095
# default-container-resource-requirements allow users to update default resource requirements
9196
# to the init-containers and containers of pods created by the controller
9297
# Note: All the resource requirements are applied to init-containers and containers

docs/additional-configs.md

+21
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ installation.
3131
- [Verify the transparency logs using `rekor-cli`](#verify-the-transparency-logs-using-rekor-cli)
3232
- [Verify Tekton Resources](#verify-tekton-resources)
3333
- [Pipelinerun with Affinity Assistant](#pipelineruns-with-affinity-assistant)
34+
- [TaskRuns with `imagePullBackOff` Timeout](#taskruns-with-imagepullbackoff-timeout)
3435
- [Next steps](#next-steps)
3536

3637

@@ -672,6 +673,26 @@ please take a look at [Trusted Resources](./trusted-resources.md).
672673
The cluster operators can review the [guidelines](developers/affinity-assistant.md) to `cordon` a node in the cluster
673674
with the tekton controller and the affinity assistant is enabled.
674675
676+
## TaskRuns with `imagePullBackOff` Timeout
677+
678+
Tekton pipelines has adopted a fail fast strategy with a taskRun failing with `TaskRunImagePullFailed` in case of an
679+
`imagePullBackOff`. This can be limiting in some cases, and it generally depends on the infrastructure. To allow the
680+
cluster operators to decide whether to wait in case of an `imagePullBackOff`, a setting is available to configure
681+
the wait time as a duration string (for example `5m` or `30s`) such that the controller will wait for the specified duration before declaring a failure.
682+
For example, with the following `config-defaults`, the controller does not mark the taskRun as failed until 5 minutes after
683+
the pod is scheduled if the image pull fails with `imagePullBackOff`.
684+
See issue https://github.com/tektoncd/pipeline/issues/5987 for more details.
685+
686+
```yaml
687+
apiVersion: v1
688+
kind: ConfigMap
689+
metadata:
690+
name: config-defaults
691+
namespace: tekton-pipelines
692+
data:
693+
default-imagepullbackoff-timeout: "5m"
694+
```
695+
675696
## Next steps
676697
677698
To get started with Tekton check the [Introductory tutorials][quickstarts],

pkg/apis/config/default.go

+14
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ const (
4949
// default resource requirements, will be applied to all the containers, which has empty resource requirements
5050
ResourceRequirementDefaultContainerKey = "default"
5151

52+
DefaultImagePullBackOffTimeout = 0 * time.Minute
53+
5254
defaultTimeoutMinutesKey = "default-timeout-minutes"
5355
defaultServiceAccountKey = "default-service-account"
5456
defaultManagedByLabelValueKey = "default-managed-by-label-value"
@@ -60,6 +62,7 @@ const (
6062
defaultForbiddenEnv = "default-forbidden-env"
6163
defaultResolverTypeKey = "default-resolver-type"
6264
defaultContainerResourceRequirementsKey = "default-container-resource-requirements"
65+
defaultImagePullBackOffTimeout = "default-imagepullbackoff-timeout"
6366
)
6467

6568
// DefaultConfig holds all the default configurations for the config.
@@ -79,6 +82,7 @@ type Defaults struct {
7982
DefaultForbiddenEnv []string
8083
DefaultResolverType string
8184
DefaultContainerResourceRequirements map[string]corev1.ResourceRequirements
85+
DefaultImagePullBackOffTimeout time.Duration
8286
}
8387

8488
// GetDefaultsConfigName returns the name of the configmap containing all
@@ -109,6 +113,7 @@ func (cfg *Defaults) Equals(other *Defaults) bool {
109113
other.DefaultTaskRunWorkspaceBinding == cfg.DefaultTaskRunWorkspaceBinding &&
110114
other.DefaultMaxMatrixCombinationsCount == cfg.DefaultMaxMatrixCombinationsCount &&
111115
other.DefaultResolverType == cfg.DefaultResolverType &&
116+
other.DefaultImagePullBackOffTimeout == cfg.DefaultImagePullBackOffTimeout &&
112117
reflect.DeepEqual(other.DefaultForbiddenEnv, cfg.DefaultForbiddenEnv)
113118
}
114119

@@ -121,6 +126,7 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
121126
DefaultCloudEventsSink: DefaultCloudEventSinkValue,
122127
DefaultMaxMatrixCombinationsCount: DefaultMaxMatrixCombinationsCount,
123128
DefaultResolverType: DefaultResolverTypeValue,
129+
DefaultImagePullBackOffTimeout: DefaultImagePullBackOffTimeout,
124130
}
125131

126132
if defaultTimeoutMin, ok := cfgMap[defaultTimeoutMinutesKey]; ok {
@@ -191,6 +197,14 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
191197
tc.DefaultContainerResourceRequirements = resourceRequirementsValue
192198
}
193199

200+
if defaultImagePullBackOff, ok := cfgMap[defaultImagePullBackOffTimeout]; ok {
201+
timeout, err := time.ParseDuration(defaultImagePullBackOff)
202+
if err != nil {
203+
return nil, fmt.Errorf("failed parsing tracing config %q", defaultImagePullBackOffTimeout)
204+
}
205+
tc.DefaultImagePullBackOffTimeout = timeout
206+
}
207+
194208
return &tc, nil
195209
}
196210

pkg/apis/config/default_test.go

+31
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package config_test
1818

1919
import (
2020
"testing"
21+
"time"
2122

2223
"github.com/google/go-cmp/cmp"
2324
"github.com/tektoncd/pipeline/pkg/apis/config"
@@ -43,6 +44,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
4344
DefaultManagedByLabelValue: "something-else",
4445
DefaultMaxMatrixCombinationsCount: 256,
4546
DefaultResolverType: "git",
47+
DefaultImagePullBackOffTimeout: time.Duration(5) * time.Second,
4648
},
4749
fileName: config.GetDefaultsConfigName(),
4850
},
@@ -62,12 +64,16 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
6264
},
6365
},
6466
DefaultMaxMatrixCombinationsCount: 256,
67+
DefaultImagePullBackOffTimeout: 0,
6568
},
6669
fileName: "config-defaults-with-pod-template",
6770
},
6871
{
6972
expectedError: true,
7073
fileName: "config-defaults-timeout-err",
74+
}, {
75+
expectedError: true,
76+
fileName: "config-defaults-imagepullbackoff-timeout-err",
7177
},
7278
// Previously the yaml package did not support UnmarshalStrict, though
7379
// it's supported now however it may introduce incompatibility, so we decide
@@ -81,6 +87,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
8187
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
8288
DefaultPodTemplate: &pod.Template{},
8389
DefaultMaxMatrixCombinationsCount: 256,
90+
DefaultImagePullBackOffTimeout: 0,
8491
},
8592
},
8693
{
@@ -92,6 +99,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
9299
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
93100
DefaultAAPodTemplate: &pod.AffinityAssistantTemplate{},
94101
DefaultMaxMatrixCombinationsCount: 256,
102+
DefaultImagePullBackOffTimeout: 0,
95103
},
96104
},
97105
{
@@ -106,6 +114,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
106114
DefaultTimeoutMinutes: 60,
107115
DefaultServiceAccount: "default",
108116
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
117+
DefaultImagePullBackOffTimeout: 0,
109118
},
110119
},
111120
{
@@ -117,6 +126,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
117126
DefaultMaxMatrixCombinationsCount: 256,
118127
DefaultManagedByLabelValue: "tekton-pipelines",
119128
DefaultForbiddenEnv: []string{"TEKTON_POWER_MODE", "TEST_ENV", "TEST_TEKTON"},
129+
DefaultImagePullBackOffTimeout: time.Duration(15) * time.Second,
120130
},
121131
},
122132
{
@@ -128,6 +138,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
128138
DefaultManagedByLabelValue: "tekton-pipelines",
129139
DefaultMaxMatrixCombinationsCount: 256,
130140
DefaultContainerResourceRequirements: map[string]corev1.ResourceRequirements{},
141+
DefaultImagePullBackOffTimeout: 0,
131142
},
132143
},
133144
{
@@ -142,6 +153,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
142153
DefaultServiceAccount: "default",
143154
DefaultManagedByLabelValue: "tekton-pipelines",
144155
DefaultMaxMatrixCombinationsCount: 256,
156+
DefaultImagePullBackOffTimeout: 0,
145157
DefaultContainerResourceRequirements: map[string]corev1.ResourceRequirements{
146158
config.ResourceRequirementDefaultContainerKey: {
147159
Requests: corev1.ResourceList{
@@ -197,6 +209,7 @@ func TestNewDefaultsFromEmptyConfigMap(t *testing.T) {
197209
DefaultManagedByLabelValue: "tekton-pipelines",
198210
DefaultServiceAccount: "default",
199211
DefaultMaxMatrixCombinationsCount: 256,
212+
DefaultImagePullBackOffTimeout: 0,
200213
}
201214
verifyConfigFileWithExpectedConfig(t, DefaultsConfigEmptyName, expectedConfig)
202215
}
@@ -345,6 +358,24 @@ func TestEquals(t *testing.T) {
345358
DefaultForbiddenEnv: []string{"TEST_ENV", "TEKTON_POWER_MODE"},
346359
},
347360
expected: true,
361+
}, {
362+
name: "different default ImagePullBackOff timeout",
363+
left: &config.Defaults{
364+
DefaultImagePullBackOffTimeout: 10,
365+
},
366+
right: &config.Defaults{
367+
DefaultImagePullBackOffTimeout: 20,
368+
},
369+
expected: false,
370+
}, {
371+
name: "same default ImagePullBackOff timeout",
372+
left: &config.Defaults{
373+
DefaultImagePullBackOffTimeout: 20,
374+
},
375+
right: &config.Defaults{
376+
DefaultImagePullBackOffTimeout: 20,
377+
},
378+
expected: true,
348379
},
349380
}
350381

pkg/apis/config/testdata/config-defaults-forbidden-env.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ data:
2121
default-timeout-minutes: "50"
2222
default-service-account: "tekton"
2323
default-forbidden-env: "TEST_TEKTON, TEKTON_POWER_MODE,TEST_ENV,TEST_TEKTON"
24+
default-imagepullbackoff-timeout: "15s"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright 2019 The Tekton Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ConfigMap
17+
metadata:
18+
name: config-defaults
19+
namespace: tekton-pipelines
20+
data:
21+
default-imagepullbackoff-timeout: "not-a-timeout"

pkg/apis/config/testdata/config-defaults.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ data:
2222
default-service-account: "tekton"
2323
default-managed-by-label-value: "something-else"
2424
default-resolver-type: "git"
25+
default-imagepullbackoff-timeout: "5s"

pkg/reconciler/taskrun/taskrun.go

+45-3
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,15 @@ type Reconciler struct {
9393
tracerProvider trace.TracerProvider
9494
}
9595

96+
const ImagePullBackOff = "ImagePullBackOff"
97+
9698
var (
9799
// Check that our Reconciler implements taskrunreconciler.Interface
98100
_ taskrunreconciler.Interface = (*Reconciler)(nil)
99101

100102
// Pod failure reasons that trigger failure of the TaskRun
101103
podFailureReasons = map[string]struct{}{
102-
"ImagePullBackOff": {},
104+
ImagePullBackOff: {},
103105
"InvalidImageName": {},
104106
}
105107
)
@@ -170,7 +172,7 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
170172
}
171173

172174
// Check for Pod Failures
173-
if failed, reason, message := c.checkPodFailed(tr); failed {
175+
if failed, reason, message := c.checkPodFailed(ctx, tr); failed {
174176
err := c.failTaskRun(ctx, tr, reason, message)
175177
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
176178
}
@@ -221,10 +223,30 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
221223
return nil
222224
}
223225

224-
func (c *Reconciler) checkPodFailed(tr *v1.TaskRun) (bool, v1.TaskRunReason, string) {
226+
func (c *Reconciler) checkPodFailed(ctx context.Context, tr *v1.TaskRun) (bool, v1.TaskRunReason, string) {
225227
for _, step := range tr.Status.Steps {
226228
if step.Waiting != nil {
227229
if _, found := podFailureReasons[step.Waiting.Reason]; found {
230+
if step.Waiting.Reason == ImagePullBackOff {
231+
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
232+
// only attempt to recover from the imagePullBackOff if specified
233+
if imagePullBackOffTimeOut.Seconds() != 0 {
234+
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
235+
if err != nil {
236+
message := fmt.Sprintf(`The step %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, step.Name, tr.Name, step.ImageID, err)
237+
return true, v1.TaskRunReasonImagePullFailed, message
238+
}
239+
for _, condition := range p.Status.Conditions {
240+
// check the pod condition to get the time when the pod was scheduled
241+
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
242+
if condition.Type == corev1.PodScheduled {
243+
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
244+
return false, "", ""
245+
}
246+
}
247+
}
248+
}
249+
}
228250
image := step.ImageID
229251
message := fmt.Sprintf(`The step %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, step.Name, tr.Name, image, step.Waiting.Message)
230252
return true, v1.TaskRunReasonImagePullFailed, message
@@ -234,6 +256,26 @@ func (c *Reconciler) checkPodFailed(tr *v1.TaskRun) (bool, v1.TaskRunReason, str
234256
for _, sidecar := range tr.Status.Sidecars {
235257
if sidecar.Waiting != nil {
236258
if _, found := podFailureReasons[sidecar.Waiting.Reason]; found {
259+
if sidecar.Waiting.Reason == ImagePullBackOff {
260+
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
261+
// only attempt to recover from the imagePullBackOff if specified
262+
if imagePullBackOffTimeOut.Seconds() != 0 {
263+
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
264+
if err != nil {
265+
message := fmt.Sprintf(`The sidecar %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, sidecar.Name, tr.Name, sidecar.ImageID, err)
266+
return true, v1.TaskRunReasonImagePullFailed, message
267+
}
268+
for _, condition := range p.Status.Conditions {
269+
// check the pod condition to get the time when the pod was scheduled
270+
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
271+
if condition.Type == corev1.PodScheduled {
272+
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
273+
return false, "", ""
274+
}
275+
}
276+
}
277+
}
278+
}
237279
image := sidecar.ImageID
238280
message := fmt.Sprintf(`The sidecar %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, sidecar.Name, tr.Name, image, sidecar.Waiting.Message)
239281
return true, v1.TaskRunReasonImagePullFailed, message

0 commit comments

Comments
 (0)