
Commit 5ee686b

Merge pull request kubernetes#128559 from lauralorenz/crashloopbackoff-refactorimagepullbackoff-e2enodecriproxytest
E2E Node tests for image pull backoff and crashloopbackoff behavior
2 parents f59dd4b + 9ab0d81 commit 5ee686b

File tree: 3 files changed, +223 -0 lines changed

+155
@@ -0,0 +1,155 @@
//go:build linux
// +build linux

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
    "context"
    "fmt"
    "time"

    podv1util "k8s.io/kubernetes/pkg/api/v1/pod"
    imageutils "k8s.io/kubernetes/test/utils/image"

    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
    "github.com/pkg/errors"
    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/uuid"
    "k8s.io/kubernetes/test/e2e/feature"
    "k8s.io/kubernetes/test/e2e/framework"
    e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    admissionapi "k8s.io/pod-security-admission/api"
)

const containerName = "restarts"

var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(), func() {
    f := framework.NewDefaultFramework("container-restart")
    f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

    ginkgo.Context("Container restart backs off", func() {

        ginkgo.BeforeEach(func() {
            if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
                ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
            }
        })

        ginkgo.AfterEach(func() {
            err := resetCRIProxyInjector(e2eCriProxy)
            framework.ExpectNoError(err)
        })

        ginkgo.It("Container restart backs off.", func(ctx context.Context) {
            // 0s, 0s, 10s, 30s, 70s, 150s, 310s
            doTest(ctx, f, 3, containerName, 7)
        })
    })

    ginkgo.Context("Alternate container restart backs off as expected", func() {

        tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
            initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(30 * time.Second)}
            initialConfig.FeatureGates = map[string]bool{"KubeletCrashLoopBackOffMax": true}
        })

        ginkgo.BeforeEach(func() {
            if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
                ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
            }
        })

        ginkgo.AfterEach(func() {
            err := resetCRIProxyInjector(e2eCriProxy)
            framework.ExpectNoError(err)
        })

        ginkgo.It("Alternate restart backs off.", func(ctx context.Context) {
            // 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
            doTest(ctx, f, 3, containerName, 13)
        })
    })
})

func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) {

    pod := e2epod.NewPodClient(f).Create(ctx, newFailAlwaysPod())
    podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute)
    gomega.Expect(podErr).To(gomega.HaveOccurred())

    // Hard wait 30 seconds for targetRestarts in the best case; longer timeout later will handle if infra was slow.
    time.Sleep(30 * time.Second)
    podErr = waitForContainerRestartedNTimes(ctx, f, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts)
    gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures")

    r, err := extractObservedBackoff(ctx, f, pod.Name, containerName)
    framework.ExpectNoError(err)

    gomega.Expect(r).Should(gomega.BeNumerically("<=", maxRestarts))
}

func extractObservedBackoff(ctx context.Context, f *framework.Framework, podName string, containerName string) (int32, error) {
    var r int32
    pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, podName, metav1.GetOptions{})
    if err != nil {
        return r, err
    }
    for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} {
        for _, cs := range statuses {
            if cs.Name == containerName {
                return cs.RestartCount, nil
            }
        }
    }
    return r, errors.Errorf("Could not find container status for container %s in pod %s", containerName, podName)
}

func newFailAlwaysPod() *v1.Pod {
    podName := "container-restart" + string(uuid.NewUUID())
    pod := &v1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Name: podName,
        },
        Spec: v1.PodSpec{
            Containers: []v1.Container{
                {
                    Name:            containerName,
                    Image:           imageutils.GetE2EImage(imageutils.BusyBox),
                    ImagePullPolicy: v1.PullIfNotPresent,
                },
            },
        },
    }
    return pod
}

func waitForContainerRestartedNTimes(ctx context.Context, f *framework.Framework, namespace string, podName string, containerName string, timeout time.Duration, target int) error {
    conditionDesc := fmt.Sprintf("A container in pod %s restarted at least %d times", podName, target)
    return e2epod.WaitForPodCondition(ctx, f.ClientSet, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
        cs, found := podv1util.GetContainerStatus(pod.Status.ContainerStatuses, containerName)
        if !found {
            return false, fmt.Errorf("could not find container %s in pod %s", containerName, podName)
        }
        return cs.RestartCount >= int32(target), nil
    })
}
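
The restart bounds asserted above (maxRestarts of 7 under the default backoff and 13 with a 30-second MaxContainerRestartPeriod) follow from the cumulative restart times listed in the test comments. The sketch below is illustrative only and not part of the commit; it assumes restarts follow exponential backoff with a 10-second initial delay, a doubling factor, and a configurable maximum delay, with the first two restarts effectively immediate, which is what the comments enumerate.

// Illustrative sketch, not part of this commit: reproduces the restart
// schedules enumerated in the test comments under an assumed exponential
// backoff (10s initial delay, doubling, capped at a maximum delay).
package main

import (
    "fmt"
    "time"
)

// restartSchedule returns the cumulative times at which restarts occur
// within the given horizon, assuming the first two restarts are immediate.
func restartSchedule(initial, maxDelay, horizon time.Duration) []time.Duration {
    schedule := []time.Duration{0, 0}
    elapsed, delay := time.Duration(0), initial
    for {
        elapsed += delay
        if elapsed > horizon {
            return schedule
        }
        schedule = append(schedule, elapsed)
        delay *= 2
        if delay > maxDelay {
            delay = maxDelay
        }
    }
}

func main() {
    horizon := 310 * time.Second // the last restart time the test comments list

    // Assumed default cap of 5 minutes: [0s 0s 10s 30s 1m10s 2m30s 5m10s],
    // i.e. 7 restarts, matching maxRestarts in the first It block.
    fmt.Println(restartSchedule(10*time.Second, 300*time.Second, horizon))

    // MaxContainerRestartPeriod of 30s: 13 restarts by 300s, matching
    // maxRestarts in the alternate It block.
    fmt.Println(restartSchedule(10*time.Second, 30*time.Second, horizon))
}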

test/e2e_node/criproxy_test.go

+14
@@ -84,6 +84,20 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() {
         })
     })

+    ginkgo.Context("Image pull backoff", func() {
+        ginkgo.BeforeEach(func() {
+            if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+                ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+            }
+        })
+
+        ginkgo.AfterEach(func() {
+            err := resetCRIProxyInjector(e2eCriProxy)
+            framework.ExpectNoError(err)
+        })
+    })
+
     ginkgo.Context("Inject a pull image timeout exception into the CriProxy", func() {
         ginkgo.BeforeEach(func() {
             if err := resetCRIProxyInjector(e2eCriProxy); err != nil {

test/e2e_node/image_pull_test.go

+54
@@ -34,6 +34,7 @@ import (
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     kubeletevents "k8s.io/kubernetes/pkg/kubelet/events"
+    "k8s.io/kubernetes/pkg/kubelet/images"
     "k8s.io/kubernetes/test/e2e/feature"
     "k8s.io/kubernetes/test/e2e/framework"
     e2epod "k8s.io/kubernetes/test/e2e/framework/pod"

@@ -230,6 +231,44 @@
         })

     })
+
+    ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) {
+        // inject PullImage failed to trigger backoff
+        expectedErr := fmt.Errorf("PullImage failed")
+        err := addCRIProxyInjector(e2eCriProxy, func(apiName string) error {
+            if apiName == criproxy.PullImage {
+                return expectedErr
+            }
+            return nil
+        })
+        framework.ExpectNoError(err)
+
+        pod := e2epod.NewPodClient(f).Create(ctx, newPullImageAlwaysPod())
+        podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "ImagePullBackOff", 1*time.Minute, func(pod *v1.Pod) (bool, error) {
+            if len(pod.Status.ContainerStatuses) > 0 && pod.Status.Reason == images.ErrImagePullBackOff.Error() {
+                return true, nil
+            }
+            return false, nil
+        })
+        gomega.Expect(podErr).To(gomega.HaveOccurred())
+
+        eventMsg, err := getFailedToPullImageMsg(ctx, f, pod.Name)
+        framework.ExpectNoError(err)
+        isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error())
+        gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy"))
+
+        // Hard wait 30 seconds for image pulls to repeatedly back off.
+        time.Sleep(30 * time.Second)
+
+        e, err := getImagePullAttempts(ctx, f, pod.Name)
+        framework.ExpectNoError(err)
+        // 3 would take 10s best case.
+        gomega.Expect(e.Count).Should(gomega.BeNumerically(">=", 3))
+        // 7 would take 310s best case, if the infra went slow.
+        gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 7))
+
+    })
+
 })

 func getPodImagePullDurations(ctx context.Context, f *framework.Framework, testpods []*v1.Pod) (map[string]*pulledStruct, map[string]metav1.Time, map[string]metav1.Time, error) {

@@ -343,3 +382,18 @@ func getDurationsFromPulledEventMsg(msg string) (*pulledStruct, error) {
         pulledIncludeWaitingDuration: pulledIncludeWaitingDuration,
     }, nil
 }
+
+func getImagePullAttempts(ctx context.Context, f *framework.Framework, podName string) (v1.Event, error) {
+    event := v1.Event{}
+    e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{})
+    if err != nil {
+        return event, err
+    }
+
+    for _, event := range e.Items {
+        if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.PullingImage {
+            return event, nil
+        }
+    }
+    return event, nil
+}
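
The 3 to 7 bound on e.Count above follows from the same cumulative schedule the container restart test comments enumerate: pull attempts at roughly 0s, 0s, 10s, 30s, 70s, 150s, and 310s, with the PullingImage event's Count incrementing once per attempt. A minimal sketch of that arithmetic, illustrative only and not part of the commit:

// Illustrative sketch, not part of this commit. It assumes image pull
// retries land at the cumulative times the test comments describe.
package main

import "fmt"

// attemptsWithin counts how many cumulative attempt times fall within a horizon.
func attemptsWithin(attemptTimes []int, horizonSeconds int) int {
    n := 0
    for _, t := range attemptTimes {
        if t <= horizonSeconds {
            n++
        }
    }
    return n
}

func main() {
    // Assumed cumulative pull-attempt times in seconds.
    attemptTimes := []int{0, 0, 10, 30, 70, 150, 310}

    fmt.Println(attemptsWithin(attemptTimes, 10))  // 3: "3 would take 10s best case"
    fmt.Println(attemptsWithin(attemptTimes, 310)) // 7: "7 would take 310s best case, if the infra went slow"
}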
