Skip to content

Commit 7fe41da

Browse files
KEP-4603: Node specific kubelet config for maximum backoff down to 1 second (kubernetes#128374)
* Add feature gate, API, and conflict validation tests for enablecrashloopbackoffmax Signed-off-by: Laura Lorenz <[email protected]> * Handle when current base is longer than node max Signed-off-by: Laura Lorenz <[email protected]> * Update pkg/features/kube_features.go Co-authored-by: Tsubasa Nagasawa <[email protected]> * Fix indentation Signed-off-by: Laura Lorenz <[email protected]> * Follow convention for success test Signed-off-by: Laura Lorenz <[email protected]> * Normalize casing, and change field to Duration Signed-off-by: Laura Lorenz <[email protected]> * Fix json name and some other casing errors Signed-off-by: Laura Lorenz <[email protected]> * Another one I missed before Signed-off-by: Laura Lorenz <[email protected]> * Don't clobber global max function Signed-off-by: Laura Lorenz <[email protected]> * Change to flat value in defaults.go Signed-off-by: Laura Lorenz <[email protected]> * Streamline validation and defaults Signed-off-by: Laura Lorenz <[email protected]> * Fix typecheck Signed-off-by: Laura Lorenz <[email protected]> * Lint Signed-off-by: Laura Lorenz <[email protected]> * Tighten up validation for subsecond values Signed-off-by: Laura Lorenz <[email protected]> * Rename field from MaxBackOffPeriod to MaxContainerRestartPeriod Signed-off-by: Laura Lorenz <[email protected]> * A few missed references to renames Signed-off-by: Laura Lorenz <[email protected]> * Only compare flags in flags test Signed-off-by: Laura Lorenz <[email protected]> * Don't mess with SetDefault signature Nobody messes with SetDefault signature Signed-off-by: Laura Lorenz <[email protected]> * Fix stale signature change, and update test data Signed-off-by: Laura Lorenz <[email protected]> * Inspect current feature gates at defaulting time Signed-off-by: Laura Lorenz <[email protected]> * Don't use the global feature gate for temp usage Signed-off-by: Laura Lorenz <[email protected]> * Expose default error, and some comments Signed-off-by: Laura Lorenz <[email protected]> * Hint fuzzer for less arbitrary values to FeatureGates Signed-off-by: Laura Lorenz <[email protected]> --------- Signed-off-by: Laura Lorenz <[email protected]> Co-authored-by: Tsubasa Nagasawa <[email protected]>
1 parent 591c75e commit 7fe41da

File tree

20 files changed

+664
-276
lines changed

20 files changed

+664
-276
lines changed

cmd/kubelet/app/options/options_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ func TestRoundTrip(t *testing.T) {
100100
}
101101
continue
102102
}
103-
if !reflect.DeepEqual(modifiedFlags, outputFlags) {
104-
t.Errorf("%s: flags did not round trip: %s", testCase.name, cmp.Diff(modifiedFlags, outputFlags))
103+
if !reflect.DeepEqual(modifiedFlags.KubeletFlags, outputFlags.KubeletFlags) {
104+
t.Errorf("%s: flags did not round trip: %s", testCase.name, cmp.Diff(modifiedFlags.KubeletFlags, outputFlags.KubeletFlags))
105105
continue
106106
}
107107
}

cmd/kubelet/app/server.go

+1
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ func mergeKubeletConfigurations(kubeletConfig *kubeletconfiginternal.KubeletConf
367367
}
368368
// apply defaulting after decoding
369369
kubeletconfigv1beta1conversion.SetDefaults_KubeletConfiguration(versionedConfig)
370+
370371
// convert back to internal config
371372
if err := kubeletconfigv1beta1conversion.Convert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(versionedConfig, kubeletConfig, nil); err != nil {
372373
return fmt.Errorf("failed to convert merged config to internal kubelet configuration: %w", err)

pkg/features/kube_features.go

+9
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,15 @@ const (
232232
// status from DRA drivers.
233233
DRAResourceClaimDeviceStatus featuregate.Feature = "DRAResourceClaimDeviceStatus"
234234

235+
// owner: @lauralorenz
236+
// kep: https://kep.k8s.io/4603
239+
//
240+
// Enables support for configurable per-node backoff maximums for restarting
241+
// containers (aka containers in CrashLoopBackOff)
242+
KubeletCrashLoopBackOffMax featuregate.Feature = "KubeletCrashLoopBackOffMax"
243+
235244
// owner: @harche
236245
// kep: http://kep.k8s.io/3386
237246
//

pkg/features/versioned_kube_features.go

+4
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
187187
{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
188188
},
189189

190+
KubeletCrashLoopBackOffMax: {
191+
{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
192+
},
193+
190194
ElasticIndexedJob: {
191195
{Version: version.MustParse("1.27"), Default: true, PreRelease: featuregate.Beta},
192196
{Version: version.MustParse("1.31"), Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // GA in 1.31, remove in 1.32

pkg/generated/openapi/zz_generated.openapi.go

+29-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/kubelet/apis/config/fuzzer/fuzzer.go

+4
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
121121
obj.EnableSystemLogHandler = true
122122
obj.MemoryThrottlingFactor = ptr.To(rand.Float64())
123123
obj.LocalStorageCapacityIsolation = true
124+
obj.FeatureGates = map[string]bool{
125+
"AllAlpha": false,
126+
"AllBeta": true,
127+
}
124128
},
125129
}
126130
}

pkg/kubelet/apis/config/helpers_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -302,5 +302,6 @@ var (
302302
"Tracing.SamplingRatePerMillion",
303303
"LocalStorageCapacityIsolation",
304304
"FailCgroupV1",
305+
"CrashLoopBackOff.MaxContainerRestartPeriod",
305306
)
306307
)

pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ cpuCFSQuota: true
2525
cpuCFSQuotaPeriod: 100ms
2626
cpuManagerPolicy: none
2727
cpuManagerReconcilePeriod: 10s
28+
crashLoopBackOff: {}
2829
enableControllerAttachDetach: true
2930
enableDebugFlagsHandler: true
3031
enableDebuggingHandlers: true

pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ cpuCFSQuota: true
2525
cpuCFSQuotaPeriod: 100ms
2626
cpuManagerPolicy: none
2727
cpuManagerReconcilePeriod: 10s
28+
crashLoopBackOff: {}
2829
enableControllerAttachDetach: true
2930
enableDebugFlagsHandler: true
3031
enableDebuggingHandlers: true

pkg/kubelet/apis/config/types.go

+16
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,12 @@ type KubeletConfiguration struct {
506506
// option is explicitly enabled.
507507
// +optional
508508
FailCgroupV1 bool
509+
510+
// CrashLoopBackOff contains config to modify node-level parameters for
511+
// container restart behavior
512+
// +featureGate=KubeletCrashLoopBackOffMax
513+
// +optional
514+
CrashLoopBackOff CrashLoopBackOffConfig
509515
}
510516

511517
// KubeletAuthorizationMode denotes the authorization mode for the kubelet
@@ -684,3 +690,13 @@ type MemorySwapConfiguration struct {
684690
// +optional
685691
SwapBehavior string
686692
}
693+
694+
// CrashLoopBackOffConfig is used for setting configuration for this kubelet's
695+
// container restart behavior
696+
type CrashLoopBackOffConfig struct {
697+
// MaxContainerRestartPeriod is the maximum duration the backoff delay can accrue
698+
// to for container restarts, minimum 1 second, maximum 300 seconds.
699+
// +featureGate=KubeletCrashLoopBackOffMax
700+
// +optional
701+
MaxContainerRestartPeriod *metav1.Duration
702+
}

pkg/kubelet/apis/config/v1beta1/defaults.go

+24
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,18 @@ limitations under the License.
1717
package v1beta1
1818

1919
import (
20+
"fmt"
2021
"time"
2122

2223
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324
kruntime "k8s.io/apimachinery/pkg/runtime"
2425
kubeletconfigv1beta1 "k8s.io/kubelet/config/v1beta1"
2526

2627
// TODO: Cut references to k8s.io/kubernetes, eventually there should be none from this package
28+
utilfeature "k8s.io/apiserver/pkg/util/feature"
2729
logsapi "k8s.io/component-base/logs/api/v1"
2830
"k8s.io/kubernetes/pkg/cluster/ports"
31+
"k8s.io/kubernetes/pkg/features"
2932
"k8s.io/kubernetes/pkg/kubelet/qos"
3033
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
3134
"k8s.io/utils/ptr"
@@ -39,6 +42,8 @@ const (
3942
DefaultPodLogsDir = "/var/log/pods"
4043
// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
4144
DefaultMemoryThrottlingFactor = 0.9
45+
// MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test
46+
MaxContainerBackOff = 300 * time.Second
4247
)
4348

4449
var (
@@ -53,6 +58,19 @@ func addDefaultingFuncs(scheme *kruntime.Scheme) error {
5358
}
5459

5560
func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfiguration) {
61+
62+
// TODO(lauralorenz): Reassess conditional feature gating on defaults. Here
63+
// we 1) copy the gates to a local var, unilaterally merge it with the gate
64+
// config while being defaulted. Alternatively we could unilaterally set the
65+
// default value, later check the gate and wipe it if needed, like API
66+
// strategy does for gate-disabled fields. Meanwhile, KubeletConfiguration
67+
// is increasingly dynamic and the configured gates may change depending on
68+
// when this is called. See also validation.go.
69+
localFeatureGate := utilfeature.DefaultMutableFeatureGate.DeepCopy()
70+
if err := localFeatureGate.SetFromMap(obj.FeatureGates); err != nil {
71+
panic(fmt.Sprintf("failed to merge global and in-flight KubeletConfiguration while setting defaults, error: %v", err))
72+
}
73+
5674
if obj.EnableServer == nil {
5775
obj.EnableServer = ptr.To(true)
5876
}
@@ -286,4 +304,10 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura
286304
if obj.PodLogsDir == "" {
287305
obj.PodLogsDir = DefaultPodLogsDir
288306
}
307+
308+
if localFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
309+
if obj.CrashLoopBackOff.MaxContainerRestartPeriod == nil {
310+
obj.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: MaxContainerBackOff}
311+
}
312+
}
289313
}

0 commit comments

Comments
 (0)