Commit da2734d

haircommander authored and bertinatto committed
UPSTREAM: <carry>: disable load balancing on created cgroups when managed is enabled

Previously, cpu load balancing was enabled in cri-o by manually changing the sched_domain of cpus in sysfs. However, RHEL 9 dropped support for this knob, instead requiring it be changed in cgroups directly. To disable cpu load balancing on cgroupv1, the specified cgroup must have cpuset.sched_load_balance set to 0, as must all of that cgroup's parents, plus all of the cgroups that contain a subset of the cpus that load balancing is being disabled for.

By default, a cpuset inherits its cpu set from its parent and has sched_load_balance set to 1. Since the cpus that need load balancing disabled have to stay in the root cgroup, all slices will inherit the full cpuset. Rather than rebalancing every cgroup whenever a new guaranteed cpuset cgroup is created, the approach this PR takes is to set load balancing to disabled for all slices. Since slices definitionally don't have any processes in them, setting load balancing there won't affect the kernel's actual scheduling decisions. All it does is open the opportunity for CRI-O to actually disable load balancing for the containers that request it.

Signed-off-by: Peter Hunt <[email protected]>

UPSTREAM: <carry>: kubelet/cm: disable cpu load balancing on slices when using static cpu manager policy

There are situations where disabling cpu load balancing is desired even when the kubelet is not in a managed state. Instead of keying off that condition, set the cpu load balancing parameter for new slices whenever the cpu manager policy is static.

Signed-off-by: Peter Hunt <[email protected]>

UPSTREAM: <carry>: cm: reorder setting of sched_load_balance for sandbox slice

If we call mgr.Apply() first, libcontainer's cpusetCopyIfNeeded() will copy the parent cpuset and set load balancing to 1 by default. This causes the kernel to treat the cpus as load balanced for a brief moment, which causes churn. Instead, create the cgroup and set sched_load_balance first, then have Apply() copy the values into it.

Signed-off-by: Peter Hunt <[email protected]>

UPSTREAM: <carry>: kubelet/cm: use MkdirAll when creating cpuset to ignore file exists error

Signed-off-by: Peter Hunt <[email protected]>
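Editor's note: the cgroup v1 knob the patch manipulates is a plain per-cgroup file. A minimal sketch of the write, assuming a cgroup v1 cpuset hierarchy mounted at /sys/fs/cgroup/cpuset; the kubepods.slice path below is only an example, not something the patch guarantees:

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// disableLoadBalance writes "0" to cpuset.sched_load_balance, telling the
// kernel not to build a sched domain spanning this cpuset's cpus.
func disableLoadBalance(cgroupPath string) error {
	knob := filepath.Join(cgroupPath, "cpuset.sched_load_balance")
	return os.WriteFile(knob, []byte("0"), 0o644)
}

func main() {
	// Example path only; the real slices are created by the kubelet.
	if err := disableLoadBalance("/sys/fs/cgroup/cpuset/kubepods.slice"); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}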
1 parent fcb43d6 · commit da2734d

6 files changed: +40, -1 lines changed

pkg/kubelet/cm/cgroup_manager_linux.go (+24, -1)

@@ -29,6 +29,7 @@ import (
 	"github.com/opencontainers/cgroups/fscommon"
 	libcontainercgroupmanager "github.com/opencontainers/cgroups/manager"
 	cgroupsystemd "github.com/opencontainers/cgroups/systemd"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"k8s.io/klog/v2"
 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"

@@ -145,6 +146,10 @@ type cgroupCommon struct {

 	// useSystemd tells if systemd cgroup manager should be used.
 	useSystemd bool
+
+	// cpuLoadBalanceDisable tells whether kubelet should disable
+	// cpu load balancing on new cgroups it creates.
+	cpuLoadBalanceDisable bool
 }

 // Make sure that cgroupV1impl and cgroupV2impl implement the CgroupManager interface

@@ -389,6 +394,25 @@ func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
 		return err
 	}

+	// Disable cpuset.sched_load_balance for all cgroups Kubelet creates.
+	// This way, CRI can disable sched_load_balance for pods that must have load balance
+	// disabled, but the slices can contain all cpus (as the guaranteed cpus are known dynamically).
+	// Note: this should be done before Apply(-1) below, as Apply contains cpusetCopyIfNeeded(), which will
+	// populate the cpuset with the parent's cpuset. However, it will be initialized to sched_load_balance=1
+	// which will cause the kernel to move all cpusets out of their isolated sched_domain, causing unnecessary churn.
+	if m.cpuLoadBalanceDisable && !libcontainercgroups.IsCgroup2UnifiedMode() {
+		path := manager.Path("cpuset")
+		if path == "" {
+			return fmt.Errorf("failed to find cpuset for newly created cgroup")
+		}
+		if err := os.MkdirAll(path, 0o755); err != nil {
+			return fmt.Errorf("failed to create cpuset for newly created cgroup: %w", err)
+		}
+		if err := cgroups.WriteFile(path, "cpuset.sched_load_balance", "0"); err != nil {
+			return err
+		}
+	}
+
 	// Apply(-1) is a hack to create the cgroup directories for each resource
 	// subsystem. The function [cgroups.Manager.apply()] applies cgroup
 	// configuration to the process with the specified pid.

@@ -404,7 +428,6 @@ func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
 	if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil {
 		utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err))
 	}
-
 	return nil
 }
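Editor's note: a quick way to sanity-check the new behavior on a cgroup v1 node is to read the knob back after the kubelet has created its slices. A hedged sketch; the slice path is an example and not guaranteed by the diff:

package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Expect "0" once Create() has run with cpuLoadBalanceDisable set.
	b, err := os.ReadFile("/sys/fs/cgroup/cpuset/kubepods.slice/cpuset.sched_load_balance")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	fmt.Println("sched_load_balance =", strings.TrimSpace(string(b)))
}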

pkg/kubelet/cm/cgroup_manager_unsupported.go (+3)

@@ -93,6 +93,9 @@ func (m *unsupportedCgroupManager) SetCgroupConfig(name CgroupName, resourceConf
 	return errNotSupported
 }

+func (m *unsupportedCgroupManager) SetCPULoadBalanceDisable() {
+}
+
 var RootCgroupName = CgroupName([]string{})

 func NewCgroupName(base CgroupName, components ...string) CgroupName {

pkg/kubelet/cm/cgroup_v1_manager_linux.go (+4)

@@ -143,3 +143,7 @@ func (c *cgroupV1impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, e
 func (c *cgroupV1impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
 	return readCgroupMemoryConfig(cgroupPath, cgroupv1MemLimitFile)
 }
+
+func (m *cgroupV1impl) SetCPULoadBalanceDisable() {
+	m.cpuLoadBalanceDisable = true
+}

pkg/kubelet/cm/cgroup_v2_manager_linux.go (+4)

@@ -179,3 +179,7 @@ func cpuSharesToCPUWeight(cpuShares uint64) uint64 {
 func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
 	return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
 }
+
+func (m *cgroupV2impl) SetCPULoadBalanceDisable() {
+	m.cpuLoadBalanceDisable = true
+}
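Editor's note: the context lines above show the cgroup v2 weight-to-shares conversion; a quick spot check of the formula's endpoints (nothing here beyond the arithmetic already in the diff):

package main

import "fmt"

func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
	return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
}

func main() {
	fmt.Println(cpuWeightToCPUShares(1))     // 2: minimum v2 weight maps to minimum v1 shares
	fmt.Println(cpuWeightToCPUShares(10000)) // 262144: maximum weight maps to maximum shares
}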

pkg/kubelet/cm/container_manager_linux.go (+3)

@@ -250,6 +250,9 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
 	cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
 	cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
 	nodeConfig.CgroupVersion = cgroupManager.Version()
+	if nodeConfig.CPUManagerPolicy == string(cpumanager.PolicyStatic) {
+		cgroupManager.SetCPULoadBalanceDisable()
+	}
 	// Check if Cgroup-root actually exists on the node
 	if nodeConfig.CgroupsPerQOS {
 		// this does default to / when enabled, but this tests against regressions.
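Editor's note: putting the pieces together, the flow is: a static CPU manager policy flips the toggle at container-manager construction, and every later Create() on a cgroup v1 node writes the knob. A simplified, self-contained sketch with the types collapsed; this is not the real kubelet wiring:

package main

import "fmt"

type cgroupManager struct{ cpuLoadBalanceDisable bool }

func (m *cgroupManager) SetCPULoadBalanceDisable() { m.cpuLoadBalanceDisable = true }

// Create mirrors the gate in cgroup_manager_linux.go: only cgroup v1
// with the toggle set writes cpuset.sched_load_balance=0.
func (m *cgroupManager) Create(cgroupV2 bool) {
	if m.cpuLoadBalanceDisable && !cgroupV2 {
		fmt.Println(`would write "0" to cpuset.sched_load_balance`)
	}
}

func main() {
	m := &cgroupManager{}
	policy := "static" // stands in for nodeConfig.CPUManagerPolicy
	if policy == "static" {
		m.SetCPULoadBalanceDisable()
	}
	m.Create(false) // cgroup v1 node
}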

pkg/kubelet/cm/types.go (+2)

@@ -93,6 +93,8 @@ type CgroupManager interface {
 	SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error
 	// Version of the cgroup implementation on the host
 	Version() int
+	// Toggle whether CPU load balancing should be disabled for new cgroups the kubelet creates
+	SetCPULoadBalanceDisable()
 }

 // QOSContainersInfo stores the names of containers per qos
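Editor's note: because the method is added to the CgroupManager interface itself, every implementation, including any test fakes, must now provide it, which is why the diff touches the v1, v2, and unsupported managers. A hypothetical minimal fake, for illustration only:

package cm

// fakeCgroupManager is hypothetical and not part of the diff; it only
// records that the toggle fired, which is enough for a unit test.
type fakeCgroupManager struct {
	cpuLoadBalanceDisabled bool
}

func (f *fakeCgroupManager) SetCPULoadBalanceDisable() {
	f.cpuLoadBalanceDisabled = true
}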
