Skip to content

Commit 26a60a0

Browse files
haircommander authored and bertinatto committed
UPSTREAM: <carry>: disable load balancing on created cgroups when managed is enabled
Previously, disabling cpu load balancing was done in cri-o by manually changing the sched_domain of cpus in sysfs. However, RHEL 9 dropped support for this knob, instead requiring it be changed in cgroups directly. To disable cpu load balancing on cgroupv1, the specified cgroup must have cpuset.sched_load_balance set to 0, as must all of that cgroup's parents and all of the cgroups that contain a subset of the cpus that load balancing is disabled for. By default, all cpusets inherit the cpu set from their parent and have sched_load_balance set to 1. Since we need to keep the cpus that need load balancing disabled in the root cgroup, all slices will inherit the full cpuset. Rather than rebalancing every cgroup whenever a new guaranteed cpuset cgroup is created, the approach this PR takes is to set load balancing to disabled for all slices. Since slices definitionally don't have any processes in them, setting load balancing won't affect the actual scheduling decisions of the kernel. All it will do is open the opportunity for CRI-O to actually set load balancing to disabled for containers that request it.

Signed-off-by: Peter Hunt <[email protected]>

UPSTREAM: <carry>: kubelet/cm: disable cpu load balancing on slices when using static cpu manager policy

There are situations where cpu load balance disabling is desired when the kubelet is not in managed state. Instead of using that condition, set the cpu load balancing parameter for new slices when the cpu policy is static.

Signed-off-by: Peter Hunt <[email protected]>

UPSTREAM: <carry>: cm: reorder setting of sched_load_balance for sandbox slice

If we call mgr.Apply() first, libcontainer's cpusetCopyIfNeeded() will copy the parent cpuset and set load balancing to 1 by default. This causes the kernel to briefly move the cpus out of their isolated sched_domain (i.e. load balance them again), which causes churn. Instead, create the cgroup and set load balance, then have Apply() copy the values into it.

Signed-off-by: Peter Hunt <[email protected]>

UPSTREAM: <carry>: kubelet/cm: use MkdirAll when creating cpuset to ignore file exists error

Signed-off-by: Peter Hunt <[email protected]>
1 parent 3bdbf8e commit 26a60a0

6 files changed

+40
-1
lines changed

pkg/kubelet/cm/cgroup_manager_linux.go

+24-1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"sync"
2626
"time"
2727

28+
"github.com/opencontainers/runc/libcontainer/cgroups"
2829
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
2930
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
3031
libcontainercgroupmanager "github.com/opencontainers/runc/libcontainer/cgroups/manager"
@@ -146,6 +147,10 @@ type cgroupCommon struct {
146147

147148
// useSystemd tells if systemd cgroup manager should be used.
148149
useSystemd bool
150+
151+
// cpuLoadBalanceDisable tells whether kubelet should disable
152+
// cpu load balancing on new cgroups it creates.
153+
cpuLoadBalanceDisable bool
149154
}
150155

151156
// Make sure that cgroupV1impl and cgroupV2impl implement the CgroupManager interface
@@ -390,6 +395,25 @@ func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
390395
return err
391396
}
392397

398+
// Disable cpuset.sched_load_balance for all cgroups Kubelet creates.
399+
// This way, CRI can disable sched_load_balance for pods that must have load balance
400+
// disabled, but the slices can contain all cpus (as the guaranteed cpus are known dynamically).
401+
// Note: this should be done before Apply(-1) below, as Apply contains cpusetCopyIfNeeded(), which will
402+
// populate the cpuset with the parent's cpuset. However, it will be initialized to sched_load_balance=1
403+
// which will cause the kernel to move all cpusets out of their isolated sched_domain, causing unnecessary churn.
404+
if m.cpuLoadBalanceDisable && !libcontainercgroups.IsCgroup2UnifiedMode() {
405+
path := manager.Path("cpuset")
406+
if path == "" {
407+
return fmt.Errorf("Failed to find cpuset for newly created cgroup")
408+
}
409+
if err := os.MkdirAll(path, 0o755); err != nil {
410+
return fmt.Errorf("failed to create cpuset for newly created cgroup: %w", err)
411+
}
412+
if err := cgroups.WriteFile(path, "cpuset.sched_load_balance", "0"); err != nil {
413+
return err
414+
}
415+
}
416+
393417
// Apply(-1) is a hack to create the cgroup directories for each resource
394418
// subsystem. The function [cgroups.Manager.apply()] applies cgroup
395419
// configuration to the process with the specified pid.
@@ -405,7 +429,6 @@ func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
405429
if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil {
406430
utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err))
407431
}
408-
409432
return nil
410433
}
411434

pkg/kubelet/cm/cgroup_manager_unsupported.go

+3
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ func (m *unsupportedCgroupManager) SetCgroupConfig(name CgroupName, resourceConf
9393
return errNotSupported
9494
}
9595

96+
func (m *unsupportedCgroupManager) SetCPULoadBalanceDisable() {
97+
}
98+
9699
var RootCgroupName = CgroupName([]string{})
97100

98101
func NewCgroupName(base CgroupName, components ...string) CgroupName {

pkg/kubelet/cm/cgroup_v1_manager_linux.go

+4
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,7 @@ func (c *cgroupV1impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, e
143143
func (c *cgroupV1impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
144144
return readCgroupMemoryConfig(cgroupPath, cgroupv1MemLimitFile)
145145
}
146+
147+
func (m *cgroupV1impl) SetCPULoadBalanceDisable() {
148+
m.cpuLoadBalanceDisable = true
149+
}

pkg/kubelet/cm/cgroup_v2_manager_linux.go

+4
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,7 @@ func cpuSharesToCPUWeight(cpuShares uint64) uint64 {
175175
func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
176176
return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
177177
}
178+
179+
func (m *cgroupV2impl) SetCPULoadBalanceDisable() {
180+
m.cpuLoadBalanceDisable = true
181+
}

pkg/kubelet/cm/container_manager_linux.go

+3
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,9 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
251251
cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
252252
cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
253253
nodeConfig.CgroupVersion = cgroupManager.Version()
254+
if nodeConfig.CPUManagerPolicy == string(cpumanager.PolicyStatic) {
255+
cgroupManager.SetCPULoadBalanceDisable()
256+
}
254257
// Check if Cgroup-root actually exists on the node
255258
if nodeConfig.CgroupsPerQOS {
256259
// this does default to / when enabled, but this tests against regressions.

pkg/kubelet/cm/types.go

+2
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ type CgroupManager interface {
9393
SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error
9494
// Version of the cgroup implementation on the host
9595
Version() int
96+
// Toggle whether CPU load balancing should be disabled for new cgroups the kubelet creates
97+
SetCPULoadBalanceDisable()
9698
}
9799

98100
// QOSContainersInfo stores the names of containers per qos

0 commit comments

Comments
 (0)