Skip to content

Commit 924a2a4

Browse files
sjenning, k8s-publishing-bot
authored and committed
UPSTREAM: 54410: Cpu manager reconcile loop - restore state
Origin-commit: aecd8ed5b576df6a38c58617c5a754504e233934
1 parent cf6d145 commit 924a2a4

File tree

3 files changed

+30
-3
lines changed

3 files changed

+30
-3
lines changed

pkg/kubelet/cm/cpumanager/cpu_manager.go

+21-2
Original file line number | Diff line number | Diff line change
@@ -152,8 +152,8 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
152152
}
153153

154154
func (m *manager) Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService) {
155-
glog.Infof("[cpumanger] starting with %s policy", m.policy.Name())
156-
glog.Infof("[cpumanger] reconciling every %v", m.reconcilePeriod)
155+
glog.Infof("[cpumanager] starting with %s policy", m.policy.Name())
156+
glog.Infof("[cpumanager] reconciling every %v", m.reconcilePeriod)
157157

158158
m.activePods = activePods
159159
m.podStatusProvider = podStatusProvider
@@ -234,6 +234,25 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
234234
continue
235235
}
236236

237+
// Check whether container is present in state, there may be 3 reasons why it's not present:
238+
// - policy does not want to track the container
239+
// - kubelet has just been restarted - and there is no previous state file
240+
// - container has been removed from state by RemoveContainer call (DeletionTimestamp is set)
241+
if _, ok := m.state.GetCPUSet(containerID); !ok {
242+
if status.Phase == v1.PodRunning && pod.DeletionTimestamp == nil {
243+
glog.V(4).Infof("[cpumanager] reconcileState: container is not present in state - trying to add (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
244+
err := m.AddContainer(pod, &container, containerID)
245+
if err != nil {
246+
glog.Errorf("[cpumanager] reconcileState: failed to add container (pod: %s, container: %s, container id: %s, error: %v)", pod.Name, container.Name, containerID, err)
247+
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
248+
}
249+
} else {
250+
// if DeletionTimestamp is set, pod has already been removed from state
251+
// skip the pod/container since it's not running and will be deleted soon
252+
continue
253+
}
254+
}
255+
237256
cset := m.state.GetCPUSetOrDefault(containerID)
238257
if cset.IsEmpty() {
239258
// NOTE: This should not happen outside of tests.

pkg/kubelet/cm/cpumanager/policy.go

+2
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@ import (
2525
type Policy interface {
2626
Name() string
2727
Start(s state.State)
28+
// AddContainer call is idempotent
2829
AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error
30+
// RemoveContainer call is idempotent
2931
RemoveContainer(s state.State, containerID string) error
3032
}

pkg/kubelet/cm/cpumanager/policy_static.go

+7-1
Original file line number | Diff line number | Diff line change
@@ -156,9 +156,15 @@ func (p *staticPolicy) assignableCPUs(s state.State) cpuset.CPUSet {
156156
}
157157

158158
func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
159-
glog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
160159
if numCPUs := guaranteedCPUs(pod, container); numCPUs != 0 {
160+
glog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
161161
// container belongs in an exclusively allocated pool
162+
163+
if _, ok := s.GetCPUSet(containerID); ok {
164+
glog.Infof("[cpumanager] static policy: container already present in state, skipping (container: %s, container id: %s)", container.Name, containerID)
165+
return nil
166+
}
167+
162168
cpuset, err := p.allocateCPUs(s, numCPUs)
163169
if err != nil {
164170
glog.Errorf("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err)

0 commit comments

Comments
 (0)