
Commit bc41996

Merge pull request #1974 from dashpole/cherrypick_to_v0.28
Cherrypick #1971, #1969, #1964, #1963 to release v0.28
2 parents 0b8d4a2 + 9ab811d commit bc41996

File tree

6 files changed: +63 -44 lines changed

accelerators/nvidia.go (+22 -27)
@@ -31,7 +31,10 @@ import (
 )
 
 type NvidiaManager struct {
-	sync.RWMutex
+	sync.Mutex
+
+	// true if there are NVIDIA devices present on the node
+	devicesPresent bool
 
 	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
 	nvmlInitialized bool
@@ -51,20 +54,9 @@ func (nm *NvidiaManager) Setup() {
 		return
 	}
 
-	nm.initializeNVML()
-	if nm.nvmlInitialized {
-		return
-	}
-	go func() {
-		glog.V(2).Info("Starting goroutine to initialize NVML")
-		// TODO: use globalHousekeepingInterval
-		for range time.Tick(time.Minute) {
-			nm.initializeNVML()
-			if nm.nvmlInitialized {
-				return
-			}
-		}
-	}()
+	nm.devicesPresent = true
+
+	initializeNVML(nm)
 }
 
 // detectDevices returns true if a device with given pci id is present on the node.
@@ -91,20 +83,18 @@ func detectDevices(vendorId string) bool {
 }
 
 // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
-func (nm *NvidiaManager) initializeNVML() {
+// This is defined as a variable to help in testing.
+var initializeNVML = func(nm *NvidiaManager) {
 	if err := gonvml.Initialize(); err != nil {
 		// This is under a logging level because otherwise we may cause
 		// log spam if the drivers/nvml is not installed on the system.
 		glog.V(4).Infof("Could not initialize NVML: %v", err)
 		return
 	}
+	nm.nvmlInitialized = true
 	numDevices, err := gonvml.DeviceCount()
 	if err != nil {
 		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
-		nm.Lock()
-		// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
-		nm.nvmlInitialized = true
-		nm.Unlock()
 		return
 	}
 	glog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
@@ -122,10 +112,6 @@ func (nm *NvidiaManager) initializeNVML() {
 		}
 		nm.nvidiaDevices[int(minorNumber)] = device
 	}
-	nm.Lock()
-	// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
-	nm.nvmlInitialized = true
-	nm.Unlock()
 }
 
 // Destroy shuts down NVML.
@@ -139,12 +125,21 @@ func (nm *NvidiaManager) Destroy() {
 // present in the devices.list file in the given devicesCgroupPath.
 func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 	nc := &NvidiaCollector{}
-	nm.RLock()
+
+	if !nm.devicesPresent {
+		return nc, nil
+	}
+	// Makes sure that we don't call initializeNVML() concurrently and
+	// that we only call initializeNVML() when it's not initialized.
+	nm.Lock()
+	if !nm.nvmlInitialized {
+		initializeNVML(nm)
+	}
 	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
-		nm.RUnlock()
+		nm.Unlock()
 		return nc, nil
 	}
-	nm.RUnlock()
+	nm.Unlock()
 	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
 	if err != nil {
 		return nc, err
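
Taken together, these hunks replace the background-goroutine initialization in Setup() with lazy, at-most-once initialization in GetCollector(), guarded by a plain mutex. Below is a minimal standalone sketch of that pattern; the manager, lazyInit, and getCollector names are illustrative stand-ins, not cadvisor's API.

package main

import (
	"fmt"
	"sync"
)

// manager mirrors the shape of the patched NvidiaManager: a plain mutex plus
// two flags that gate whether initialization is ever attempted.
type manager struct {
	sync.Mutex
	devicesPresent bool
	initialized    bool
}

// lazyInit is a package-level function variable, like initializeNVML above,
// so tests can swap it out. Here it just pretends the library loaded.
var lazyInit = func(m *manager) {
	m.initialized = true
}

// getCollector sketches the new GetCollector flow: bail out cheaply when no
// devices were detected, otherwise serialize initialization under the mutex
// and retry it until it succeeds.
func (m *manager) getCollector() string {
	if !m.devicesPresent {
		return "empty collector"
	}
	m.Lock()
	if !m.initialized {
		lazyInit(m)
	}
	if !m.initialized {
		m.Unlock()
		return "empty collector"
	}
	m.Unlock()
	return "real collector"
}

func main() {
	m := &manager{devicesPresent: true}
	fmt.Println(m.getCollector()) // initialization happens here, on first use
	fmt.Println(m.getCollector()) // already initialized, lazyInit is skipped
}

Switching from sync.RWMutex to sync.Mutex follows from the new flow: GetCollector may now mutate state by initializing NVML, so a read lock would no longer be sufficient.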

accelerators/nvidia_test.go (+13 -1)
@@ -71,20 +71,32 @@ func TestGetCollector(t *testing.T) {
 		return []int{2, 3}, nil
 	}
 	parseDevicesCgroup = mockParser
+	originalInitializeNVML := initializeNVML
+	initializeNVML = func(_ *NvidiaManager) {}
 	defer func() {
 		parseDevicesCgroup = originalParser
+		initializeNVML = originalInitializeNVML
 	}()
 
 	nm := &NvidiaManager{}
 
-	// When nvmlInitialized is false, empty collector should be returned.
+	// When devicesPresent is false, empty collector should be returned.
 	ac, err := nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
 	nc, ok := ac.(*NvidiaCollector)
 	assert.True(t, ok)
 	assert.Equal(t, 0, len(nc.Devices))
 
+	// When nvmlInitialized is false, empty collector should be returned.
+	nm.devicesPresent = true
+	ac, err = nm.GetCollector("does-not-matter")
+	assert.Nil(t, err)
+	assert.NotNil(t, ac)
+	nc, ok = ac.(*NvidiaCollector)
+	assert.True(t, ok)
+	assert.Equal(t, 0, len(nc.Devices))
+
 	// When nvidiaDevices is empty, empty collector should be returned.
 	nm.nvmlInitialized = true
 	ac, err = nm.GetCollector("does-not-matter")
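
The enabling change for this test is that initializeNVML is now a package-level function variable, so the test can swap in a no-op and restore the original with defer. A generic sketch of that seam, all in one _test.go file for brevity (the accel package, connect, and Ping names are made up):

package accel

import "testing"

// connect is a package-level function variable so tests can replace it,
// mirroring how initializeNVML is stubbed out in the diff above.
var connect = func() error {
	// imagine an expensive call into a real driver here
	return nil
}

func Ping() error { return connect() }

func TestPing(t *testing.T) {
	original := connect
	connect = func() error { return nil } // stub: no driver needed in tests
	defer func() { connect = original }() // always restore the real one
	if err := Ping(); err != nil {
		t.Fatalf("Ping() = %v, want nil", err)
	}
}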

container/crio/handler.go (+4 -8)
@@ -81,9 +81,6 @@ type crioContainerHandler struct {
 	ipAddress string
 
 	ignoreMetrics container.MetricSet
-
-	// container restart count
-	restartCount int
 }
 
 var _ container.ContainerHandler = &crioContainerHandler{}
@@ -175,7 +172,10 @@ func newCrioContainerHandler(
 	// ignore err and get zero as default, this happens with sandboxes, not sure why...
 	// kube isn't sending restart count in labels for sandboxes.
 	restartCount, _ := strconv.Atoi(cInfo.Annotations["io.kubernetes.container.restartCount"])
-	handler.restartCount = restartCount
+	// Only adds restartcount label if it's greater than 0
+	if restartCount > 0 {
+		handler.labels["restartcount"] = strconv.Itoa(restartCount)
+	}
 
 	handler.ipAddress = cInfo.IP
 
@@ -225,10 +225,6 @@ func (self *crioContainerHandler) GetSpec() (info.ContainerSpec, error) {
 	spec, err := common.GetSpec(self.cgroupPaths, self.machineInfoFactory, self.needNet(), hasFilesystem)
 
 	spec.Labels = self.labels
-	// Only adds restartcount label if it's greater than 0
-	if self.restartCount > 0 {
-		spec.Labels["restartcount"] = strconv.Itoa(self.restartCount)
-	}
 	spec.Envs = self.envs
 	spec.Image = self.image
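
This handler and the docker handler below get the same treatment: the restart count is folded into handler.labels when the handler is constructed, the restartCount field goes away, and GetSpec() simply returns the prebuilt labels map. A small sketch of that construction-time approach; buildLabels is a hypothetical helper, not part of cadvisor:

package main

import (
	"fmt"
	"strconv"
)

// buildLabels decides the restartcount label once, at construction time,
// instead of re-checking it on every GetSpec() call.
func buildLabels(base map[string]string, restartCount int) map[string]string {
	labels := make(map[string]string, len(base)+1)
	for k, v := range base {
		labels[k] = v
	}
	// Only add the restartcount label if it's greater than 0, matching the diff.
	if restartCount > 0 {
		labels["restartcount"] = strconv.Itoa(restartCount)
	}
	return labels
}

func main() {
	fmt.Println(buildLabels(map[string]string{"app": "web"}, 0)) // no restartcount key
	fmt.Println(buildLabels(map[string]string{"app": "web"}, 3)) // restartcount=3
}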

container/docker/handler.go (+4 -8)
@@ -114,9 +114,6 @@ type dockerContainerHandler struct {
 
 	// zfs watcher
 	zfsWatcher *zfs.ZfsWatcher
-
-	// container restart count
-	restartCount int
 }
 
 var _ container.ContainerHandler = &dockerContainerHandler{}
@@ -249,7 +246,10 @@ func newDockerContainerHandler(
 	handler.image = ctnr.Config.Image
 	handler.networkMode = ctnr.HostConfig.NetworkMode
 	handler.deviceID = ctnr.GraphDriver.Data["DeviceId"]
-	handler.restartCount = ctnr.RestartCount
+	// Only adds restartcount label if it's greater than 0
+	if ctnr.RestartCount > 0 {
+		handler.labels["restartcount"] = strconv.Itoa(ctnr.RestartCount)
+	}
 
 	// Obtain the IP address for the contianer.
 	// If the NetworkMode starts with 'container:' then we need to use the IP address of the container specified.
@@ -385,10 +385,6 @@ func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
 	spec, err := common.GetSpec(self.cgroupPaths, self.machineInfoFactory, self.needNet(), hasFilesystem)
 
 	spec.Labels = self.labels
-	// Only adds restartcount label if it's greater than 0
-	if self.restartCount > 0 {
-		spec.Labels["restartcount"] = strconv.Itoa(self.restartCount)
-	}
 	spec.Envs = self.envs
 	spec.Image = self.image
 	spec.CreationTime = self.creationTime

fs/fs.go (+15)
@@ -526,6 +526,21 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) {
 	}
 
 	mount, found := self.mounts[dir]
+	// try the parent dir if not found until we reach the root dir
+	// this is an issue on btrfs systems where the directory is not
+	// the subvolume
+	for !found {
+		pathdir, _ := filepath.Split(dir)
+		// break when we reach root
+		if pathdir == "/" {
+			break
+		}
+		// trim "/" from the new parent path otherwise the next possible
+		// filepath.Split in the loop will not split the string any further
+		dir = strings.TrimSuffix(pathdir, "/")
+		mount, found = self.mounts[dir]
+	}
+
 	if found && mount.Fstype == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
 		major, minor, err := getBtrfsMajorMinorIds(mount)
 		if err != nil {
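
The added loop matters on btrfs, where the queried directory is typically not itself the subvolume that appears in the mount table, so an exact-path lookup misses. A self-contained sketch of the same walk-up-to-the-nearest-mount idea; findMount and the plain string map are illustrative stand-ins for RealFsInfo.mounts and GetDirFsDevice:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// findMount looks up dir in the mounts map and, if the exact path has no
// entry, keeps splitting off the last path component until a mount is found
// or the walk reaches the root directory.
func findMount(mounts map[string]string, dir string) (string, bool) {
	mount, found := mounts[dir]
	for !found {
		pathdir, _ := filepath.Split(dir)
		if pathdir == "/" {
			break
		}
		// Trim the trailing "/" so the next filepath.Split keeps making progress.
		dir = strings.TrimSuffix(pathdir, "/")
		mount, found = mounts[dir]
	}
	return mount, found
}

func main() {
	mounts := map[string]string{"/var/lib/docker": "btrfs-subvolume"}
	m, ok := findMount(mounts, "/var/lib/docker/containers/abc")
	fmt.Println(m, ok) // "btrfs-subvolume true": walked up to /var/lib/docker
	m, ok = findMount(mounts, "/home/user")
	fmt.Println(m, ok) // "false": nothing found, walk stopped at the root
}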

manager/watcher/raw/raw.go (+5)
@@ -110,6 +110,11 @@ func (self *rawContainerWatcher) Stop() error {
 // Watches the specified directory and all subdirectories. Returns whether the path was
 // already being watched and an error (if any).
 func (self *rawContainerWatcher) watchDirectory(dir string, containerName string) (bool, error) {
+	// Don't watch .mount cgroups because they never have containers as sub-cgroups. A single container
+	// can have many .mount cgroups associated with it which can quickly exhaust the inotify watches on a node.
+	if strings.HasSuffix(containerName, ".mount") {
+		return false, nil
+	}
 	alreadyWatching, err := self.watcher.AddWatch(containerName, dir)
 	if err != nil {
 		return alreadyWatching, err
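
A quick illustration of the filter in isolation; shouldWatch and the sample cgroup names are made up, while the real change simply checks the suffix inline before calling AddWatch:

package main

import (
	"fmt"
	"strings"
)

// shouldWatch mirrors the filter added to watchDirectory: .mount cgroups never
// contain container sub-cgroups, and a single container can have many of them,
// so watching them only burns inotify watches.
func shouldWatch(containerName string) bool {
	return !strings.HasSuffix(containerName, ".mount")
}

func main() {
	for _, name := range []string{
		"/docker/0123456789ab",
		"/system.slice/var-lib-docker.mount",
	} {
		fmt.Printf("%-40s watch=%v\n", name, shouldWatch(name))
	}
}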
