Skip to content

Commit 5978dad

Browse files
committed
initial commit
1 parent 7886991 commit 5978dad

File tree

10 files changed

+484
-0
lines changed

10 files changed

+484
-0
lines changed

Diff for: LICENSE

100644 → 100755
File mode changed.

Diff for: Makefile

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright 2018 Google Inc. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Go tool used by every target below.
GO := go
16+
# Package list handed to test/fmt/vet: everything `go list ./...` reports,
# minus any line containing "vendor".
pkgs = $(shell $(GO) list ./... | grep -v vendor)
17+
18+
# Default target: run the presubmit checks, build the binary, then run tests.
all: presubmit build test
19+
20+
# Run the short test suite with the race detector enabled.
test:
21+
@echo ">> running tests"
22+
@$(GO) test -short -race $(pkgs)
23+
24+
# Rewrite sources in place with `go fmt`.
format:
25+
@echo ">> formatting code"
26+
@$(GO) fmt $(pkgs)
27+
28+
# Static checks via `go vet`.
vet:
29+
@echo ">> vetting code"
30+
@$(GO) vet $(pkgs)
31+
32+
# Build the binary through build/build.sh.
build:
33+
@echo ">> building binaries"
34+
@./build/build.sh
35+
36+
# Build a container image from deploy/Dockerfile, tagged with the short git
# revision of HEAD.
# NOTE(review): the image name is "cadvisor" although this repository builds a
# GPU monitor; presumably carried over from another Makefile -- confirm.
docker:
37+
@docker build -t cadvisor:$(shell git rev-parse --short HEAD) -f deploy/Dockerfile .
38+
39+
# Presubmit: `vet` runs first (prerequisite), then the gofmt check script.
presubmit: vet
40+
@echo ">> checking go formatting"
41+
@./build/check_gofmt.sh
42+
43+
# NOTE(review): `release` is declared phony but no `release` rule is visible
# in this Makefile -- confirm whether it is still needed.
.PHONY: all build docker format release test vet presubmit

Diff for: build/build.sh

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright 2018 Google Inc. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# Abort as soon as any command exits with a non-zero status.
set -e
18+
19+
# Build settings; each keeps the value from the environment when one is set
# and otherwise falls back to the default shown here.
GO_FLAGS=${GO_FLAGS:-"-tags netgo"} # Extra go flags to use in the build.
20+
BUILD_USER=${BUILD_USER:-"${USER}@${HOSTNAME}"}
21+
BUILD_DATE=${BUILD_DATE:-$( date +%Y%m%d-%H:%M:%S )}
22+
VERBOSE=${VERBOSE:-}
23+
24+
# Import path of the package handed to `go build` below; also used as the
# prefix of the -X linker targets.
repo_path="github.com/dashpole/example-gpu-monitor/cmd"
25+
26+
# Version metadata taken from git and the Go toolchain. The sed expression on
# `git describe` rewrites "-<count>-g" into ".<count>+". revision and branch
# fall back to 'unknown' when git fails.
version=$( git describe --tags --dirty --abbrev=14 | sed -E 's/-([0-9]+)-g/.\1+/' )
27+
revision=$( git rev-parse --short HEAD 2> /dev/null || echo 'unknown' )
28+
branch=$( git rev-parse --abbrev-ref HEAD 2> /dev/null || echo 'unknown' )
29+
go_version=$( go version | sed -e 's/^[^0-9.]*\([0-9.]*\).*/\1/' )
30+
31+
# Linker flags that stamp the metadata into ${repo_path}/version.* variables.
# ldseparator is never assigned in this script, so it expands to the empty
# string unless it is inherited from the environment.
# NOTE(review): no `version` package under cmd/ is visible in this commit --
# confirm the -X targets exist, otherwise these values are not recorded.
ldflags="
32+
-X ${repo_path}/version.Version${ldseparator}=${version}
33+
-X ${repo_path}/version.Revision${ldseparator}=${revision}
34+
-X ${repo_path}/version.Branch${ldseparator}=${branch}
35+
-X ${repo_path}/version.BuildUser${ldseparator}=${BUILD_USER}
36+
-X ${repo_path}/version.BuildDate${ldseparator}=${BUILD_DATE}
37+
-X ${repo_path}/version.GoVersion${ldseparator}=${go_version}"
38+
39+
echo ">> building monitor"
40+
41+
# Only print the linker flags when VERBOSE is non-empty.
if [ -n "$VERBOSE" ]; then
42+
echo "Building with -ldflags $ldflags"
43+
fi
44+
45+
# GO_FLAGS is left unquoted on purpose so that it splits into separate
# arguments; the output binary is written to ./monitor.
GOBIN=$PWD go build -o monitor ${GO_FLAGS} -ldflags "${ldflags}" "${repo_path}"
46+
47+
exit 0

Diff for: build/check_gofmt.sh

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
2+
3+
# Copyright 2018 Google Inc. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# Check formatting on non Godep'd code: collect every directory below the
# current one, skipping anything whose path matches *.git*, *Godeps* or
# *vendor* (the starting directory "." itself is excluded as well).
GOFMT_PATHS=$(find . -not -wholename "*.git*" -not -wholename "*Godeps*" -not -wholename "*vendor*" -not -name "." -type d)

# Find any files with gofmt problems. $GOFMT_PATHS is intentionally unquoted
# so that every directory is passed to gofmt as its own argument.
BAD_FILES=$(gofmt -s -l $GOFMT_PATHS)

if [ -n "$BAD_FILES" ]; then
  echo "The following files are not properly formatted:"
  # Quoted so the newline-separated list is printed one file per line instead
  # of being word-split and collapsed onto a single line.
  echo "$BAD_FILES"
  exit 1
fi

Diff for: cmd/main.go

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// Copyright 2018 Google Inc. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"flag"
19+
"github.com/golang/glog"
20+
21+
"github.com/dashpole/example-gpu-monitor/pkg/gpustats"
22+
"github.com/dashpole/example-gpu-monitor/pkg/kubeletdevices"
23+
"github.com/dashpole/example-gpu-monitor/pkg/metrics"
24+
)
25+
26+
// Command-line flags.
// NOTE(review): none of these flags is read anywhere in the visible part of
// this file -- confirm whether they are meant to be wired into the device
// provider and an HTTP listener.
var (
27+
// socket is the location of the kubelet's podresources service; empty by default.
socket = flag.String("socket", "", "location of the kubelet's podresources service")
28+
// port is the port on which to listen; defaults to 8080.
port = flag.Int("port", 8080, "port on which to listen")
29+
// prometheusEndpoint is the path to expose Prometheus metrics on; defaults to "/metrics".
prometheusEndpoint = flag.String("prometheus_endpoint", "/metrics", "Endpoint to expose Prometheus metrics on")
30+
)
31+
32+
// main parses the command-line flags, logs a startup message at verbosity 1,
// and passes a newly created GPU stats provider and kubelet device provider to
// metrics.Register. Buffered glog output is flushed when main returns.
//
// NOTE(review): main returns right after metrics.Register; nothing here starts
// a listener or blocks, and Stop is never called on the GPU stats provider --
// confirm this is intended.
func main() {
33+
defer glog.Flush()
34+
flag.Parse()
35+
36+
glog.V(1).Infof("Starting example-gpu-monitor")
37+
38+
metrics.Register(gpustats.NewGPUStatsProvider(), kubeletdevices.NewDeviceProvider())
39+
40+
}

Diff for: deploy/Dockerfile

Whitespace-only changes.

Diff for: monitor

18.9 MB
Binary file not shown.

Diff for: pkg/gpustats/stats.go

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// Copyright 2018 Google Inc. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package gpustats
16+
17+
import (
18+
"fmt"
19+
"regexp"
20+
"strconv"
21+
"time"
22+
23+
"github.com/golang/glog"
24+
"github.com/mindprince/gonvml"
25+
)
26+
27+
// GPUStatsProvider exposes per-device GPU statistics.
type GPUStatsProvider interface {
28+
// GetStats returns the statistics of the device identified by deviceID, or
// an error if they cannot be obtained.
GetStats(deviceID string) (*Stats, error)
29+
// Stop releases what the provider holds; the implementation in this file
// shuts NVML down if it was initialized.
Stop()
30+
}
31+
32+
// Stats is a snapshot of one GPU as reported by GetStats.
type Stats struct {
33+
// Model is the device name reported by the driver library.
Model string
34+
// ID is the device UUID.
ID string
35+
// MemoryTotal is the total device memory (presumably bytes -- TODO confirm).
MemoryTotal uint64
36+
// MemoryUsed is the device memory currently in use (same unit as MemoryTotal).
MemoryUsed uint64
37+
// DutyCycle is the average GPU utilization over the last 10 seconds
// (presumably a percentage -- TODO confirm).
DutyCycle uint64
38+
}
39+
40+
func NewGPUStatsProvider() GPUStatsProvider {
41+
m := monitorImpl{
42+
devices: make(map[int]gonvml.Device),
43+
}
44+
if err := gonvml.Initialize(); err != nil {
45+
// This is under a logging level because otherwise we may cause
46+
// log spam if the drivers/nvml is not installed on the system.
47+
glog.V(4).Infof("Could not initialize NVML: %v", err)
48+
return &m
49+
}
50+
m.nvmlInitialized = true
51+
numDevices, err := gonvml.DeviceCount()
52+
if err != nil {
53+
glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
54+
return &m
55+
}
56+
glog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
57+
m.devices = make(map[int]gonvml.Device, numDevices)
58+
for i := 0; i < int(numDevices); i++ {
59+
device, err := gonvml.DeviceHandleByIndex(uint(i))
60+
if err != nil {
61+
glog.Warningf("Failed to get nvidia device handle %d: %v", i, err)
62+
continue
63+
}
64+
minorNumber, err := device.MinorNumber()
65+
if err != nil {
66+
glog.Warningf("Failed to get nvidia device minor number: %v", err)
67+
continue
68+
}
69+
m.devices[int(minorNumber)] = device
70+
}
71+
return &m
72+
}
73+
74+
// monitorImpl is the NVML-backed GPUStatsProvider returned by NewGPUStatsProvider.
type monitorImpl struct {
75+
// nvmlInitialized records that gonvml.Initialize succeeded, so Stop knows
// whether a shutdown is needed.
nvmlInitialized bool
76+
77+
// map from device minor number to Device
78+
devices map[int]gonvml.Device
79+
}
80+
81+
func (m *monitorImpl) Stop() {
82+
if m.nvmlInitialized {
83+
gonvml.Shutdown()
84+
}
85+
}
86+
87+
// GetStats assumes the device id is nvidia[minor number]
88+
func (m *monitorImpl) GetStats(deviceId string) (*Stats, error) {
89+
i, err := getMinorNumber(deviceId)
90+
if err != nil {
91+
return nil, fmt.Errorf("error getting device minor number from path %s: %v", deviceId, err)
92+
}
93+
device, found := m.devices[i]
94+
if !found {
95+
return nil, fmt.Errorf("device with minor number %d was not found", i)
96+
}
97+
model, err := device.Name()
98+
if err != nil {
99+
return nil, fmt.Errorf("error while getting gpu name: %v", err)
100+
}
101+
uuid, err := device.UUID()
102+
if err != nil {
103+
return nil, fmt.Errorf("error while getting gpu uuid: %v", err)
104+
}
105+
memoryTotal, memoryUsed, err := device.MemoryInfo()
106+
if err != nil {
107+
return nil, fmt.Errorf("error while getting gpu memory info: %v", err)
108+
}
109+
utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
110+
if err != nil {
111+
return nil, fmt.Errorf("error while getting gpu utilization: %v", err)
112+
}
113+
114+
return &Stats{
115+
Model: model,
116+
ID: uuid,
117+
MemoryTotal: memoryTotal,
118+
MemoryUsed: memoryUsed,
119+
DutyCycle: uint64(utilizationGPU),
120+
}, nil
121+
}
122+
123+
// deviceExpr matches device ids of the exact form nvidia<digits> and captures
// the digits.
var deviceExpr = regexp.MustCompile(`^nvidia([0-9]+)$`)

// getMinorNumber extracts the minor number from a device id of the form
// nvidia<digits>. It returns an error when the id does not have that form or
// when the digits do not fit in an int.
func getMinorNumber(deviceID string) (int, error) {
	matches := deviceExpr.FindStringSubmatch(deviceID)
	if len(matches) != 2 {
		return 0, fmt.Errorf("%s does not match nvidia[0-9]+", deviceID)
	}
	minorString := matches[1]
	// Atoi parses at the platform's int width, so an out-of-range value is
	// reported as an error instead of being truncated by a later conversion.
	minor, err := strconv.Atoi(minorString)
	if err != nil {
		return 0, fmt.Errorf("cannot parse %s, to an int: %v", minorString, err)
	}
	return minor, nil
}

Diff for: pkg/kubeletdevices/devices.go

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright 2018 Google Inc. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package kubeletdevices
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"time"
21+
22+
"github.com/golang/glog"
23+
24+
"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
25+
podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1"
26+
)
27+
28+
// defaultPodResourcesSocket is the path to the socket serving the podresources API.
// Alternative path kept for reference:
// defaultPodResourcesSocket = "unix:///var/lib/kubelet/pod-resources/kubelet.sock"
const defaultPodResourcesSocket = "unix:///var/lib/kubelet/kubelet.sock"

// defaultPodResourcesTimeout is the timeout handed to podresources.GetClient.
const defaultPodResourcesTimeout = 10 * time.Second

// defaultPodResourcesMaxSize is the maximum size handed to
// podresources.GetClient: 16 MiB.
const defaultPodResourcesMaxSize = 16 << 20
35+
36+
// DeviceProvider reports the pod resources known to the kubelet.
type DeviceProvider interface {
37+
// GetDevices returns the kubelet's ListPodResources response, or an error
// if the request fails.
GetDevices() (*podresourcesapi.ListPodResourcesResponse, error)
38+
}
39+
40+
// deviceProvider implements DeviceProvider on top of a podresources gRPC client.
type deviceProvider struct {
41+
// client is the lister client used by GetDevices.
client podresourcesapi.PodResourcesListerClient
42+
}
43+
44+
func NewDeviceProvider() DeviceProvider {
45+
client, _, err := podresources.GetClient(defaultPodResourcesSocket, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
46+
if err != nil {
47+
glog.Fatalf("Failed to get grpc client: %v", err)
48+
}
49+
return &deviceProvider{
50+
client: client,
51+
}
52+
}
53+
54+
func (d *deviceProvider) GetDevices() (*podresourcesapi.ListPodResourcesResponse, error) {
55+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
56+
defer cancel()
57+
resp, err := d.client.List(ctx, &podresourcesapi.ListPodResourcesRequest{})
58+
if err != nil {
59+
return nil, fmt.Errorf("%v.Get(_) = _, %v", d.client, err)
60+
}
61+
return resp, nil
62+
}

0 commit comments

Comments
 (0)