Skip to content

Commit 50b5d8e

Browse files
committed
Add support for AMD GPU via --gpu=amd for docker linux amd64.
1 parent a46a49b commit 50b5d8e

File tree

32 files changed

+370
-30
lines changed

32 files changed

+370
-30
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: "update-amd-gpu-device-plugin-version"
2+
on:
3+
workflow_dispatch:
4+
schedule:
5+
# every Monday at 10:00 UTC (around 3 am Pacific)
6+
- cron: "0 10 * * 1"
7+
env:
8+
GOPROXY: https://proxy.golang.org
9+
GO_VERSION: '1.23.0'
10+
permissions:
11+
contents: read
12+
13+
jobs:
14+
bump-amd-gpu-device-plugin-version:
15+
runs-on: ubuntu-22.04
16+
steps:
17+
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
18+
- uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32
19+
with:
20+
go-version: ${{env.GO_VERSION}}
21+
- name: Bump amd-gpu-device-plugin version
22+
id: bumpAmdDevicePlugin
23+
run: |
24+
echo "OLD_VERSION=$(DEP=amd-gpu-device-plugin make get-dependency-version)" >> "$GITHUB_OUTPUT"
25+
make update-amd-gpu-device-plugin-version
26+
echo "NEW_VERSION=$(DEP=amd-gpu-device-plugin make get-dependency-version)" >> "$GITHUB_OUTPUT"
27+
# The following is to support multiline with GITHUB_OUTPUT, see https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#multiline-strings
28+
echo "changes<<EOF" >> "$GITHUB_OUTPUT"
29+
echo "$(git status --porcelain)" >> "$GITHUB_OUTPUT"
30+
echo "EOF" >> "$GITHUB_OUTPUT"
31+
- name: Create PR
32+
if: ${{ steps.bumpAmdDevicePlugin.outputs.changes != '' }}
33+
uses: peter-evans/create-pull-request@5e914681df9dc83aa4e4905692ca88beb2f9e91f
34+
with:
35+
token: ${{ secrets.MINIKUBE_BOT_PAT }}
36+
commit-message: 'Addon amd-gpu-device-plugin: Update amd/k8s-device-plugin image from ${{ steps.bumpAmdDevicePlugin.outputs.OLD_VERSION }} to ${{ steps.bumpAmdDevicePlugin.outputs.NEW_VERSION }}'
37+
committer: minikube-bot <[email protected]>
38+
author: minikube-bot <[email protected]>
39+
branch: auto_bump_amd_device_plugin_version
40+
push-to-fork: minikube-bot/minikube
41+
base: master
42+
delete-branch: true
43+
title: 'Addon amd-gpu-device-plugin: Update amd/k8s-device-plugin image from ${{ steps.bumpAmdDevicePlugin.outputs.OLD_VERSION }} to ${{ steps.bumpAmdDevicePlugin.outputs.NEW_VERSION }}'
44+
labels: ok-to-test
45+
body: |
46+
The [k8s-device-plugin](https://github.com/ROCm/k8s-device-plugin) project released a new k8s-device-plugin image
47+
48+
This PR was auto-generated by `make update-amd-gpu-device-plugin-version` using [update-amd-gpu-device-plugin-version.yml](https://github.com/kubernetes/minikube/tree/master/.github/workflows/update-amd-gpu-device-plugin-version.yml) CI Workflow.

Makefile

+5
Original file line numberDiff line numberDiff line change
@@ -1222,6 +1222,11 @@ update-nvidia-device-plugin-version:
12221222
(cd hack/update/nvidia_device_plugin_version && \
12231223
go run update_nvidia_device_plugin_version.go)
12241224

1225+
.PHONY: update-amd-gpu-device-plugin-version
1226+
update-amd-gpu-device-plugin-version:
1227+
(cd hack/update/amd_device_plugin_version && \
1228+
go run update_amd_device_plugin_version.go)
1229+
12251230
.PHONY: update-nerdctld-version
12261231
update-nerdctld-version:
12271232
(cd hack/update/nerdctld_version && \

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ As well as developer-friendly features:
3535

3636
* [Addons](https://minikube.sigs.k8s.io/docs/handbook/deploying/#addons) - a marketplace for developers to share configurations for running services on minikube
3737
* [NVIDIA GPU support](https://minikube.sigs.k8s.io/docs/tutorials/nvidia/) - for machine learning
38+
* [AMD GPU support](https://minikube.sigs.k8s.io/docs/tutorials/amd/) - for machine learning
3839
* [Filesystem mounts](https://minikube.sigs.k8s.io/docs/handbook/mount/)
3940

4041
**For more information, see the official [minikube website](https://minikube.sigs.k8s.io)**

cmd/minikube/cmd/start.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -1462,8 +1462,8 @@ func validateGPUs(value, drvName, rtime string) error {
14621462
if err := validateGPUsArch(); err != nil {
14631463
return err
14641464
}
1465-
if value != "nvidia" && value != "all" {
1466-
return errors.Errorf(`The gpus flag must be passed a value of "nvidia" or "all"`)
1465+
if value != "nvidia" && value != "all" && value != "amd" {
1466+
return errors.Errorf(`The gpus flag must be passed a value of "nvidia", "amd" or "all"`)
14671467
}
14681468
if drvName == constants.Docker && (rtime == constants.Docker || rtime == constants.DefaultContainerRuntime) {
14691469
return nil

cmd/minikube/cmd/start_flags.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ func initMinikubeFlags() {
206206
startCmd.Flags().Bool(disableOptimizations, false, "If set, disables optimizations that are set for local Kubernetes. Including decreasing CoreDNS replicas from 2 to 1. Defaults to false.")
207207
startCmd.Flags().Bool(disableMetrics, false, "If set, disables metrics reporting (CPU and memory usage), this can improve CPU usage. Defaults to false.")
208208
startCmd.Flags().String(staticIP, "", "Set a static IP for the minikube cluster, the IP must be: private, IPv4, and the last octet must be between 2 and 254, for example 192.168.200.200 (Docker and Podman drivers only)")
209-
startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)")
209+
startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)")
210210
startCmd.Flags().Duration(autoPauseInterval, time.Minute*1, "Duration of inactivity before the minikube VM is paused (default 1m0s)")
211211
}
212212

cmd/minikube/cmd/start_test.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -814,7 +814,10 @@ func TestValidateGPUs(t *testing.T) {
814814
{"nvidia", "docker", "", ""},
815815
{"all", "kvm", "docker", "The gpus flag can only be used with the docker driver and docker container-runtime"},
816816
{"nvidia", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
817-
{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia" or "all"`},
817+
{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia", "amd" or "all"`},
818+
{"amd", "docker", "docker", ""},
819+
{"amd", "docker", "", ""},
820+
{"amd", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
818821
}
819822

820823
for _, tc := range tests {

deploy/addons/assets.go

+4
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ var (
107107
//go:embed gpu/nvidia-gpu-device-plugin.yaml.tmpl
108108
NvidiaGpuDevicePluginAssets embed.FS
109109

110+
// AmdGpuDevicePluginAssets assets for amd-gpu-device-plugin addon
111+
//go:embed gpu/amd-gpu-device-plugin.yaml.tmpl
112+
AmdGpuDevicePluginAssets embed.FS
113+
110114
// LogviewerAssets assets for logviewer addon
111115
//go:embed logviewer/*.tmpl logviewer/*.yaml
112116
LogviewerAssets embed.FS
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2024 The Kubernetes Authors All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: apps/v1
16+
kind: DaemonSet
17+
metadata:
18+
name: amd-gpu-device-plugin
19+
namespace: kube-system
20+
labels:
21+
k8s-app: amd-gpu-device-plugin
22+
kubernetes.io/minikube-addons: amd-gpu-device-plugin
23+
addonmanager.kubernetes.io/mode: Reconcile
24+
spec:
25+
selector:
26+
matchLabels:
27+
k8s-app: amd-gpu-device-plugin
28+
template:
29+
metadata:
30+
labels:
31+
name: amd-gpu-device-plugin
32+
k8s-app: amd-gpu-device-plugin
33+
spec:
34+
nodeSelector:
35+
kubernetes.io/arch: amd64
36+
priorityClassName: system-node-critical
37+
tolerations:
38+
- key: CriticalAddonsOnly
39+
operator: Exists
40+
volumes:
41+
- name: dp
42+
hostPath:
43+
path: /var/lib/kubelet/device-plugins
44+
- name: sys
45+
hostPath:
46+
path: /sys
47+
containers:
48+
- image: {{.CustomRegistries.AmdDevicePlugin | default .ImageRepository | default .Registries.AmdDevicePlugin }}{{.Images.AmdDevicePlugin}}
49+
name: amd-gpu-device-plugin
50+
securityContext:
51+
allowPrivilegeEscalation: false
52+
capabilities:
53+
drop: ["ALL"]
54+
volumeMounts:
55+
- name: dp
56+
mountPath: /var/lib/kubelet/device-plugins
57+
- name: sys
58+
mountPath: /sys
59+
updateStrategy:
60+
type: RollingUpdate
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
"k8s.io/klog/v2"
25+
"k8s.io/minikube/hack/update"
26+
)
27+
28+
var schema = map[string]update.Item{
29+
"pkg/minikube/assets/addons.go": {
30+
Replace: map[string]string{
31+
`rocm/k8s-device-plugin:.*`: `rocm/k8s-device-plugin:{{.Version}}@{{.SHA}}",`,
32+
},
33+
},
34+
}
35+
36+
type Data struct {
37+
Version string
38+
SHA string
39+
}
40+
41+
func main() {
42+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
43+
defer cancel()
44+
45+
stable, _, _, err := update.GHReleases(ctx, "ROCm", "k8s-device-plugin")
46+
if err != nil {
47+
klog.Fatalf("Unable to get stable version: %v", err)
48+
}
49+
sha, err := update.GetImageSHA(fmt.Sprintf("rocm/k8s-device-plugin:%s", stable.Tag))
50+
if err != nil {
51+
klog.Fatalf("failed to get image SHA: %v", err)
52+
}
53+
54+
data := Data{Version: stable.Tag, SHA: sha}
55+
56+
update.Apply(schema, data)
57+
}

hack/update/get_version/get_version.go

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ type dependency struct {
3333
}
3434

3535
var dependencies = map[string]dependency{
36+
"amd-gpu-device-plugin": {addonsFile, `rocm/k8s-device-plugin:(.*)@`},
3637
"buildkit": {"deploy/iso/minikube-iso/arch/x86_64/package/buildkit-bin/buildkit-bin.mk", `BUILDKIT_BIN_VERSION = (.*)`},
3738
"calico": {"pkg/minikube/bootstrapper/images/images.go", `calicoVersion = "(.*)"`},
3839
"cilium": {"pkg/minikube/cni/cilium.yaml", `quay.io/cilium/cilium:(.*)@`},

pkg/addons/config.go

+5
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ var Addons = []*Addon{
131131
validations: []setFn{isKVMDriverForNVIDIA},
132132
callbacks: []setFn{EnableOrDisableAddon},
133133
},
134+
{
135+
name: "amd-gpu-device-plugin",
136+
set: SetBool,
137+
callbacks: []setFn{EnableOrDisableAddon},
138+
},
134139
{
135140
name: "olm",
136141
set: SetBool,

pkg/drivers/kic/oci/oci.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,14 @@ func CreateContainerNode(p CreateParams) error { //nolint to suppress cyclomatic
190190
runArgs = append(runArgs, "--network", p.Network)
191191
runArgs = append(runArgs, "--ip", p.IP)
192192
}
193-
if p.GPUs != "" {
193+
194+
if p.GPUs == "all" || p.GPUs == "nvidia" {
194195
runArgs = append(runArgs, "--gpus", "all", "--env", "NVIDIA_DRIVER_CAPABILITIES=all")
196+
} else if p.GPUs == "amd" {
197+
/* https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html
198+
* "--security-opt seccomp=unconfined" is also required but included above.
199+
*/
200+
runArgs = append(runArgs, "--device", "/dev/kfd", "--device", "/dev/dri", "--group-add", "video", "--group-add", "render")
195201
}
196202

197203
memcgSwap := hasMemorySwapCgroup()

pkg/drivers/kic/oci/types.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ type CreateParams struct {
6161
OCIBinary string // docker or podman
6262
Network string // network name that the container will attach to
6363
IP string // static IP to assign the container in the cluster network
64-
GPUs string // add NVIDIA GPU devices to the container
64+
GPUs string // add GPU devices to the container
6565
}
6666

6767
// createOpt is an option for Create

pkg/drivers/kic/types.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,5 +69,5 @@ type Config struct {
6969
StaticIP string // static IP for the kic cluster
7070
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
7171
ListenAddress string // IP Address to listen to
72-
GPUs string // add NVIDIA GPU devices to the container
72+
GPUs string // add GPU devices to the container
7373
}

pkg/minikube/assets/addons.go

+11
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,17 @@ var Addons = map[string]*Addon{
487487
}, map[string]string{
488488
"NvidiaDevicePlugin": "registry.k8s.io",
489489
}),
490+
"amd-gpu-device-plugin": NewAddon([]*BinAsset{
491+
MustBinAsset(addons.AmdGpuDevicePluginAssets,
492+
"gpu/amd-gpu-device-plugin.yaml.tmpl",
493+
vmpath.GuestAddonsDir,
494+
"amd-gpu-device-plugin.yaml",
495+
"0640"),
496+
}, false, "amd-gpu-device-plugin", "3rd party (AMD)", "", "https://minikube.sigs.k8s.io/docs/tutorials/amd/", map[string]string{
497+
"AmdDevicePlugin": "rocm/k8s-device-plugin:1.25.2.8@sha256:f3835498cf2274e0a07c32b38c166c05a876f8eb776d756cc06805e599a3ba5f",
498+
}, map[string]string{
499+
"AmdDevicePlugin": "docker.io",
500+
}),
490501
"logviewer": NewAddon([]*BinAsset{
491502
MustBinAsset(addons.LogviewerAssets,
492503
"logviewer/logviewer-dp-and-svc.yaml.tmpl",

pkg/minikube/cruntime/cruntime.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ type Config struct {
156156
// InsecureRegistry list of insecure registries
157157
InsecureRegistry []string
158158
// GPUs selects which GPU devices to add to the container ("all", "nvidia" or "amd"); empty means none
159-
GPUs bool
159+
GPUs string
160160
}
161161

162162
// ListContainersOptions are the options to use for listing containers

pkg/minikube/cruntime/docker.go

+6-2
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ type Docker struct {
7575
Init sysinit.Manager
7676
UseCRI bool
7777
CRIService string
78-
GPUs bool
78+
GPUs string
7979
}
8080

8181
// Name is a human readable name for Docker
@@ -580,13 +580,17 @@ func (r *Docker) configureDocker(driver string) error {
580580
},
581581
StorageDriver: "overlay2",
582582
}
583-
if r.GPUs {
583+
584+
if r.GPUs == "all" || r.GPUs == "nvidia" {
584585
assets.Addons["nvidia-device-plugin"].EnableByDefault()
585586
daemonConfig.DefaultRuntime = "nvidia"
586587
runtimes := &dockerDaemonRuntimes{}
587588
runtimes.Nvidia.Path = "/usr/bin/nvidia-container-runtime"
588589
daemonConfig.Runtimes = runtimes
590+
} else if r.GPUs == "amd" {
591+
assets.Addons["amd-gpu-device-plugin"].EnableByDefault()
589592
}
593+
590594
daemonConfigBytes, err := json.Marshal(daemonConfig)
591595
if err != nil {
592596
return err

pkg/minikube/node/start.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ func configureRuntimes(runner cruntime.CommandRunner, cc config.ClusterConfig, k
419419
InsecureRegistry: cc.InsecureRegistry,
420420
}
421421
if cc.GPUs != "" {
422-
co.GPUs = true
422+
co.GPUs = cc.GPUs
423423
}
424424
cr, err := cruntime.New(co)
425425
if err != nil {

site/content/en/docs/commands/start.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ minikube start [flags]
5757
--feature-gates string A set of key=value pairs that describe feature gates for alpha/experimental features.
5858
--force Force minikube to perform possibly dangerous operations
5959
--force-systemd If set, force the container runtime to use systemd as cgroup manager. Defaults to false.
60-
-g, --gpus string Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)
60+
-g, --gpus string Allow pods to use your GPUs. Options include: [all,nvidia,amd] (Docker driver with Docker container-runtime only)
6161
--ha Create Highly Available Multi-Control Plane Cluster with a minimum of three control-plane nodes that will also be marked for work.
6262
--host-dns-resolver Enable host resolver for NAT DNS requests (virtualbox driver only) (default true)
6363
--host-only-cidr string The CIDR to be used for the minikube VM (virtualbox driver only) (default "192.168.59.1/24")

site/content/en/docs/contrib/tests.en.md

+3
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ tests disabling an addon on a non-existing cluster
6565
#### validateNvidiaDevicePlugin
6666
tests the nvidia-device-plugin addon by ensuring the pod comes up and the addon disables
6767

68+
#### validateAmdGpuDevicePlugin
69+
tests the amd-gpu-device-plugin addon by ensuring the pod comes up and the addon disables
70+
6871
#### validateYakdAddon
6972

7073
## TestCertOptions

0 commit comments

Comments
 (0)