Skip to content

Commit f067fdc

Browse files
[no-relnote] Update Github Actions E2E
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 6df26cc commit f067fdc

File tree

6 files changed

+73
-40
lines changed

6 files changed

+73
-40
lines changed

.github/workflows/e2e.yaml

+9-2
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ jobs:
7070

7171
- name: Run e2e tests
7272
env:
73-
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
74-
VERSION: ${{ inputs.version }}
73+
E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
74+
E2E_IMAGE_TAG: ${{ inputs.version }}
7575
SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
7676
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
7777
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
@@ -84,6 +84,13 @@ jobs:
8484
8585
make -f tests/e2e/Makefile test
8686
87+
- name: Archive Ginkgo logs
88+
uses: actions/upload-artifact@v4
89+
with:
90+
name: ginkgo-logs
91+
path: ginkgo.json
92+
retention-days: 15
93+
8794
- name: Send Slack alert notification
8895
if: ${{ failure() }}
8996
uses: slackapi/[email protected]

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@
1111
/nvidia-ctk
1212
/shared-*
1313
/release-*
14+
/bin

tests/e2e/Makefile

+3-3
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
.PHONY: test-e2e ginkgo
16+
.PHONY: test ginkgo
1717

1818
GINKGO_ARGS ?=
1919
LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
2020

21-
ginkgo:
21+
ginkgo: $(CURDIR)/bin/ginkgo
2222
mkdir -p $(CURDIR)/bin
2323
GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
2424

25-
test-e2e: ginkgo
25+
test: ginkgo
2626
$(CURDIR)/bin/ginkgo $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...

tests/e2e/README.md

+9-7
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ limitations under the License.
2020
---
2121

2222
## 1 Scope & Goals
23-
This repository contains a **Ginkgo v2 / Gomega** test harness that exercises an
23+
This folder contains a **Ginkgo v2 / Gomega** test harness that exercises an
2424
NVIDIA Container Toolkit (CTK) installation on a **remote GPU‑enabled host** via
2525
SSH. The suite validates that:
2626

@@ -59,10 +59,11 @@ compatibility runs, and pre‑release validation of new CTK builds.
5959
| Variable | Required | Example | Description |
6060
|----------|----------|---------|-------------|
6161
| `E2E_INSTALL_CTK` || `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. |
62-
| `TOOLKIT_IMAGE` || `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. |
63-
| `SSH_KEY` || `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
64-
| `SSH_USER` || `ubuntu` | Username on the remote host. |
65-
| `REMOTE_HOST` || `gpurunner01.corp.local` | Hostname or IP address of the target node. |
62+
| `E2E_IMAGE_REPO` || `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image |
63+
| `E2E_IMAGE_TAG` || `latest` | Image tag |
64+
| `E2E_SSH_KEY` || `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
65+
| `E2E_SSH_USER` || `ubuntu` | Username on the remote host. |
66+
| `E2E_SSH_HOST` || `10.0.0.0` | Hostname or IP address of the target node. |
6667
| `E2E_SSH_PORT` || `22` | SSH port of the target node. |
6768

6869
> All variables are validated at start‑up; the suite aborts early with a clear
@@ -92,12 +93,13 @@ bin/ginkgo:
9293
### 6.1 Basic invocation
9394
```bash
9495
E2E_INSTALL_CTK=true \
95-
TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \
96+
E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \
97+
E2E_IMAGE_TAG=<image-tag> \
9698
E2E_SSH_KEY=$HOME/.ssh/id_rsa \
9799
E2E_SSH_USER=ubuntu \
98100
E2E_SSH_HOST=10.0.0.15 \
99101
E2E_SSH_PORT=22 \
100-
make test-e2e
102+
make test
101103
```
102104
This downloads the image on the remote host, installs CTK (if requested), and
103105
executes a minimal CUDA‑based workload.

tests/e2e/e2e_test.go

+38-11
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package e2e
1919

2020
import (
2121
"context"
22+
"fmt"
2223
"os"
2324
"path/filepath"
2425
"runtime"
@@ -29,6 +30,12 @@ import (
2930
. "github.com/onsi/gomega"
3031
)
3132

33+
const (
34+
vectorAddImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0"
35+
deviceQueryImage = "nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0"
36+
cudaImage = "nvcr.io/nvidia/cuda:12.8.0-base-ubi8"
37+
)
38+
3239
// Test context
3340
var (
3441
ctx context.Context
@@ -85,10 +92,12 @@ var _ = BeforeSuite(func() {
8592
_, _, err := runner.Run("docker pull ubuntu")
8693
Expect(err).ToNot(HaveOccurred())
8794

88-
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
95+
_, _, err = runner.Run(fmt.Sprintf("docker pull %s", vectorAddImage))
8996
Expect(err).ToNot(HaveOccurred())
9097

91-
_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
98+
_, _, err = runner.Run(fmt.Sprintf("docker pull %s", deviceQueryImage))
99+
Expect(err).ToNot(HaveOccurred())
100+
_, _, err = runner.Run(fmt.Sprintf("docker pull %s", cudaImage))
92101
Expect(err).ToNot(HaveOccurred())
93102
})
94103

@@ -100,25 +109,29 @@ func getTestEnv() {
100109
_, thisFile, _, _ := runtime.Caller(0)
101110
packagePath = filepath.Dir(thisFile)
102111

103-
installCTK = getBoolEnvVar("INSTALL_CTK", false)
112+
installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false)
104113

105114
ImageRepo = os.Getenv("E2E_IMAGE_REPO")
106115
Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")
107116

108117
ImageTag = os.Getenv("E2E_IMAGE_TAG")
109118
Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")
110119

111-
sshKey = os.Getenv("SSH_KEY")
112-
Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set")
120+
// TODO (@ArangoGutierrez):
121+
// once https://github.com/NVIDIA/nvidia-container-toolkit/pull/602
122+
// is merged, remove this
123+
ImageTag = fmt.Sprintf("%s-ubuntu20.04", ImageTag)
113124

114-
sshUser = os.Getenv("SSH_USER")
115-
Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set")
125+
sshKey = os.Getenv("E2E_SSH_KEY")
126+
Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")
116127

117-
host = os.Getenv("REMOTE_HOST")
118-
Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set")
128+
sshUser = os.Getenv("E2E_SSH_USER")
129+
Expect(sshUser).NotTo(BeEmpty(), "E2E_SSH_USER environment variable must be set")
119130

120-
sshPort = os.Getenv("REMOTE_PORT")
121-
Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set")
131+
host = os.Getenv("E2E_SSH_HOST")
132+
Expect(host).NotTo(BeEmpty(), "E2E_SSH_HOST environment variable must be set")
133+
134+
sshPort = getIntEnvVar("E2E_SSH_PORT", 22)
122135

123136
// Get current working directory
124137
cwd, err = os.Getwd()
@@ -137,3 +150,17 @@ func getBoolEnvVar(key string, defaultValue bool) bool {
137150
}
138151
return boolValue
139152
}
153+
154+
// getIntEnvVar returns the integer value of the environment variable or the default value if not set.
155+
func getIntEnvVar(key string, defaultValue int) string {
156+
value := os.Getenv(key)
157+
if value == "" {
158+
return strconv.Itoa(defaultValue)
159+
}
160+
intValue, err := strconv.Atoi(value)
161+
if err != nil {
162+
return strconv.Itoa(defaultValue)
163+
}
164+
165+
return strconv.Itoa(intValue)
166+
}

tests/e2e/nvidia-container-toolkit_test.go

+13-17
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package e2e
1919

2020
import (
2121
"context"
22+
"fmt"
2223
"path/filepath"
2324
"strings"
2425

@@ -85,29 +86,29 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
8586
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
8687
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
8788
var err error
88-
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
89+
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", vectorAddImage))
8990
Expect(err).ToNot(HaveOccurred())
9091

9192
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
9293
})
9394

9495
It("should support automatic CDI spec generation", func(ctx context.Context) {
9596
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
96-
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
97+
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", vectorAddImage))
9798
Expect(err).ToNot(HaveOccurred())
9899
Expect(referenceOutput).To(Equal(out2))
99100
})
100101

101102
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
102103
By("Running docker run with --runtime=nvidia --gpus all")
103-
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
104+
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", vectorAddImage))
104105
Expect(err).ToNot(HaveOccurred())
105106
Expect(referenceOutput).To(Equal(out3))
106107
})
107108

108109
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
109110
By("Running docker run with --gpus all")
110-
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
111+
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", vectorAddImage))
111112
Expect(err).ToNot(HaveOccurred())
112113
Expect(referenceOutput).To(Equal(out4))
113114
})
@@ -116,46 +117,41 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
116117
// A deviceQuery sample runs in a container with access to all GPUs
117118
// The following should all produce the same result.
118119
When("Running the cuda-deviceQuery sample", Ordered, func() {
119-
BeforeAll(func(ctx context.Context) {
120-
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
121-
Expect(err).ToNot(HaveOccurred())
122-
})
123-
124120
var referenceOutput string
125121

126122
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
127123
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
128124
var err error
129-
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
125+
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", deviceQueryImage))
130126
Expect(err).ToNot(HaveOccurred())
131127
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
132128
})
133129

134130
It("should support automatic CDI spec generation", func(ctx context.Context) {
135131
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
136-
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
132+
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", deviceQueryImage))
137133
Expect(err).ToNot(HaveOccurred())
138134
Expect(referenceOutput).To(Equal(out2))
139135
})
140136

141137
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
142138
By("Running docker run with --runtime=nvidia --gpus all")
143-
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
139+
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", deviceQueryImage))
144140
Expect(err).ToNot(HaveOccurred())
145141
Expect(referenceOutput).To(Equal(out3))
146142
})
147143

148144
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
149145
By("Running docker run with --gpus all")
150-
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
146+
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", deviceQueryImage))
151147
Expect(err).ToNot(HaveOccurred())
152148
Expect(referenceOutput).To(Equal(out4))
153149
})
154150
})
155151

156152
When("Testing CUDA Forward compatibility", Ordered, func() {
157153
BeforeAll(func(ctx context.Context) {
158-
compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
154+
compatOutput, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void %s bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"", cudaImage))
159155
Expect(err).ToNot(HaveOccurred())
160156
Expect(compatOutput).ToNot(BeEmpty())
161157

@@ -179,21 +175,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
179175

180176
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
181177
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
182-
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
178+
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
183179
Expect(err).ToNot(HaveOccurred())
184180
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
185181
})
186182

187183
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
188184
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
189-
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
185+
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
190186
Expect(err).ToNot(HaveOccurred())
191187
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
192188
})
193189

194190
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
195191
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
196-
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
192+
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
197193
Expect(err).ToNot(HaveOccurred())
198194
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
199195
})

0 commit comments

Comments
 (0)