Skip to content

Commit f9792b7

Browse files
[no-relnote] Update Github Actions E2E
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 6df26cc commit f9792b7

File tree

5 files changed

+61
-31
lines changed

5 files changed

+61
-31
lines changed

.github/workflows/ci.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,4 @@ jobs:
5151
uses: ./.github/workflows/e2e.yaml
5252
with:
5353
version: ${{ needs.variables.outputs.version }}
54+
distribution: ubuntu20.04

.github/workflows/e2e.yaml

+13-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ on:
2020
version:
2121
required: true
2222
type: string
23+
distribution:
24+
required: true
25+
type: string
2326
secrets:
2427
AWS_ACCESS_KEY_ID:
2528
required: true
@@ -70,8 +73,8 @@ jobs:
7073

7174
- name: Run e2e tests
7275
env:
73-
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
74-
VERSION: ${{ inputs.version }}
76+
E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
77+
E2E_IMAGE_TAG: ${{ inputs.version }}-${{ inputs.distribution }}
7578
SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
7679
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
7780
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
@@ -82,8 +85,15 @@ jobs:
8285
chmod 600 "$e2e_ssh_key"
8386
export E2E_SSH_KEY="$e2e_ssh_key"
8487
85-
make -f tests/e2e/Makefile test
88+
make -f tests/e2e/Makefile test-e2e
8689
90+
- name: Archive Ginkgo logs
91+
uses: actions/upload-artifact@v4
92+
with:
93+
name: ginkgo-logs
94+
path: ginkgo.json
95+
retention-days: 15
96+
8797
- name: Send Slack alert notification
8898
if: ${{ failure() }}
8999
uses: slackapi/[email protected]

tests/e2e/README.md

+6-4
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,10 @@ compatibility runs, and pre‑release validation of new CTK builds.
5959
| Variable | Required | Example | Description |
6060
|----------|----------|---------|-------------|
6161
| `E2E_INSTALL_CTK` || `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. |
62-
| `TOOLKIT_IMAGE` || `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. |
63-
| `SSH_KEY` || `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
64-
| `SSH_USER` || `ubuntu` | Username on the remote host. |
62+
| `E2E_IMAGE_REPO` || `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image |
63+
| `E2E_IMAGE_TAG` || `latest` | Image tag |
64+
| `E2E_SSH_KEY` || `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
65+
| `E2E_SSH_USER` || `ubuntu` | Username on the remote host. |
6566
| `E2E_SSH_HOST` || `gpurunner01.corp.local` | Hostname or IP address of the target node. |
6667
| `E2E_SSH_PORT` || `22` | SSH port of the target node. Defaults to `22` when unset. |
6768

@@ -92,7 +93,8 @@ bin/ginkgo:
9293
### 6.1 Basic invocation
9394
```bash
9495
E2E_INSTALL_CTK=true \
95-
TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \
96+
E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \
97+
E2E_IMAGE_TAG=latest \
9698
E2E_SSH_KEY=$HOME/.ssh/id_rsa \
9799
E2E_SSH_USER=ubuntu \
98100
E2E_SSH_HOST=10.0.0.15 \

tests/e2e/e2e_test.go

+28-7
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@ import (
2929
. "github.com/onsi/gomega"
3030
)
3131

32+
// Container image references shared by the e2e suite, so that the image names
// used in the docker commands are defined in a single place.
const (
33+
vectorAddImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0"
34+
deviceQueryImage = "nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0"
35+
cudaImage = "nvcr.io/nvidia/cuda:12.8.0-base-ubi8"
36+
)
37+
3238
// Test context
3339
var (
3440
ctx context.Context
@@ -88,6 +94,8 @@ var _ = BeforeSuite(func() {
8894
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
8995
Expect(err).ToNot(HaveOccurred())
9096

97+
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
98+
Expect(err).ToNot(HaveOccurred())
9199
_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
92100
Expect(err).ToNot(HaveOccurred())
93101
})
@@ -100,25 +108,24 @@ func getTestEnv() {
100108
_, thisFile, _, _ := runtime.Caller(0)
101109
packagePath = filepath.Dir(thisFile)
102110

103-
installCTK = getBoolEnvVar("INSTALL_CTK", false)
111+
installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false)
104112

105113
ImageRepo = os.Getenv("E2E_IMAGE_REPO")
106114
Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")
107115

108116
ImageTag = os.Getenv("E2E_IMAGE_TAG")
109117
Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")
110118

111-
sshKey = os.Getenv("SSH_KEY")
112-
Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set")
119+
sshKey = os.Getenv("E2E_SSH_KEY")
120+
Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")
113121

114-
sshUser = os.Getenv("SSH_USER")
122+
sshUser = os.Getenv("E2E_SSH_USER")
115123
Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set")
116124

117-
host = os.Getenv("REMOTE_HOST")
125+
host = os.Getenv("E2E_SSH_HOST")
118126
Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set")
119127

120-
sshPort = os.Getenv("REMOTE_PORT")
121-
Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set")
128+
sshPort = getIntEnvVar("E2E_SSH_PORT", 22)
122129

123130
// Get current working directory
124131
cwd, err = os.Getwd()
@@ -137,3 +144,17 @@ func getBoolEnvVar(key string, defaultValue bool) bool {
137144
}
138145
return boolValue
139146
}
147+
148+
// getIntEnvVar reads the environment variable named key and returns its value
// as a decimal string. Despite the name, the result is a string rather than an
// int: the value is parsed with strconv.Atoi to validate it and then formatted
// back with strconv.Itoa.
//
// defaultValue (formatted as a string) is returned when the variable is unset
// or empty, and also when its value does not parse as an integer.
149+
func getIntEnvVar(key string, defaultValue int) string {
150+
value := os.Getenv(key)
151+
if value == "" {
152+
return strconv.Itoa(defaultValue)
153+
}
154+
intValue, err := strconv.Atoi(value)
155+
if err != nil {
156+
// A non-numeric value is not reported; the default is used instead.
return strconv.Itoa(defaultValue)
157+
}
158+
159+
return strconv.Itoa(intValue)
160+
}

tests/e2e/nvidia-container-toolkit_test.go

+13-17
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package e2e
1919

2020
import (
2121
"context"
22+
"fmt"
2223
"path/filepath"
2324
"strings"
2425

@@ -85,29 +86,29 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
8586
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
8687
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
8788
var err error
88-
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
89+
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", vectorAddImage))
8990
Expect(err).ToNot(HaveOccurred())
9091

9192
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
9293
})
9394

9495
It("should support automatic CDI spec generation", func(ctx context.Context) {
9596
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
96-
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
97+
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", vectorAddImage))
9798
Expect(err).ToNot(HaveOccurred())
9899
Expect(referenceOutput).To(Equal(out2))
99100
})
100101

101102
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
102103
By("Running docker run with --runtime=nvidia --gpus all")
103-
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
104+
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", vectorAddImage))
104105
Expect(err).ToNot(HaveOccurred())
105106
Expect(referenceOutput).To(Equal(out3))
106107
})
107108

108109
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
109110
By("Running docker run with --gpus all")
110-
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
111+
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", vectorAddImage))
111112
Expect(err).ToNot(HaveOccurred())
112113
Expect(referenceOutput).To(Equal(out4))
113114
})
@@ -116,46 +117,41 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
116117
// A deviceQuery sample runs in a container with access to all GPUs
117118
// The following should all produce the same result.
118119
When("Running the cuda-deviceQuery sample", Ordered, func() {
119-
BeforeAll(func(ctx context.Context) {
120-
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
121-
Expect(err).ToNot(HaveOccurred())
122-
})
123-
124120
var referenceOutput string
125121

126122
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
127123
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
128124
var err error
129-
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
125+
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", deviceQueryImage))
130126
Expect(err).ToNot(HaveOccurred())
131127
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
132128
})
133129

134130
It("should support automatic CDI spec generation", func(ctx context.Context) {
135131
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
136-
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
132+
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", deviceQueryImage))
137133
Expect(err).ToNot(HaveOccurred())
138134
Expect(referenceOutput).To(Equal(out2))
139135
})
140136

141137
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
142138
By("Running docker run with --runtime=nvidia --gpus all")
143-
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
139+
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", deviceQueryImage))
144140
Expect(err).ToNot(HaveOccurred())
145141
Expect(referenceOutput).To(Equal(out3))
146142
})
147143

148144
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
149145
By("Running docker run with --gpus all")
150-
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
146+
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", deviceQueryImage))
151147
Expect(err).ToNot(HaveOccurred())
152148
Expect(referenceOutput).To(Equal(out4))
153149
})
154150
})
155151

156152
When("Testing CUDA Forward compatibility", Ordered, func() {
157153
BeforeAll(func(ctx context.Context) {
158-
compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
154+
compatOutput, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void %s bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"", cudaImage))
159155
Expect(err).ToNot(HaveOccurred())
160156
Expect(compatOutput).ToNot(BeEmpty())
161157

@@ -179,21 +175,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
179175

180176
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
181177
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
182-
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
178+
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
183179
Expect(err).ToNot(HaveOccurred())
184180
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
185181
})
186182

187183
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
188184
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
189-
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
185+
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
190186
Expect(err).ToNot(HaveOccurred())
191187
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
192188
})
193189

194190
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
195191
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
196-
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
192+
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
197193
Expect(err).ToNot(HaveOccurred())
198194
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
199195
})

0 commit comments

Comments
 (0)