Skip to content

Commit 72b2ee9

Browse files
authored
Merge pull request #1055 from elezar/add-cuda-compat-mode
Add nvidia-container-cli.compat-mode config option
2 parents 2ec6703 + f4981f0 commit 72b2ee9

File tree

8 files changed

+133
-21
lines changed

8 files changed

+133
-21
lines changed

cmd/nvidia-container-runtime-hook/hook_config.go

+23
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,26 @@ func (c *hookConfig) getSwarmResourceEnvvars() []string {
104104

105105
return envvars
106106
}
107+
108+
// nvidiaContainerCliCUDACompatModeFlags returns required --cuda-compat-mode
109+
// flag(s) depending on the hook and runtime configurations.
//
// The flag is chosen from the configured
// nvidia-container-runtime.modes.legacy.cuda-compat-mode value. At most one
// flag is returned; nil is returned when no flag has to be added.
110+
func (c *hookConfig) nvidiaContainerCliCUDACompatModeFlags() []string {
111+
var flag string
112+
switch c.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode {
113+
case config.CUDACompatModeLdconfig:
114+
flag = "--cuda-compat-mode=ldconfig"
115+
case config.CUDACompatModeMount:
116+
flag = "--cuda-compat-mode=mount"
117+
// Both "disabled" and "hook" map to the same CLI flag value: disabled.
// NOTE(review): presumably because in hook mode the compat handling is done
// by a container lifecycle hook instead of by the CLI -- confirm.
case config.CUDACompatModeDisabled, config.CUDACompatModeHook:
118+
flag = "--cuda-compat-mode=disabled"
119+
// Any other value (including an empty one) falls back to the
// AllowCUDACompatLibsFromContainer feature flag: the mode is explicitly
// disabled unless the feature is enabled, in which case no flag is added.
default:
120+
if !c.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
121+
flag = "--cuda-compat-mode=disabled"
122+
}
123+
}
124+
125+
if flag == "" {
126+
return nil
127+
}
128+
return []string{flag}
129+
}

cmd/nvidia-container-runtime-hook/main.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,8 @@ func doPrestart() {
114114
}
115115
args = append(args, "configure")
116116

117-
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
118-
args = append(args, "--no-cntlibs")
119-
}
117+
args = append(args, hook.nvidiaContainerCliCUDACompatModeFlags()...)
118+
120119
if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
121120
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
122121
}

cmd/nvidia-ctk-installer/main_test.go

+15
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ swarm-resource = ""
7979
[nvidia-container-runtime.modes.csv]
8080
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
8181
82+
[nvidia-container-runtime.modes.legacy]
83+
cuda-compat-mode = "ldconfig"
84+
8285
[nvidia-container-runtime-hook]
8386
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
8487
skip-mode-detection = true
@@ -140,6 +143,9 @@ swarm-resource = ""
140143
[nvidia-container-runtime.modes.csv]
141144
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
142145
146+
[nvidia-container-runtime.modes.legacy]
147+
cuda-compat-mode = "ldconfig"
148+
143149
[nvidia-container-runtime-hook]
144150
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
145151
skip-mode-detection = true
@@ -204,6 +210,9 @@ swarm-resource = ""
204210
[nvidia-container-runtime.modes.csv]
205211
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
206212
213+
[nvidia-container-runtime.modes.legacy]
214+
cuda-compat-mode = "ldconfig"
215+
207216
[nvidia-container-runtime-hook]
208217
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
209218
skip-mode-detection = true
@@ -265,6 +274,9 @@ swarm-resource = ""
265274
[nvidia-container-runtime.modes.csv]
266275
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
267276
277+
[nvidia-container-runtime.modes.legacy]
278+
cuda-compat-mode = "ldconfig"
279+
268280
[nvidia-container-runtime-hook]
269281
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
270282
skip-mode-detection = true
@@ -348,6 +360,9 @@ swarm-resource = ""
348360
[nvidia-container-runtime.modes.csv]
349361
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
350362
363+
[nvidia-container-runtime.modes.legacy]
364+
cuda-compat-mode = "ldconfig"
365+
351366
[nvidia-container-runtime-hook]
352367
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
353368
skip-mode-detection = true

internal/config/config.go

+3
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
121121
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
122122
SpecDirs: cdi.DefaultSpecDirs,
123123
},
124+
Legacy: legacyModeConfig{
125+
CUDACompatMode: defaultCUDACompatMode,
126+
},
124127
},
125128
},
126129
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{

internal/config/config_test.go

+23
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
7474
AnnotationPrefixes: []string{"cdi.k8s.io/"},
7575
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
7676
},
77+
Legacy: legacyModeConfig{
78+
CUDACompatMode: "ldconfig",
79+
},
7780
},
7881
},
7982
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -93,6 +96,7 @@ func TestGetConfig(t *testing.T) {
9396
"nvidia-container-cli.load-kmods = false",
9497
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
9598
"nvidia-container-cli.user = \"foo:bar\"",
99+
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
96100
"nvidia-container-runtime.debug = \"/foo/bar\"",
97101
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
98102
"nvidia-container-runtime.log-level = \"debug\"",
@@ -102,6 +106,7 @@ func TestGetConfig(t *testing.T) {
102106
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
103107
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
104108
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
109+
"nvidia-container-runtime.modes.legacy.cuda-compat-mode = \"mount\"",
105110
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
106111
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
107112
},
@@ -134,6 +139,9 @@ func TestGetConfig(t *testing.T) {
134139
"/not/var/run/cdi",
135140
},
136141
},
142+
Legacy: legacyModeConfig{
143+
CUDACompatMode: "mount",
144+
},
137145
},
138146
},
139147
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -178,6 +186,9 @@ func TestGetConfig(t *testing.T) {
178186
"/var/run/cdi",
179187
},
180188
},
189+
Legacy: legacyModeConfig{
190+
CUDACompatMode: "ldconfig",
191+
},
181192
},
182193
},
183194
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -200,6 +211,7 @@ func TestGetConfig(t *testing.T) {
200211
"root = \"/bar/baz\"",
201212
"load-kmods = false",
202213
"ldconfig = \"@/foo/bar/ldconfig\"",
214+
"cuda-compat-mode = \"mount\"",
203215
"user = \"foo:bar\"",
204216
"[nvidia-container-runtime]",
205217
"debug = \"/foo/bar\"",
@@ -213,6 +225,8 @@ func TestGetConfig(t *testing.T) {
213225
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
214226
"[nvidia-container-runtime.modes.csv]",
215227
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
228+
"[nvidia-container-runtime.modes.legacy]",
229+
"cuda-compat-mode = \"mount\"",
216230
"[nvidia-container-runtime-hook]",
217231
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
218232
"[nvidia-ctk]",
@@ -247,6 +261,9 @@ func TestGetConfig(t *testing.T) {
247261
"/not/var/run/cdi",
248262
},
249263
},
264+
Legacy: legacyModeConfig{
265+
CUDACompatMode: "mount",
266+
},
250267
},
251268
},
252269
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -283,6 +300,9 @@ func TestGetConfig(t *testing.T) {
283300
AnnotationPrefixes: []string{"cdi.k8s.io/"},
284301
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
285302
},
303+
Legacy: legacyModeConfig{
304+
CUDACompatMode: "ldconfig",
305+
},
286306
},
287307
},
288308
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -322,6 +342,9 @@ func TestGetConfig(t *testing.T) {
322342
AnnotationPrefixes: []string{"cdi.k8s.io/"},
323343
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
324344
},
345+
Legacy: legacyModeConfig{
346+
CUDACompatMode: "ldconfig",
347+
},
325348
},
326349
},
327350
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{

internal/config/runtime.go

+31-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ type RuntimeConfig struct {
2929

3030
// modesConfig defines (optional) per-mode configs
3131
type modesConfig struct {
32-
CSV csvModeConfig `toml:"csv"`
33-
CDI cdiModeConfig `toml:"cdi"`
32+
CSV csvModeConfig `toml:"csv"`
33+
CDI cdiModeConfig `toml:"cdi"`
34+
Legacy legacyModeConfig `toml:"legacy"`
3435
}
3536

3637
type cdiModeConfig struct {
@@ -45,3 +46,31 @@ type cdiModeConfig struct {
4546
type csvModeConfig struct {
4647
MountSpecPath string `toml:"mount-spec-path"`
4748
}
49+
50+
// legacyModeConfig defines the options that apply to the legacy mode. It is
// read from the "legacy" table of the per-mode configs.
type legacyModeConfig struct {
51+
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
52+
// libraries discoverable in the container.
53+
CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"`
54+
}
55+
56+
// cudaCompatMode is the string-valued name of a CUDA Forward Compat handling
// mode, as used for the cuda-compat-mode config option.
type cudaCompatMode string
57+
58+
const (
59+
// defaultCUDACompatMode is the value that the default config uses for the
// legacy mode's cuda-compat-mode option.
defaultCUDACompatMode = CUDACompatModeLdconfig
60+
// CUDACompatModeDisabled explicitly disables the handling of CUDA Forward
61+
// Compatibility in the NVIDIA Container Runtime and NVIDIA Container
62+
// Runtime Hook.
63+
CUDACompatModeDisabled = cudaCompatMode("disabled")
64+
// CUDACompatModeHook uses a container lifecycle hook to implement CUDA
65+
// Forward Compatibility support. This requires the use of the NVIDIA
66+
// Container Runtime and is not compatible with use cases where only the
67+
// NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag).
68+
CUDACompatModeHook = cudaCompatMode("hook")
69+
// CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat
70+
// libraries to the ldconfig command invoked from the NVIDIA Container
71+
// Runtime Hook.
72+
CUDACompatModeLdconfig = cudaCompatMode("ldconfig")
73+
// CUDACompatModeMount mounts CUDA Forward Compat folders from the container
74+
// to the container when using the NVIDIA Container Runtime Hook.
75+
CUDACompatModeMount = cudaCompatMode("mount")
76+
)

internal/config/toml_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
7474
[nvidia-container-runtime.modes.csv]
7575
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
7676
77+
[nvidia-container-runtime.modes.legacy]
78+
cuda-compat-mode = "ldconfig"
79+
7780
[nvidia-container-runtime-hook]
7881
path = "nvidia-container-runtime-hook"
7982
skip-mode-detection = false

internal/modifier/gated.go

+33-16
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
7979
discoverers = append(discoverers, d)
8080
}
8181

82-
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
83-
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
84-
discoverers = append(discoverers, compatLibHookDiscoverer)
85-
// For legacy mode, we also need to inject a hook to update the LDCache
86-
// after we have modifed the configuration.
87-
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
88-
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
89-
logger,
90-
discover.None{},
91-
cfg.NVIDIACTKConfig.Path,
92-
"",
93-
)
94-
if err != nil {
95-
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
96-
}
97-
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
82+
// If the feature flag has explicitly been toggled, we don't make any modification.
83+
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
84+
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
85+
if err != nil {
86+
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
9887
}
88+
discoverers = append(discoverers, cudaCompatDiscoverer)
9989
}
10090

10191
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
10292
}
93+
94+
// getCudaCompatModeDiscoverer constructs the discoverer that handles CUDA
// Forward Compat libraries for the configured runtime mode.
//
// In legacy mode a nil discoverer is returned unless cuda-compat-mode is set
// to hook; in that case the enable-cuda-compat hook is merged with a hook that
// updates the LDCache. In all other modes the enable-cuda-compat hook is
// returned on its own. An error is returned only if the LDCache update hook
// cannot be constructed.
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
95+
// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
96+
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook {
97+
// NOTE(review): this returns a nil discoverer together with a nil error, and
// the caller in NewFeatureGatedModifier appends the result without a nil
// check -- confirm that discover.Merge tolerates nil entries.
return nil, nil
98+
}
99+
100+
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
101+
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
102+
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
103+
return compatLibHookDiscoverer, nil
104+
}
105+
106+
// For legacy mode, we also need to inject a hook to update the LDCache
107+
// after we have modified the configuration.
108+
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
109+
logger,
110+
discover.None{},
111+
cfg.NVIDIACTKConfig.Path,
112+
"",
113+
)
114+
if err != nil {
115+
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
116+
}
117+
118+
return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
119+
}

0 commit comments

Comments
 (0)