Skip to content

Commit ba69695

Browse files
Issacwwwndbaker1
andauthored
Opt in device plugin (#482)
* Opt in device plugin * Update e2e2/test/cases/neuron/main_test.go Co-authored-by: Nick Baker <[email protected]> * Update main_test.go --------- Co-authored-by: Nick Baker <[email protected]>
1 parent b35d508 commit ba69695

File tree

7 files changed

+139
-122
lines changed

7 files changed

+139
-122
lines changed

e2e2/test/cases/neuron/main_test.go

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ import (
1919
)
2020

2121
var (
22-
testenv env.Environment
23-
neuronTestImage *string
22+
testenv env.Environment
23+
neuronTestImage *string
24+
installDevicePlugin *bool
2425
)
2526

2627
var (
@@ -30,8 +31,21 @@ var (
3031
neuronDevicePluginManifest []byte
3132
)
3233

34+
func deployNeuronDevicePlugin(ctx context.Context, config *envconf.Config) (context.Context, error) {
35+
ds := appsv1.DaemonSet{
36+
ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"},
37+
}
38+
err := wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds),
39+
wait.WithContext(ctx))
40+
if err != nil {
41+
return ctx, err
42+
}
43+
return ctx, nil
44+
}
45+
3346
func TestMain(m *testing.M) {
3447
neuronTestImage = flag.String("neuronTestImage", "", "image for neuron single node test")
48+
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install neuron device plugin")
3549
cfg, err := envconf.NewFromFlags()
3650
if err != nil {
3751
log.Fatalf("failed to initialize test environment: %v", err)
@@ -41,31 +55,23 @@ func TestMain(m *testing.M) {
4155
defer cancel()
4256
testenv = testenv.WithContext(ctx)
4357

44-
manifests := [][]byte{
45-
neuronDevicePluginManifest,
46-
neuronDevicePlugiRbacManifest,
47-
}
48-
49-
testenv.Setup(
58+
var manifests [][]byte
59+
setUpFunctions := []env.Func{
5060
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
5161
err := fwext.ApplyManifests(config.Client().RESTConfig(), manifests...)
5262
if err != nil {
5363
return ctx, err
5464
}
5565
return ctx, nil
5666
},
57-
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
58-
ds := appsv1.DaemonSet{
59-
ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"},
60-
}
61-
err := wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds),
62-
wait.WithContext(ctx))
63-
if err != nil {
64-
return ctx, err
65-
}
66-
return ctx, nil
67-
},
68-
)
67+
}
68+
69+
if *installDevicePlugin {
70+
manifests = append(manifests, neuronDevicePluginManifest, neuronDevicePlugiRbacManifest)
71+
setUpFunctions = append(setUpFunctions, deployNeuronDevicePlugin)
72+
}
73+
74+
testenv.Setup(setUpFunctions...)
6975

7076
testenv.Finish(
7177
func(ctx context.Context, config *envconf.Config) (context.Context, error) {

e2e2/test/cases/nvidia/main_test.go

Lines changed: 109 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,14 @@ import (
2424
)
2525

2626
var (
27-
testenv env.Environment
28-
nodeType *string
29-
efaEnabled *bool
30-
nvidiaTestImage *string
31-
nodeCount int
32-
gpuPerNode int
33-
efaPerNode int
27+
testenv env.Environment
28+
nodeType *string
29+
installDevicePlugin *bool
30+
efaEnabled *bool
31+
nvidiaTestImage *string
32+
nodeCount int
33+
gpuPerNode int
34+
efaPerNode int
3435
)
3536

3637
var (
@@ -42,10 +43,97 @@ var (
4243
efaDevicePluginManifest []byte
4344
)
4445

46+
func deployMPIOperator(ctx context.Context, config *envconf.Config) (context.Context, error) {
47+
dep := appsv1.Deployment{
48+
ObjectMeta: metav1.ObjectMeta{Name: "mpi-operator", Namespace: "mpi-operator"},
49+
}
50+
err := wait.For(conditions.New(config.Client().Resources()).DeploymentConditionMatch(&dep, appsv1.DeploymentAvailable, v1.ConditionTrue),
51+
wait.WithContext(ctx))
52+
if err != nil {
53+
return ctx, fmt.Errorf("failed to deploy mpi-operator: %v", err)
54+
}
55+
return ctx, nil
56+
}
57+
58+
func deployNvidiaDevicePlugin(ctx context.Context, config *envconf.Config) (context.Context, error) {
59+
ds := appsv1.DaemonSet{
60+
ObjectMeta: metav1.ObjectMeta{Name: "nvidia-device-plugin-daemonset", Namespace: "kube-system"},
61+
}
62+
err := wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds),
63+
wait.WithContext(ctx))
64+
if err != nil {
65+
return ctx, fmt.Errorf("failed to deploy nvidia-device-plugin: %v", err)
66+
}
67+
return ctx, nil
68+
}
69+
70+
func deployEFAPlugin(ctx context.Context, config *envconf.Config) (context.Context, error) {
71+
err := fwext.ApplyManifests(config.Client().RESTConfig(), efaDevicePluginManifest)
72+
if err != nil {
73+
return ctx, err
74+
}
75+
76+
ds := appsv1.DaemonSet{
77+
ObjectMeta: metav1.ObjectMeta{Name: "aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"},
78+
}
79+
err = wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds),
80+
wait.WithContext(ctx))
81+
if err != nil {
82+
return ctx, fmt.Errorf("failed to deploy efa-device-plugin: %v", err)
83+
}
84+
85+
return ctx, nil
86+
}
87+
88+
func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Context, error) {
89+
clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig())
90+
if err != nil {
91+
return ctx, err
92+
}
93+
94+
nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
95+
if err != nil {
96+
return ctx, err
97+
}
98+
99+
singleNodeType := true
100+
for i := 1; i < len(nodes.Items)-1; i++ {
101+
if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] {
102+
singleNodeType = false
103+
}
104+
}
105+
if !singleNodeType {
106+
return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster")
107+
}
108+
109+
if *nodeType != "" {
110+
for _, v := range nodes.Items {
111+
if v.Labels["node.kubernetes.io/instance-type"] == *nodeType {
112+
nodeCount++
113+
gpu := v.Status.Capacity["nvidia.com/gpu"]
114+
gpuPerNode = int(gpu.Value())
115+
efa := v.Status.Capacity["vpc.amazonaws.com/efa"]
116+
efaPerNode = int(efa.Value())
117+
}
118+
}
119+
} else {
120+
log.Printf("No node type specified. Using the node type %s in the node groups.", nodes.Items[0].Labels["node.kubernetes.io/instance-type"])
121+
nodeType = aws.String(nodes.Items[0].Labels["node.kubernetes.io/instance-type"])
122+
nodeCount = len(nodes.Items)
123+
gpu := nodes.Items[0].Status.Capacity["nvidia.com/gpu"]
124+
gpuPerNode = int(gpu.Value())
125+
efa := nodes.Items[0].Status.Capacity["vpc.amazonaws.com/efa"]
126+
efaPerNode = int(efa.Value())
127+
}
128+
129+
return ctx, nil
130+
}
131+
45132
func TestMain(m *testing.M) {
46133
nodeType = flag.String("nodeType", "", "node type for the tests")
47134
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
48135
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
136+
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
49137
cfg, err := envconf.NewFromFlags()
50138
if err != nil {
51139
log.Fatalf("failed to initialize test environment: %v", err)
@@ -57,95 +145,30 @@ func TestMain(m *testing.M) {
57145

58146
// all NVIDIA tests require the device plugin and MPI operator
59147
manifests := [][]byte{
60-
nvidiaDevicePluginManifest,
61148
mpiOperatorManifest,
62149
}
63-
64-
testenv.Setup(
150+
setUpFunctions := []env.Func{
65151
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
66152
err := fwext.ApplyManifests(config.Client().RESTConfig(), manifests...)
67153
if err != nil {
68154
return ctx, err
69155
}
70156
return ctx, nil
71157
},
72-
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
73-
dep := appsv1.Deployment{
74-
ObjectMeta: metav1.ObjectMeta{Name: "mpi-operator", Namespace: "mpi-operator"},
75-
}
76-
err := wait.For(conditions.New(config.Client().Resources()).DeploymentConditionMatch(&dep, appsv1.DeploymentAvailable, v1.ConditionTrue),
77-
wait.WithContext(ctx))
78-
if err != nil {
79-
return ctx, fmt.Errorf("failed to deploy mpi-operator: %v", err)
80-
}
81-
return ctx, nil
82-
},
83-
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
84-
ds := appsv1.DaemonSet{
85-
ObjectMeta: metav1.ObjectMeta{Name: "nvidia-device-plugin-daemonset", Namespace: "kube-system"},
86-
}
87-
err := wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds),
88-
wait.WithContext(ctx))
89-
if err != nil {
90-
return ctx, fmt.Errorf("failed to deploy nvidia-device-plugin: %v", err)
91-
}
92-
return ctx, nil
93-
},
94-
func(ctx context.Context, config *envconf.Config) (context.Context, error) {
95-
clientset, err := kubernetes.NewForConfig(cfg.Client().RESTConfig())
96-
if err != nil {
97-
return ctx, err
98-
}
99-
if *efaEnabled {
100-
err := fwext.ApplyManifests(cfg.Client().RESTConfig(), efaDevicePluginManifest)
101-
if err != nil {
102-
return ctx, err
103-
}
104-
ds := appsv1.DaemonSet{
105-
ObjectMeta: metav1.ObjectMeta{Name: "aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"},
106-
}
107-
err = wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).DaemonSetReady(&ds),
108-
wait.WithContext(ctx))
109-
if err != nil {
110-
return ctx, fmt.Errorf("failed to deploy efa-device-plugin: %v", err)
111-
}
112-
}
113-
nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
114-
if err != nil {
115-
return ctx, err
116-
}
158+
deployMPIOperator,
159+
checkNodeTypes,
160+
}
117161

118-
singleNodeType := true
119-
for i := 1; i < len(nodes.Items)-1; i++ {
120-
if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] {
121-
singleNodeType = false
122-
}
123-
}
124-
if !singleNodeType {
125-
return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster")
126-
}
127-
if *nodeType != "" {
128-
for _, v := range nodes.Items {
129-
if v.Labels["node.kubernetes.io/instance-type"] == *nodeType {
130-
nodeCount++
131-
gpu := v.Status.Capacity["nvidia.com/gpu"]
132-
gpuPerNode = int(gpu.Value())
133-
efa := v.Status.Capacity["vpc.amazonaws.com/efa"]
134-
efaPerNode = int(efa.Value())
135-
}
136-
}
137-
} else {
138-
log.Printf("No node type specified. Using the node type %s in the node groups.", nodes.Items[0].Labels["node.kubernetes.io/instance-type"])
139-
nodeType = aws.String(nodes.Items[0].Labels["node.kubernetes.io/instance-type"])
140-
nodeCount = len(nodes.Items)
141-
gpu := nodes.Items[0].Status.Capacity["nvidia.com/gpu"]
142-
gpuPerNode = int(gpu.Value())
143-
efa := nodes.Items[0].Status.Capacity["vpc.amazonaws.com/efa"]
144-
efaPerNode = int(efa.Value())
145-
}
146-
return ctx, nil
147-
},
148-
)
162+
if *installDevicePlugin {
163+
manifests = append(manifests, nvidiaDevicePluginManifest)
164+
setUpFunctions = append(setUpFunctions, deployNvidiaDevicePlugin)
165+
}
166+
167+
if *efaEnabled {
168+
setUpFunctions = append(setUpFunctions, deployEFAPlugin)
169+
}
170+
171+
testenv.Setup(setUpFunctions...)
149172

150173
testenv.Finish(
151174
func(ctx context.Context, config *envconf.Config) (context.Context, error) {

kubetest2/internal/deployers/eksapi/nodegroup.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ func (m *NodegroupManager) createManagedNodegroup(infra *Infrastructure, cluster
157157
func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
158158
stackName := m.getUnmanagedNodegroupStackName()
159159
klog.Infof("creating unmanaged nodegroup stack...")
160-
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, opts.EFA, cluster)
160+
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, cluster)
161161
if err != nil {
162162
return err
163163
}
@@ -258,7 +258,7 @@ func (m *NodegroupManager) createUnmanagedNodegroup(infra *Infrastructure, clust
258258
func (m *NodegroupManager) createUnmanagedNodegroupWithEFA(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error {
259259
stackName := m.getUnmanagedNodegroupStackName()
260260
klog.Infof("creating unmanaged nodegroup with EFA stack...")
261-
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, opts.EFA, cluster)
261+
userData, userDataIsMimePart, err := generateUserData(opts.UserDataFormat, cluster)
262262
if err != nil {
263263
return err
264264
}

kubetest2/internal/deployers/eksapi/templates/templates.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ type UserDataTemplateData struct {
4141
CertificateAuthority string
4242
CIDR string
4343
APIServerEndpoint string
44-
EFAEnabled bool
4544
}
4645

4746
var (

kubetest2/internal/deployers/eksapi/templates/userdata_bottlerocket.toml.template

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,3 @@
55

66
[settings.host-containers.admin]
77
"enabled" = true
8-
9-
[settings.efa]
10-
"enabled" = {{.EFAEnabled}}

kubetest2/internal/deployers/eksapi/userdata.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import (
88
"github.com/aws/aws-k8s-tester/kubetest2/internal/deployers/eksapi/templates"
99
)
1010

11-
func generateUserData(format string, efaEnabled bool, cluster *Cluster) (string, bool, error) {
11+
func generateUserData(format string, cluster *Cluster) (string, bool, error) {
1212
userDataIsMimePart := true
1313
var t *template.Template
1414
switch format {
@@ -29,7 +29,6 @@ func generateUserData(format string, efaEnabled bool, cluster *Cluster) (string,
2929
CertificateAuthority: cluster.certificateAuthorityData,
3030
CIDR: cluster.cidr,
3131
Name: cluster.name,
32-
EFAEnabled: efaEnabled,
3332
}); err != nil {
3433
return "", false, err
3534
}

kubetest2/internal/deployers/eksapi/userdata_test.go

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,40 +42,33 @@ const bottlerocketUserData = `[settings.kubernetes]
4242
4343
[settings.host-containers.admin]
4444
"enabled" = true
45-
46-
[settings.efa]
47-
"enabled" = true
4845
`
4946

5047
func Test_generateUserData(t *testing.T) {
5148
cases := []struct {
5249
format string
53-
efa bool
5450
expected string
5551
expectedIsMimePart bool
5652
}{
5753
{
5854
format: "bootstrap.sh",
59-
efa: false,
6055
expected: bootstrapShUserData,
6156
expectedIsMimePart: true,
6257
},
6358
{
6459
format: "nodeadm",
65-
efa: false,
6660
expected: nodeadmUserData,
6761
expectedIsMimePart: true,
6862
},
6963
{
7064
format: "bottlerocket",
71-
efa: true,
7265
expected: bottlerocketUserData,
7366
expectedIsMimePart: false,
7467
},
7568
}
7669
for _, c := range cases {
7770
t.Run(c.format, func(t *testing.T) {
78-
actual, isMimePart, err := generateUserData(c.format, c.efa, &cluster)
71+
actual, isMimePart, err := generateUserData(c.format, &cluster)
7972
if err != nil {
8073
t.Log(err)
8174
t.Error(err)

0 commit comments

Comments
 (0)