@@ -24,13 +24,14 @@ import (
24
24
)
25
25
26
26
// Package-level state shared by the tests in this package.
//
// nodeType, installDevicePlugin, efaEnabled and nvidiaTestImage are
// command-line flags registered in TestMain. nodeCount, gpuPerNode and
// efaPerNode are filled in by checkNodeTypes from the cluster's node list.
// testenv is the environment on which TestMain registers its Setup and
// Finish steps.
var (
27
- testenv env.Environment
28
- nodeType * string
29
- efaEnabled * bool
30
- nvidiaTestImage * string
31
- nodeCount int
32
- gpuPerNode int
33
- efaPerNode int
27
+ testenv env.Environment
28
+ nodeType * string
29
+ installDevicePlugin * bool
30
+ efaEnabled * bool
31
+ nvidiaTestImage * string
32
+ nodeCount int
33
+ gpuPerNode int
34
+ efaPerNode int
34
35
)
35
36
36
37
var (
@@ -42,10 +43,97 @@ var (
42
43
efaDevicePluginManifest []byte
43
44
)
44
45
46
+ func deployMPIOperator (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
47
+ dep := appsv1.Deployment {
48
+ ObjectMeta : metav1.ObjectMeta {Name : "mpi-operator" , Namespace : "mpi-operator" },
49
+ }
50
+ err := wait .For (conditions .New (config .Client ().Resources ()).DeploymentConditionMatch (& dep , appsv1 .DeploymentAvailable , v1 .ConditionTrue ),
51
+ wait .WithContext (ctx ))
52
+ if err != nil {
53
+ return ctx , fmt .Errorf ("failed to deploy mpi-operator: %v" , err )
54
+ }
55
+ return ctx , nil
56
+ }
57
+
58
+ func deployNvidiaDevicePlugin (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
59
+ ds := appsv1.DaemonSet {
60
+ ObjectMeta : metav1.ObjectMeta {Name : "nvidia-device-plugin-daemonset" , Namespace : "kube-system" },
61
+ }
62
+ err := wait .For (fwext .NewConditionExtension (config .Client ().Resources ()).DaemonSetReady (& ds ),
63
+ wait .WithContext (ctx ))
64
+ if err != nil {
65
+ return ctx , fmt .Errorf ("failed to deploy nvidia-device-plugin: %v" , err )
66
+ }
67
+ return ctx , nil
68
+ }
69
+
70
+ func deployEFAPlugin (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
71
+ err := fwext .ApplyManifests (config .Client ().RESTConfig (), efaDevicePluginManifest )
72
+ if err != nil {
73
+ return ctx , err
74
+ }
75
+
76
+ ds := appsv1.DaemonSet {
77
+ ObjectMeta : metav1.ObjectMeta {Name : "aws-efa-k8s-device-plugin-daemonset" , Namespace : "kube-system" },
78
+ }
79
+ err = wait .For (fwext .NewConditionExtension (config .Client ().Resources ()).DaemonSetReady (& ds ),
80
+ wait .WithContext (ctx ))
81
+ if err != nil {
82
+ return ctx , fmt .Errorf ("failed to deploy efa-device-plugin: %v" , err )
83
+ }
84
+
85
+ return ctx , nil
86
+ }
87
+
88
+ func checkNodeTypes (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
89
+ clientset , err := kubernetes .NewForConfig (config .Client ().RESTConfig ())
90
+ if err != nil {
91
+ return ctx , err
92
+ }
93
+
94
+ nodes , err := clientset .CoreV1 ().Nodes ().List (ctx , metav1.ListOptions {})
95
+ if err != nil {
96
+ return ctx , err
97
+ }
98
+
99
+ singleNodeType := true
100
+ for i := 1 ; i < len (nodes .Items )- 1 ; i ++ {
101
+ if nodes .Items [i ].Labels ["node.kubernetes.io/instance-type" ] != nodes .Items [i - 1 ].Labels ["node.kubernetes.io/instance-type" ] {
102
+ singleNodeType = false
103
+ }
104
+ }
105
+ if ! singleNodeType {
106
+ return ctx , fmt .Errorf ("Node types are not the same, all node types must be the same in the cluster" )
107
+ }
108
+
109
+ if * nodeType != "" {
110
+ for _ , v := range nodes .Items {
111
+ if v .Labels ["node.kubernetes.io/instance-type" ] == * nodeType {
112
+ nodeCount ++
113
+ gpu := v .Status .Capacity ["nvidia.com/gpu" ]
114
+ gpuPerNode = int (gpu .Value ())
115
+ efa := v .Status .Capacity ["vpc.amazonaws.com/efa" ]
116
+ efaPerNode = int (efa .Value ())
117
+ }
118
+ }
119
+ } else {
120
+ log .Printf ("No node type specified. Using the node type %s in the node groups." , nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
121
+ nodeType = aws .String (nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
122
+ nodeCount = len (nodes .Items )
123
+ gpu := nodes .Items [0 ].Status .Capacity ["nvidia.com/gpu" ]
124
+ gpuPerNode = int (gpu .Value ())
125
+ efa := nodes .Items [0 ].Status .Capacity ["vpc.amazonaws.com/efa" ]
126
+ efaPerNode = int (efa .Value ())
127
+ }
128
+
129
+ return ctx , nil
130
+ }
131
+
45
132
func TestMain (m * testing.M ) {
46
133
nodeType = flag .String ("nodeType" , "" , "node type for the tests" )
47
134
nvidiaTestImage = flag .String ("nvidiaTestImage" , "" , "nccl test image for nccl tests" )
48
135
efaEnabled = flag .Bool ("efaEnabled" , false , "enable efa tests" )
136
+ installDevicePlugin = flag .Bool ("installDevicePlugin" , true , "install nvidia device plugin" )
49
137
cfg , err := envconf .NewFromFlags ()
50
138
if err != nil {
51
139
log .Fatalf ("failed to initialize test environment: %v" , err )
@@ -57,95 +145,30 @@ func TestMain(m *testing.M) {
57
145
58
146
// all NVIDIA tests require the MPI operator; the NVIDIA device plugin manifest is added below only when installDevicePlugin is set
59
147
manifests := [][]byte {
60
- nvidiaDevicePluginManifest ,
61
148
mpiOperatorManifest ,
62
149
}
63
-
64
- testenv .Setup (
150
+ setUpFunctions := []env.Func {
65
151
func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
66
152
err := fwext .ApplyManifests (config .Client ().RESTConfig (), manifests ... )
67
153
if err != nil {
68
154
return ctx , err
69
155
}
70
156
return ctx , nil
71
157
},
72
- func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
73
- dep := appsv1.Deployment {
74
- ObjectMeta : metav1.ObjectMeta {Name : "mpi-operator" , Namespace : "mpi-operator" },
75
- }
76
- err := wait .For (conditions .New (config .Client ().Resources ()).DeploymentConditionMatch (& dep , appsv1 .DeploymentAvailable , v1 .ConditionTrue ),
77
- wait .WithContext (ctx ))
78
- if err != nil {
79
- return ctx , fmt .Errorf ("failed to deploy mpi-operator: %v" , err )
80
- }
81
- return ctx , nil
82
- },
83
- func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
84
- ds := appsv1.DaemonSet {
85
- ObjectMeta : metav1.ObjectMeta {Name : "nvidia-device-plugin-daemonset" , Namespace : "kube-system" },
86
- }
87
- err := wait .For (fwext .NewConditionExtension (config .Client ().Resources ()).DaemonSetReady (& ds ),
88
- wait .WithContext (ctx ))
89
- if err != nil {
90
- return ctx , fmt .Errorf ("failed to deploy nvidia-device-plugin: %v" , err )
91
- }
92
- return ctx , nil
93
- },
94
- func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
95
- clientset , err := kubernetes .NewForConfig (cfg .Client ().RESTConfig ())
96
- if err != nil {
97
- return ctx , err
98
- }
99
- if * efaEnabled {
100
- err := fwext .ApplyManifests (cfg .Client ().RESTConfig (), efaDevicePluginManifest )
101
- if err != nil {
102
- return ctx , err
103
- }
104
- ds := appsv1.DaemonSet {
105
- ObjectMeta : metav1.ObjectMeta {Name : "aws-efa-k8s-device-plugin-daemonset" , Namespace : "kube-system" },
106
- }
107
- err = wait .For (fwext .NewConditionExtension (cfg .Client ().Resources ()).DaemonSetReady (& ds ),
108
- wait .WithContext (ctx ))
109
- if err != nil {
110
- return ctx , fmt .Errorf ("failed to deploy efa-device-plugin: %v" , err )
111
- }
112
- }
113
- nodes , err := clientset .CoreV1 ().Nodes ().List (ctx , metav1.ListOptions {})
114
- if err != nil {
115
- return ctx , err
116
- }
158
+ deployMPIOperator ,
159
+ checkNodeTypes ,
160
+ }
117
161
118
- singleNodeType := true
119
- for i := 1 ; i < len (nodes .Items )- 1 ; i ++ {
120
- if nodes .Items [i ].Labels ["node.kubernetes.io/instance-type" ] != nodes .Items [i - 1 ].Labels ["node.kubernetes.io/instance-type" ] {
121
- singleNodeType = false
122
- }
123
- }
124
- if ! singleNodeType {
125
- return ctx , fmt .Errorf ("Node types are not the same, all node types must be the same in the cluster" )
126
- }
127
- if * nodeType != "" {
128
- for _ , v := range nodes .Items {
129
- if v .Labels ["node.kubernetes.io/instance-type" ] == * nodeType {
130
- nodeCount ++
131
- gpu := v .Status .Capacity ["nvidia.com/gpu" ]
132
- gpuPerNode = int (gpu .Value ())
133
- efa := v .Status .Capacity ["vpc.amazonaws.com/efa" ]
134
- efaPerNode = int (efa .Value ())
135
- }
136
- }
137
- } else {
138
- log .Printf ("No node type specified. Using the node type %s in the node groups." , nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
139
- nodeType = aws .String (nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
140
- nodeCount = len (nodes .Items )
141
- gpu := nodes .Items [0 ].Status .Capacity ["nvidia.com/gpu" ]
142
- gpuPerNode = int (gpu .Value ())
143
- efa := nodes .Items [0 ].Status .Capacity ["vpc.amazonaws.com/efa" ]
144
- efaPerNode = int (efa .Value ())
145
- }
146
- return ctx , nil
147
- },
148
- )
162
+ if * installDevicePlugin {
163
+ manifests = append (manifests , nvidiaDevicePluginManifest )
164
+ setUpFunctions = append (setUpFunctions , deployNvidiaDevicePlugin )
165
+ }
166
+
167
+ if * efaEnabled {
168
+ setUpFunctions = append (setUpFunctions , deployEFAPlugin )
169
+ }
170
+
171
+ testenv .Setup (setUpFunctions ... )
149
172
150
173
testenv .Finish (
151
174
func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
0 commit comments