 from pytorch_lightning import Trainer
 from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.utilities import _HOROVOD_AVAILABLE
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import BasicGAN
 from tests.helpers.runif import RunIf

 TEST_SCRIPT = os.path.join(os.path.dirname(__file__), "data", "horovod", "train_default_model.py")


-def _run_horovod(trainer_options, on_gpu=False):
+def _run_horovod(trainer_options):
     """Execute the training script across multiple workers in parallel."""
-    num_processes = trainer_options.get("gpus", 2)
-    # for Horovod, we interpret `gpus` to be set per worker
-    trainer_options.update(gpus=1 if on_gpu else None)
+    devices = trainer_options.get("devices", 1)
     tutils.reset_seed()
     # TODO: Find out why coverage breaks CI.
     # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''
     # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,
     cmdline = [
         "horovodrun",
         "-np",
-        str(num_processes),
+        str(devices),
         sys.executable,
         TEST_SCRIPT,
         "--trainer-options",
         shlex.quote(json.dumps(trainer_options)),
     ]
-    if on_gpu:
+    if trainer_options.get("accelerator", "cpu") == "gpu":
         cmdline += ["--on-gpu"]
     exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
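For orientation (not part of the patch): with the rewrite above, `_run_horovod` derives the worker count from the `devices` entry of `trainer_options` (defaulting to 1) and infers GPU mode from the `accelerator` entry instead of taking a separate `on_gpu` flag. A minimal usage sketch:

    # Illustrative call only; the option names mirror the Trainer arguments used by the tests below.
    trainer_options = dict(max_epochs=1, accelerator="gpu", devices=2, strategy="horovod")
    # Spawns: horovodrun -np 2 <python> train_default_model.py --trainer-options '<json>' --on-gpu
    _run_horovod(trainer_options)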
@@ -82,6 +81,20 @@ def test_horovod_cpu(tmpdir):
     _run_horovod(trainer_options)


+@RunIf(skip_windows=True, horovod=True, skip_49370=True)
+def test_horovod_cpu_accumulate_grad_batches(tmpdir):
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        enable_progress_bar=False,
+        max_epochs=1,
+        limit_train_batches=4,
+        limit_val_batches=0,
+        accumulate_grad_batches=2,
+        strategy="horovod",
+    )
+    _run_horovod(trainer_options)
+
+
 @RunIf(skip_windows=True, horovod=True, skip_49370=True)
 def test_horovod_cpu_clip_grad_by_value(tmpdir):
     """Test Horovod running multi-process on CPU."""
@@ -125,10 +138,44 @@ def test_horovod_multi_gpu(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
+        strategy="horovod",
+    )
+    _run_horovod(trainer_options)
+
+
+@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
+def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        enable_progress_bar=False,
+        max_epochs=1,
+        limit_train_batches=4,
+        limit_val_batches=0,
+        accumulate_grad_batches=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
     )
-    _run_horovod(trainer_options, on_gpu=True)
+    _run_horovod(trainer_options)
+
+
+@RunIf(horovod=True, skip_windows=True)
+def test_horovod_raises_unsupported_accumulate_grad_batches(tmpdir):
+    """Ensure MisconfigurationException is raised when `accumulate_grad_batches` is scheduled to change
+    between epochs with the Horovod strategy."""
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        enable_progress_bar=False,
+        accumulate_grad_batches={0: 4, 2: 2},
+        accelerator="auto",
+        devices=1,
+        strategy="horovod",
+    )
+    with pytest.raises(MisconfigurationException, match="Horovod.*does not support.*accumulate_grad_batches"):
+        trainer.fit(model)


 @RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
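Note on the new `test_horovod_raises_unsupported_accumulate_grad_batches`: it assumes the Horovod strategy validates `accumulate_grad_batches` up front and rejects a per-epoch schedule such as `{0: 4, 2: 2}`. A rough sketch of the kind of check the expected error message implies (the actual hook name and wording inside pytorch_lightning may differ; this is only illustrative):

    # Sketch under assumptions: Horovod's DistributedOptimizer is created once with a fixed
    # backward_passes_per_step, so only a single constant accumulation factor can be honored.
    def _validate_accumulate_grad_batches(accumulate_grad_batches):
        if not isinstance(accumulate_grad_batches, int):
            raise MisconfigurationException(
                "Horovod currently does not support different `accumulate_grad_batches` at different epochs, "
                f"got {accumulate_grad_batches}."
            )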
@@ -143,10 +190,11 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
     )
-    _run_horovod(trainer_options, on_gpu=True)
+    _run_horovod(trainer_options)


 # todo: need to be fixed :]
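The hunks above and below repeat the same mechanical substitution: the `gpus=N` Trainer argument is swapped for the `accelerator`/`devices` pair, which configures the same thing in the current Trainer API, e.g.:

    # older spelling being replaced throughout this patch
    Trainer(gpus=2, strategy="horovod")
    # equivalent spelling used after this patch
    Trainer(accelerator="gpu", devices=2, strategy="horovod")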
@@ -164,12 +212,13 @@ def test_horovod_apex(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
         amp_backend="apex",
         precision=16,
     )
-    _run_horovod(trainer_options, on_gpu=True)
+    _run_horovod(trainer_options)


 @RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
@@ -183,12 +232,13 @@ def test_horovod_amp(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
         amp_backend="native",
         precision=16,
     )
-    _run_horovod(trainer_options, on_gpu=True)
+    _run_horovod(trainer_options)


 @RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
@@ -202,10 +252,11 @@ def test_horovod_gather(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
     )
-    _run_horovod(trainer_options, on_gpu=True)
+    _run_horovod(trainer_options)


 @RunIf(min_gpus=1, skip_windows=True, horovod_nccl=True)
@@ -227,7 +278,8 @@ def validation_step(self, batch, *args, **kwargs):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=1,
+        accelerator="gpu",
+        devices=1,
         strategy="horovod",
     )
     tpipes.run_model_test_without_loggers(trainer_options, model)