
Commit b85cfbe

Run ddp_spawn dataloader checks on windows (#6930)
1 parent 3baac71 commit b85cfbe

File tree: 2 files changed, +45 -41 lines


pytorch_lightning/trainer/data_loading.py (+35 -36)
@@ -13,7 +13,6 @@
 # limitations under the License.
 import inspect
 import multiprocessing
-import platform
 from abc import ABC
 from copy import deepcopy
 from typing import Iterable, List, Tuple, Union
@@ -54,53 +53,53 @@ class TrainerDataLoadingMixin(ABC):
     dev_debugger: InternalDebugger
 
     def _worker_check(self, dataloader: DataLoader, name: str) -> None:
-        on_windows = platform.system() == 'Windows'
+        if not isinstance(dataloader, DataLoader):
+            return
 
-        # ddp_spawn + num_workers > 0 don't mix! tell the user
-        is_dataloader = isinstance(dataloader, DataLoader)
         using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn"
-        if is_dataloader and not on_windows:
-            if dataloader.num_workers > 0 and using_spawn:
-                # checks for the attr persistent_workers available in pytorch >= 1.7
-                if hasattr(dataloader, "persistent_workers"):
-                    if not dataloader.persistent_workers:
-                        rank_zero_warn(
-                            'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
-                            ' may result in data loading bottlenecks.'
-                            ' Consider setting persistent_workers=True'
-                            ' (this is a limitation of Python .spawn() and PyTorch)'
-                        )
-                else:
+        num_cpus = multiprocessing.cpu_count()
+
+        # ddp_spawn + num_workers > 0 don't mix! tell the user
+        if dataloader.num_workers > 0 and using_spawn:
+            # checks for the attr persistent_workers available in pytorch >= 1.7
+            if hasattr(dataloader, "persistent_workers"):
+                if not dataloader.persistent_workers:
                     rank_zero_warn(
-                        'num_workers>0 and accelerator=ddp_spawn do not mix well'
-                        ' and may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp to use num_workers>0'
+                        'num_workers>0, persistent_workers=False, and accelerator=ddp_spawn'
+                        ' may result in data loading bottlenecks.'
+                        ' Consider setting persistent_workers=True'
                         ' (this is a limitation of Python .spawn() and PyTorch)'
                     )
+            else:
+                rank_zero_warn(
+                    'num_workers>0 and accelerator=ddp_spawn do not mix well'
+                    ' and may result in data loading bottlenecks.'
+                    ' Consider setting accelerator=ddp to use num_workers>0'
+                    ' (this is a limitation of Python .spawn() and PyTorch)'
+                )
 
-            elif dataloader.num_workers == 0 and using_spawn:
-                # checks for the attr persistent_workers available in pytorch >= 1.7
-                if hasattr(dataloader, "persistent_workers"):
-                    if not dataloader.persistent_workers:
-                        rank_zero_warn(
-                            'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
-                            ' Consider setting num_workers>0 and persistent_workers=True'
-                        )
-                else:
+        elif dataloader.num_workers == 0 and using_spawn:
+            # checks for the attr persistent_workers available in pytorch >= 1.7
+            if hasattr(dataloader, "persistent_workers"):
+                if not dataloader.persistent_workers:
                     rank_zero_warn(
                         'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
-                        ' Consider setting accelerator=ddp and set num_workers>0'
+                        ' Consider setting num_workers>0 and persistent_workers=True'
                     )
-
-            elif dataloader.num_workers <= 2 and multiprocessing.cpu_count() > 2 and not using_spawn:
-                num_cpus = multiprocessing.cpu_count()
+            else:
                 rank_zero_warn(
-                    f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
-                    ' Consider increasing the value of the `num_workers` argument`'
-                    f' (try {num_cpus} which is the number of cpus on this machine)'
-                    f' in the `DataLoader` init to improve performance.'
+                    'accelerator=ddp_spawn and num_workers=0 may result in data loading bottlenecks.'
+                    ' Consider setting accelerator=ddp and set num_workers>0'
                 )
 
+        elif dataloader.num_workers <= 2 < num_cpus and not using_spawn:
+            rank_zero_warn(
+                f'The dataloader, {name}, does not have many workers which may be a bottleneck.'
+                ' Consider increasing the value of the `num_workers` argument`'
+                f' (try {num_cpus} which is the number of cpus on this machine)'
+                f' in the `DataLoader` init to improve performance.'
+            )
+
     def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader:
 
         # don't do anything if it's not a dataloader
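
For context, the warning kept (and now also raised on Windows) by this change asks ddp_spawn users with num_workers > 0 to enable persistent workers on their DataLoader. The snippet below is a minimal, illustrative sketch, not part of this commit: the dataset, batch size, and worker count are placeholders. It shows a DataLoader configured so the check above does not warn, and spells out the chained comparison in the new elif, where num_workers <= 2 < num_cpus is equivalent to num_workers <= 2 and 2 < num_cpus.

# Illustrative only -- not part of this commit.
import multiprocessing

import torch
from torch.utils.data import DataLoader, TensorDataset

# placeholder map-style dataset
dataset = TensorDataset(torch.randn(64, 3), torch.randint(0, 2, (64,)))

train_loader = DataLoader(
    dataset,
    batch_size=8,
    num_workers=4,            # > 0, so workers are kept busy under ddp_spawn
    persistent_workers=True,  # requires PyTorch >= 1.7; avoids the ddp_spawn warning above
)

# The new `elif dataloader.num_workers <= 2 < num_cpus` is a chained comparison:
num_workers, num_cpus = train_loader.num_workers, multiprocessing.cpu_count()
assert (num_workers <= 2 < num_cpus) == (num_workers <= 2 and 2 < num_cpus)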

tests/models/test_horovod.py (+10 -5)
@@ -49,9 +49,9 @@ def _run_horovod(trainer_options, on_gpu=False):
     # for Horovod, we interpret `gpus` to be set per worker
     trainer_options.update(gpus=1 if on_gpu else None)
     tutils.reset_seed()
-    # todo: Find why coverage breaks CI.
+    # TODO: Find out why coverage breaks CI.
     # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''
-    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,  # noqa E265
+    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,
     cmdline = [
         'horovodrun', '-np',
         str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
@@ -151,9 +151,10 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
     _run_horovod(trainer_options, on_gpu=True)
 
 
+# todo: need to be fixed :]
 # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994
 # Check with (tgaddair) on Horovod issues if this feature is needed
-@pytest.mark.skip(reason="Horovod currently doesn't work with Apex")  # todo
+@pytest.mark.skip(reason="TODO: Horovod currently doesn't work with Apex")
 @RunIf(min_gpus=2, skip_windows=True, amp_apex=True, horovod_nccl=True)
 def test_horovod_apex(tmpdir):
     """Test Horovod with multi-GPU support using apex amp."""
@@ -240,6 +241,8 @@ def validation_step(self, batch, *args, **kwargs):
     tpipes.run_model_test_without_loggers(trainer_options, model)
 
 
+# todo: need to be fixed :]
+@pytest.mark.skip('TODO: flaky test - Fatal Python error: Aborted')
 @RunIf(skip_windows=True, horovod=True)
 def test_horovod_multi_optimizer(tmpdir):
     model = BasicGAN()
@@ -272,7 +275,8 @@ def get_optimizer_params(optimizer):
     assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+# todo: need to be fixed :]
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True)
 def test_result_reduce_horovod(tmpdir):
     """Make sure result logging works with Horovod.
@@ -322,7 +326,8 @@ def training_epoch_end(self, outputs) -> None:
     horovod.run(hvd_test_fn, np=2)
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+# todo: need to be fixed :]
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
 def test_accuracy_metric_horovod():
     num_batches = 10
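
The test changes above replace condition-less @pytest.mark.skipif(reason=...) markers with the unconditional @pytest.mark.skip(reason=...), which states the intent directly and records the skip reason. A minimal sketch of the two markers (hypothetical test names, not from this repository; skipif is shown with an explicit condition, its usual form):

# Minimal sketch, not from this repository: the two pytest skip markers used above.
import sys

import pytest


@pytest.mark.skip(reason="TODO: flaky on CI")  # unconditional skip, as in the commit above
def test_always_skipped():
    assert False  # never executed


@pytest.mark.skipif(sys.platform == "win32", reason="POSIX-only behaviour")  # conditional skip
def test_skipped_on_windows_only():
    assert True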
