Commit 71d65a6

SeanNaren committed
Merge branch 'master' into fix/deepspeed_logging_per_gpu
2 parents 7d9dd6f + 4117028

29 files changed: +115 additions, -116 deletions

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -136,6 +136,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed deprecated `CheckpointConnector.hpc_load` property in favor of `CheckpointConnector.restore` ([#10525](https://github.com/PyTorchLightning/pytorch-lightning/pull/10525))
 
 
+- Removed deprecated `reload_dataloaders_every_epoch` from `Trainer` in favour of `reload_dataloaders_every_n_epochs` ([#10481](https://github.com/PyTorchLightning/pytorch-lightning/pull/10481))
+
+
 
 ### Fixed
 
@@ -157,6 +160,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461))
 
 
+- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486))
+
+
+-
+
+
 -
 
 ## [1.5.1] - 2021-11-09
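
The removed flag maps one-to-one onto the integer-valued replacement. A minimal migration sketch in Python (an illustrative snippet, not part of this commit):

from pytorch_lightning import Trainer

# Before (argument removed in this release):
# trainer = Trainer(reload_dataloaders_every_epoch=True)

# After: state how many epochs should pass between reloads.
trainer = Trainer(reload_dataloaders_every_n_epochs=1)  # reload every epoch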

docs/source/_templates/layout.html

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 {% block footer %}
 {{ super() }}
 <script script type="text/javascript">
-  var collapsedSections = ['Best practices', 'Lightning API', 'Optional extensions', 'Tutorials', 'API References', 'Bolts', 'Examples', 'Partner Domain Frameworks', 'Community'];
+  var collapsedSections = ['Best practices', 'Optional extensions', 'Tutorials', 'API References', 'Bolts', 'Examples', 'Partner Domain Frameworks', 'Community'];
 </script>
 
 {% endblock %}

pl_examples/loop_examples/kfold.py

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ def on_run_end(self) -> None:
         voting_model = EnsembleVotingModel(type(self.trainer.lightning_module), checkpoint_paths)
         voting_model.trainer = self.trainer
         # This requires to connect the new model and move it the right device.
-        self.trainer.accelerator.connect(voting_model)
+        self.trainer.training_type_plugin.connect(voting_model)
         self.trainer.training_type_plugin.model_to_device()
         self.trainer.test_loop.run()

pl_examples/loop_examples/yielding_training_step.py

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ def _training_step(self, generator):
         # Here, instead of calling `lightning_module.training_step()`
         # we call next() on the generator!
         training_step_output = next(generator)
-        self.trainer.accelerator.post_training_step()
+        self.trainer.training_type_plugin.post_training_step()
 
         training_step_output = self.trainer.call_hook("training_step_end", training_step_output)


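Both loop examples above make the same call-site migration: hooks previously reached via `trainer.accelerator` are now reached via `trainer.training_type_plugin`. A minimal sketch of the renamed call path (the `post_step` helper is hypothetical, shown only to illustrate the pattern):

def post_step(trainer, training_step_output):
    # old: trainer.accelerator.post_training_step()
    trainer.training_type_plugin.post_training_step()
    return training_step_output
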
pytorch_lightning/trainer/connectors/data_connector.py

Lines changed: 0 additions & 8 deletions
@@ -64,7 +64,6 @@ def on_trainer_init(
         self,
         check_val_every_n_epoch: int,
         reload_dataloaders_every_n_epochs: int,
-        reload_dataloaders_every_epoch: bool,
         prepare_data_per_node: Optional[bool] = None,
     ) -> None:
         self.trainer.datamodule = None
@@ -83,13 +82,6 @@ def on_trainer_init(
 
         self.trainer.check_val_every_n_epoch = check_val_every_n_epoch
 
-        if reload_dataloaders_every_epoch:
-            reload_dataloaders_every_n_epochs = int(reload_dataloaders_every_epoch)
-            rank_zero_deprecation(
-                "`reload_dataloaders_every_epoch` is deprecated in v1.4 and will be removed in v1.6."
-                " Please use `reload_dataloaders_every_n_epochs` in Trainer."
-            )
-
         if not isinstance(reload_dataloaders_every_n_epochs, int) or (reload_dataloaders_every_n_epochs < 0):
             raise MisconfigurationException(
                 f"`reload_dataloaders_every_n_epochs` should be an int >= 0, got {reload_dataloaders_every_n_epochs}."

pytorch_lightning/trainer/data_loading.py

Lines changed: 9 additions & 9 deletions
@@ -438,8 +438,7 @@ def _reset_eval_dataloader(
         for loader_i in range(len(dataloaders)):
             loader = dataloaders[loader_i]
 
-            if hasattr(loader, "sampler") and isinstance(loader.sampler, RandomSampler):
-
+            if hasattr(loader, "sampler") and not isinstance(loader.sampler, SequentialSampler):
                 # when overfitting, the dataloader should not have sampler
                 if self.overfit_batches > 0 and mode.evaluating:
                     rank_zero_warn(
@@ -591,16 +590,17 @@ def _add_sampler_metadata_collate(dataloader: DataLoader) -> None:
 
     @staticmethod
     def _resolve_overfit_batches(dataloader: Collection[DataLoader]) -> Collection[DataLoader]:
-        has_random_sampler = False
+        all_have_sequential_sampler = True
 
-        def resolve_had_random_sampler(dataloader: DataLoader):
-            nonlocal has_random_sampler
-            if not has_random_sampler:
-                has_random_sampler = isinstance(dataloader.sampler, RandomSampler)
+        def resolve_has_no_sequential_sampler(dataloader: DataLoader):
+            nonlocal all_have_sequential_sampler
+            all_have_sequential_sampler = all_have_sequential_sampler & isinstance(
+                dataloader.sampler, SequentialSampler
+            )
 
-        apply_to_collection(dataloader, DataLoader, resolve_had_random_sampler)
+        apply_to_collection(dataloader, DataLoader, resolve_has_no_sequential_sampler)
 
-        if has_random_sampler:
+        if not all_have_sequential_sampler:
             rank_zero_warn(
                 "You requested to overfit but enabled training dataloader shuffling."
                 " We are turning off the training dataloader shuffling for you."
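
The reworked check above treats any non-`SequentialSampler` as shuffling, not just `RandomSampler`. A small illustration of which loaders would now be flagged (toy dataset, illustrative only, not part of this commit):

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

dataset = list(range(8))
shuffled = DataLoader(dataset, sampler=RandomSampler(dataset))
ordered = DataLoader(dataset, sampler=SequentialSampler(dataset))

# With the change above, `overfit_batches` only warns and swaps the sampler
# when a loader is not already sequential, so custom non-sequential samplers
# are now caught as well, while plain sequential loaders are left alone.
print(isinstance(shuffled.sampler, SequentialSampler))  # False -> would be replaced
print(isinstance(ordered.sampler, SequentialSampler))   # True  -> kept as-is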

pytorch_lightning/trainer/trainer.py

Lines changed: 0 additions & 8 deletions
@@ -162,7 +162,6 @@ def __init__(
         benchmark: bool = False,
         deterministic: bool = False,
         reload_dataloaders_every_n_epochs: int = 0,
-        reload_dataloaders_every_epoch: bool = False,
         auto_lr_find: Union[bool, str] = False,
         replace_sampler_ddp: bool = True,
         detect_anomaly: bool = False,
@@ -341,12 +340,6 @@ def __init__(
 
             reload_dataloaders_every_n_epochs: Set to a non-negative integer to reload dataloaders every n epochs.
 
-            reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch.
-
-                .. deprecated:: v1.4
-                    ``reload_dataloaders_every_epoch`` has been deprecated in v1.4 and will be removed in v1.6.
-                    Please use ``reload_dataloaders_every_n_epochs``.
-
             replace_sampler_ddp: Explicitly enables or disables sampler replacement. If not specified this
                 will toggled automatically when DDP is used. By default it will add ``shuffle=True`` for
                 train sampler and ``shuffle=False`` for val/test sampler. If you want to customize it,
@@ -515,7 +508,6 @@ def __init__(
         self._data_connector.on_trainer_init(
             check_val_every_n_epoch,
             reload_dataloaders_every_n_epochs,
-            reload_dataloaders_every_epoch,
             prepare_data_per_node,
         )
 
tests/callbacks/test_early_stopping.py

Lines changed: 5 additions & 5 deletions
@@ -381,16 +381,16 @@ def on_train_end(self) -> None:
 
 _ES_CHECK = dict(check_on_train_epoch_end=True)
 _ES_CHECK_P3 = dict(patience=3, check_on_train_epoch_end=True)
-_NO_WIN = dict(marks=RunIf(skip_windows=True))
+_SPAWN_MARK = dict(marks=RunIf(skip_windows=True, skip_49370=True))
 
 
 @pytest.mark.parametrize(
     "callbacks, expected_stop_epoch, check_on_train_epoch_end, strategy, num_processes",
     [
         ([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, None, 1),
         ([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, None, 1),
-        pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_NO_WIN),
-        pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_NO_WIN),
+        pytest.param([EarlyStopping("abc"), EarlyStopping("cba", patience=3)], 3, False, "ddp_spawn", 2, **_SPAWN_MARK),
+        pytest.param([EarlyStopping("cba", patience=3), EarlyStopping("abc")], 3, False, "ddp_spawn", 2, **_SPAWN_MARK),
         ([EarlyStopping("abc", **_ES_CHECK), EarlyStopping("cba", **_ES_CHECK_P3)], 3, True, None, 1),
         ([EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)], 3, True, None, 1),
         pytest.param(
@@ -399,15 +399,15 @@ def on_train_end(self) -> None:
             True,
             "ddp_spawn",
             2,
-            **_NO_WIN,
+            **_SPAWN_MARK,
         ),
         pytest.param(
             [EarlyStopping("cba", **_ES_CHECK_P3), EarlyStopping("abc", **_ES_CHECK)],
             3,
             True,
             "ddp_spawn",
             2,
-            **_NO_WIN,
+            **_SPAWN_MARK,
         ),
     ],
 )

tests/callbacks/test_pruning.py

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ def test_pruning_callback_ddp_spawn(tmpdir):
     train_with_pruning_callback(tmpdir, use_global_unstructured=True, strategy="ddp_spawn", gpus=2)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_pruning_callback_ddp_cpu(tmpdir):
     train_with_pruning_callback(tmpdir, parameters_to_prune=True, strategy="ddp_spawn", num_processes=2)

tests/callbacks/test_stochastic_weight_avg.py

Lines changed: 1 addition & 1 deletion
@@ -148,7 +148,7 @@ def test_swa_callback_ddp_spawn(tmpdir):
     train_with_swa(tmpdir, strategy="ddp_spawn", gpus=2)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_swa_callback_ddp_cpu(tmpdir):
     train_with_swa(tmpdir, strategy="ddp_spawn", num_processes=2)

tests/checkpointing/test_model_checkpoint.py

Lines changed: 1 addition & 1 deletion
@@ -385,7 +385,7 @@ def on_train_end(self, trainer, pl_module):
         assert torch.save.call_count == 0
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_model_checkpoint_no_extraneous_invocations(tmpdir):
     """Test to ensure that the model callback saves the checkpoints only once in distributed mode."""
     model = LogInTwoMethods()

tests/checkpointing/test_torch_saving.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def test_model_torch_save(tmpdir):
     trainer = torch.load(temp_path)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_model_torch_save_ddp_cpu(tmpdir):
     """Test to ensure torch save does not fail for model and trainer using cpu ddp."""
     model = BoringModel()

tests/deprecated_api/test_remove_1-6.py

Lines changed: 0 additions & 49 deletions
This file was deleted.

tests/deprecated_api/test_remove_1-7.py

Lines changed: 1 addition & 1 deletion
@@ -245,7 +245,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None:
         return super().get_from_queue(queue)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_v1_7_0_deprecate_add_get_queue(tmpdir):
     model = BoringCallbackDDPSpawnModel()
     trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, num_processes=2, strategy="ddp_spawn")

tests/helpers/runif.py

Lines changed: 11 additions & 0 deletions
@@ -70,6 +70,7 @@ def __new__(
         fairscale_fully_sharded: bool = False,
         deepspeed: bool = False,
         rich: bool = False,
+        skip_49370: bool = False,
         **kwargs,
     ):
         """
@@ -91,6 +92,7 @@ def __new__(
             fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test
             deepspeed: if `deepspeed` module is required to run the test
             rich: if `rich` module is required to run the test
+            skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370.
            kwargs: native pytest.mark.skipif keyword arguments
         """
         conditions = []
@@ -165,6 +167,15 @@ def __new__(
             conditions.append(not _RICH_AVAILABLE)
             reasons.append("Rich")
 
+        if skip_49370:
+            # strategy=ddp_spawn, accelerator=cpu, python>=3.9, torch<1.8 does not work
+            py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
+            ge_3_9 = Version(py_version) >= Version("3.9")
+            torch_version = get_distribution("torch").version
+            old_torch = Version(torch_version) < Version("1.8")
+            conditions.append(ge_3_9 and old_torch)
+            reasons.append("Impacted by https://github.com/pytorch/pytorch/issues/49370")
+
         reasons = [rs for cond, rs in zip(conditions, reasons) if cond]
         return pytest.mark.skipif(
             *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs
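
For reference, the new marker is applied across the tests below exactly as sketched here; the test itself is hypothetical and only illustrates the combined skip condition:

from tests.helpers.runif import RunIf

@RunIf(skip_windows=True, skip_49370=True)
def test_something_with_cpu_ddp_spawn(tmpdir):
    # Skipped on Windows, and also on Python >= 3.9 with torch < 1.8,
    # where CPU ddp_spawn is affected by pytorch/pytorch#49370.
    ...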

tests/loggers/test_all.py

Lines changed: 1 addition & 1 deletion
@@ -321,8 +321,8 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
         assert pl_module.logger.experiment.something(foo="bar") is None
 
 
+@RunIf(skip_windows=True, skip_49370=True)
 @pytest.mark.parametrize("logger_class", [CometLogger, CSVLogger, MLFlowLogger, TensorBoardLogger, TestTubeLogger])
-@RunIf(skip_windows=True)
 def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class):
     """Test that loggers get replaced by dummy loggers on global rank > 0."""
     _patch_comet_atexit(monkeypatch)

tests/models/test_cpu.py

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ def validation_step(self, *args, **kwargs):
     model.unfreeze()
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_multi_cpu_model_ddp(tmpdir):
     """Make sure DDP works."""
     tutils.set_random_main_port()

tests/models/test_hooks.py

Lines changed: 1 addition & 1 deletion
@@ -866,7 +866,7 @@ def call(hook, fn, *args, **kwargs):
         limit_predict_batches=batches,
         enable_progress_bar=False,
         enable_model_summary=False,
-        reload_dataloaders_every_epoch=True,
+        reload_dataloaders_every_n_epochs=True,
     )
 
     called = []

tests/models/test_horovod.py

Lines changed: 3 additions & 3 deletions
@@ -66,7 +66,7 @@ def _run_horovod(trainer_options, on_gpu=False):
     assert exit_code == 0
 
 
-@RunIf(skip_windows=True, horovod=True)
+@RunIf(skip_windows=True, horovod=True, skip_49370=True)
 def test_horovod_cpu(tmpdir):
     """Test Horovod running multi-process on CPU."""
     trainer_options = dict(
@@ -82,7 +82,7 @@ def test_horovod_cpu(tmpdir):
     _run_horovod(trainer_options)
 
 
-@RunIf(skip_windows=True, horovod=True)
+@RunIf(skip_windows=True, horovod=True, skip_49370=True)
 def test_horovod_cpu_clip_grad_by_value(tmpdir):
     """Test Horovod running multi-process on CPU."""
     trainer_options = dict(
@@ -99,7 +99,7 @@ def test_horovod_cpu_clip_grad_by_value(tmpdir):
     _run_horovod(trainer_options)
 
 
-@RunIf(skip_windows=True, horovod=True)
+@RunIf(skip_windows=True, horovod=True, skip_49370=True)
 def test_horovod_cpu_implicit(tmpdir):
     """Test Horovod without specifying a backend, inferring from env set by `horovodrun`."""
     trainer_options = dict(

tests/plugins/test_ddp_spawn_plugin.py

Lines changed: 3 additions & 3 deletions
@@ -46,7 +46,7 @@ def get_from_queue(self, queue: torch.multiprocessing.SimpleQueue) -> None:
     return super().get_from_queue(queue)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_ddp_cpu():
     """Tests if device is set correctly when training for DDPSpawnPlugin."""
     trainer = Trainer(num_processes=2, fast_dev_run=True)
@@ -91,7 +91,7 @@ def get_from_queue(self, trainer: Trainer, queue: torch.multiprocessing.SimpleQueue) -> None:
     return super().get_from_queue(trainer, queue)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_ddp_spawn_add_get_queue(tmpdir):
     """Tests add_to_queue/get_from_queue with DDPSpawnPlugin."""
 
@@ -128,7 +128,7 @@ def on_predict_start(self) -> None:
     assert isinstance(self.trainer.model, LightningModule)
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_ddp_spawn_configure_ddp(tmpdir):
     """Tests with ddp spawn plugin."""
     trainer = Trainer(default_root_dir=tmpdir, num_processes=2, strategy="ddp_spawn", fast_dev_run=True)

tests/profiler/test_profiler.py

Lines changed: 2 additions & 1 deletion
@@ -161,7 +161,7 @@ def test_simple_profiler_with_nonexisting_dirpath(tmpdir):
     assert nonexisting_tmpdir.join("fit-profiler.txt").exists()
 
 
-@RunIf(skip_windows=True)
+@RunIf(skip_windows=True, skip_49370=True)
 def test_simple_profiler_distributed_files(tmpdir):
     """Ensure the proper files are saved in distributed."""
     profiler = SimpleProfiler(dirpath=tmpdir, filename="profiler")
@@ -226,6 +226,7 @@ def test_advanced_profiler_iterable_durations(advanced_profiler, action: str, expected_total_duration):
     np.testing.assert_allclose(recored_total_duration, expected_total_duration, rtol=0.2)
 
 
+@pytest.mark.flaky(reruns=3)
 def test_advanced_profiler_overhead(advanced_profiler, n_iter=5):
     """ensure that the profiler doesn't introduce too much overhead during training."""
     for _ in range(n_iter):
