From 4f483720290d88ba5f93717a230c518fde10775a Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 1 Dec 2021 21:13:15 +0530 Subject: [PATCH 1/9] disable eval dataloader replacement --- pytorch_lightning/trainer/data_loading.py | 23 +-- tests/trainer/flags/test_overfit_batches.py | 129 ++++++++++++++- tests/trainer/test_data_loading.py | 16 ++ tests/trainer/test_trainer_tricks.py | 174 -------------------- 4 files changed, 148 insertions(+), 194 deletions(-) delete mode 100644 tests/trainer/test_trainer_tricks.py diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 455c2719b124a..759b08243a522 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -14,7 +14,6 @@ import multiprocessing import os from abc import ABC -from copy import deepcopy from typing import Any, Callable, Collection, List, Optional, Tuple, Union from torch.utils.data import DataLoader, RandomSampler, Sampler, SequentialSampler @@ -293,28 +292,14 @@ def _reset_eval_dataloader( if not isinstance(dataloaders, list): dataloaders = [dataloaders] - # when overfitting, use the training loader as val and test - # duplicate it the numb of times needed to match the train loaders - if self.overfit_batches > 0: - train_dataloader = self.request_dataloader(RunningStage.TRAINING, model=model) - dataloaders = [deepcopy(train_dataloader) for _ in range(len(dataloaders))] - for loader_i in range(len(dataloaders)): loader = dataloaders[loader_i] if hasattr(loader, "sampler") and not isinstance(loader.sampler, SequentialSampler): - # when overfitting, the dataloader should not have sampler - if self.overfit_batches > 0 and mode.evaluating: - rank_zero_warn( - "You requested to overfit but enabled val/test dataloader shuffling." - " We are turning it off for you." - ) - dataloaders[loader_i] = _update_dataloader(loader, SequentialSampler(loader.dataset), mode=mode) - else: - rank_zero_warn( - f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," - "it is strongly recommended that you turn this off for val/test/predict dataloaders." - ) + rank_zero_warn( + f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," + "it is strongly recommended that you turn this off for val/test/predict dataloaders." + ) if any(dl is None for dl in dataloaders): rank_zero_warn("One of given dataloaders is None and it will be skipped.") diff --git a/tests/trainer/flags/test_overfit_batches.py b/tests/trainer/flags/test_overfit_batches.py index d4c35925abe89..750d4f6c29c28 100644 --- a/tests/trainer/flags/test_overfit_batches.py +++ b/tests/trainer/flags/test_overfit_batches.py @@ -13,9 +13,11 @@ # limitations under the License. 
import pytest import torch -from torch.utils.data.sampler import Sampler, SequentialSampler +from torch.utils.data import DataLoader, RandomSampler, Sampler, SequentialSampler from pytorch_lightning import Trainer +from pytorch_lightning.trainer.states import RunningStage +from tests.base.model_template import EvalModelTemplate from tests.helpers.boring_model import BoringModel, RandomDataset @@ -62,3 +64,128 @@ def train_dataloader(self): trainer.fit(model) assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) + + +def test_overfit_batch_limits(tmpdir): + # ------------------------------------------------------ + # Make sure shuffle is correct across loaders initially + # ------------------------------------------------------ + model = EvalModelTemplate() + model.train_dataloader() + + # original train loader which should be replaced in all methods + train_loader = model.train_dataloader() + + # make sure the val and tests are not shuffled + assert isinstance(train_loader.sampler, RandomSampler) + assert isinstance(model.val_dataloader().sampler, SequentialSampler) + assert isinstance(model.test_dataloader().sampler, SequentialSampler) + + # ------------------------------------------------------ + # get the training loader and batch + # ------------------------------------------------------ + # Create a reference train dataloader without shuffling. + train_loader = DataLoader(model.train_dataloader().dataset, shuffle=False) + (xa, ya) = next(iter(train_loader)) + train_loader = DataLoader(model.train_dataloader().dataset, shuffle=True) + full_train_samples = len(train_loader) + num_train_samples = int(0.11 * full_train_samples) + + # ------------------------------------------------------ + # set VAL and Test loaders + # ------------------------------------------------------ + val_loader = DataLoader(model.val_dataloader().dataset, shuffle=False) + test_loader = DataLoader(model.test_dataloader().dataset, shuffle=False) + + # set the model loaders + model.train_dataloader = lambda: train_loader + model.val_dataloader = lambda: val_loader + model.test_dataloader = lambda: test_loader + + # ------------------------------------------------------ + # test train loader applies correct limits + # ------------------------------------------------------ + trainer = Trainer(overfit_batches=4) + model.trainer = trainer + trainer._data_connector.attach_dataloaders(model=model) + trainer.reset_train_dataloader(model) + assert trainer.num_training_batches == 4 + + # make sure the loaders are the same + (xb, yb) = next(iter(trainer.train_dataloader)) + assert torch.eq(xa, xb).all() + assert torch.eq(ya, yb).all() + + trainer = Trainer(overfit_batches=0.11) + model.trainer = trainer + trainer._data_connector.attach_dataloaders(model=model) + trainer.reset_train_dataloader(model) + # The dataloader should have been overwritten with a Sequential sampler. 
+ assert trainer.train_dataloader is not train_loader + assert trainer.num_training_batches == num_train_samples + + # make sure the loaders are the same + (xb, yb) = next(iter(trainer.train_dataloader)) + assert torch.eq(xa, xb).all() + assert torch.eq(ya, yb).all() + + # ------------------------------------------------------ + # run tests for both val and test + # ------------------------------------------------------ + for split in (RunningStage.VALIDATING, RunningStage.TESTING): + + # ------------------------------------------------------ + # test overfit_batches as percent + # ------------------------------------------------------ + trainer = Trainer(overfit_batches=0.11) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, _ = trainer._reset_eval_dataloader(split, model=model) + if split == RunningStage.VALIDATING: + assert loader_num_batches[0] == 0 + else: + assert loader_num_batches[0] == len(test_loader) + + # ------------------------------------------------------ + # test overfit_batches as int + # ------------------------------------------------------ + trainer = Trainer(overfit_batches=1) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) + if split == RunningStage.VALIDATING: + assert loader_num_batches[0] == 0 + else: + assert loader_num_batches[0] == len(test_loader) + # make sure we turned off shuffle for the user + assert isinstance(dataloaders[0].sampler, SequentialSampler) + + trainer = Trainer(overfit_batches=5) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, _ = trainer._reset_eval_dataloader(split, model=model) + if split == RunningStage.VALIDATING: + assert loader_num_batches[0] == 0 + else: + assert loader_num_batches[0] == len(test_loader) + + # ------------------------------------------------------ + # test limit_xxx_batches as percent AND int + # ------------------------------------------------------ + if split == RunningStage.VALIDATING: + trainer = Trainer(limit_val_batches=0.1) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) + assert loader_num_batches[0] == int(0.1 * len(val_loader)) + + trainer = Trainer(limit_val_batches=10) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) + assert loader_num_batches[0] == 10 + else: + trainer = Trainer(limit_test_batches=0.1) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) + assert loader_num_batches[0] == int(0.1 * len(test_loader)) + + trainer = Trainer(limit_test_batches=10) + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) + assert loader_num_batches[0] == 10 diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 139334cbe6f06..36261d81df5d6 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -335,3 +335,19 @@ def test_pre_made_batches(): loader = DataLoader(RandomDataset(32, 10), batch_size=None) trainer = Trainer(fast_dev_run=1) trainer.predict(LoaderTestModel(), loader) + + +@RunIf(skip_windows=True) +def test_distributed_sampler_with_overfit_batches(tmpdir): + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + overfit_batches=1, + 
fast_dev_run=1, + strategy="ddp_find_unused_parameters_false", + num_processes=1, + ) + trainer.fit(model) + train_sampler = trainer.train_dataloader.loaders.sampler + assert isinstance(train_sampler, DistributedSampler) + assert train_sampler.shuffle is False diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py deleted file mode 100644 index adf61d95bba52..0000000000000 --- a/tests/trainer/test_trainer_tricks.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler - -from pytorch_lightning import Trainer -from pytorch_lightning.trainer.states import RunningStage -from tests.base import EvalModelTemplate - - -def test_num_training_batches(tmpdir): - """Tests that the correct number of batches are allocated.""" - # when we have fewer batches in the dataloader we should use those instead of the limit - model = EvalModelTemplate() - trainer = Trainer(limit_val_batches=100, limit_train_batches=100, max_epochs=1, default_root_dir=tmpdir) - trainer.fit(model) - - assert len(model.train_dataloader()) == 10 - assert len(model.val_dataloader()) == 10 - assert isinstance(trainer.num_val_batches, list) - assert trainer.num_val_batches[0] == 10 - assert trainer.num_training_batches == 10 - - # when we have more batches in the dataloader we should limit them - model = EvalModelTemplate() - trainer = Trainer(limit_val_batches=7, limit_train_batches=7, max_epochs=1, default_root_dir=tmpdir) - trainer.fit(model) - - assert len(model.train_dataloader()) == 10 - assert len(model.val_dataloader()) == 10 - assert isinstance(trainer.num_val_batches, list) - assert trainer.num_val_batches[0] == 7 - assert trainer.num_training_batches == 7 - - -def test_overfit_batch_limits(tmpdir): - # ------------------------------------------------------ - # Make sure shuffle is correct across loaders initially - # ------------------------------------------------------ - model = EvalModelTemplate() - model.train_dataloader() - - # original train loader which should be replaced in all methods - train_loader = model.train_dataloader() - - # make sure the val and tests are not shuffled - assert isinstance(train_loader.sampler, RandomSampler) - assert isinstance(model.val_dataloader().sampler, SequentialSampler) - assert isinstance(model.test_dataloader().sampler, SequentialSampler) - - # ------------------------------------------------------ - # get the training loader and batch - # ------------------------------------------------------ - # Create a reference train dataloader without shuffling. 
- train_loader = DataLoader(model.train_dataloader().dataset, shuffle=False) - (xa, ya) = next(iter(train_loader)) - train_loader = DataLoader(model.train_dataloader().dataset, shuffle=True) - full_train_samples = len(train_loader) - num_train_samples = int(0.11 * full_train_samples) - - # ------------------------------------------------------ - # set VAL and Test loaders - # ------------------------------------------------------ - val_loader = DataLoader(model.val_dataloader().dataset, shuffle=False) - test_loader = DataLoader(model.test_dataloader().dataset, shuffle=False) - - # set the model loaders - model.train_dataloader = lambda: train_loader - model.val_dataloader = lambda: val_loader - model.test_dataloader = lambda: test_loader - - # ------------------------------------------------------ - # test train loader applies correct limits - # ------------------------------------------------------ - trainer = Trainer(overfit_batches=4) - model.trainer = trainer - trainer._data_connector.attach_dataloaders(model=model) - trainer.reset_train_dataloader(model) - assert trainer.num_training_batches == 4 - - # make sure the loaders are the same - (xb, yb) = next(iter(trainer.train_dataloader)) - assert torch.eq(xa, xb).all() - assert torch.eq(ya, yb).all() - - trainer = Trainer(overfit_batches=0.11) - model.trainer = trainer - trainer._data_connector.attach_dataloaders(model=model) - trainer.reset_train_dataloader(model) - # The dataloader should have been overwritten with a Sequential sampler. - assert trainer.train_dataloader is not train_loader - assert trainer.num_training_batches == num_train_samples - - # make sure the loaders are the same - (xb, yb) = next(iter(trainer.train_dataloader)) - assert torch.eq(xa, xb).all() - assert torch.eq(ya, yb).all() - - # ------------------------------------------------------ - # run tests for both val and test - # ------------------------------------------------------ - for split in (RunningStage.VALIDATING, RunningStage.TESTING): - - # ------------------------------------------------------ - # test overfit_batches as percent - # ------------------------------------------------------ - trainer = Trainer(overfit_batches=0.11) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, _ = trainer._reset_eval_dataloader(split, model=model) - if split == RunningStage.VALIDATING: - assert loader_num_batches[0] == 0 - else: - assert loader_num_batches[0] == len(test_loader) - - # ------------------------------------------------------ - # test overfit_batches as int - # ------------------------------------------------------ - trainer = Trainer(overfit_batches=1) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - if split == RunningStage.VALIDATING: - assert loader_num_batches[0] == 0 - else: - assert loader_num_batches[0] == len(test_loader) - # make sure we turned off shuffle for the user - assert isinstance(dataloaders[0].sampler, SequentialSampler) - - # make sure the loaders are the same - (xb, yb) = next(iter(dataloaders[0])) - assert torch.eq(xa, xb).all() - assert torch.eq(ya, yb).all() - - trainer = Trainer(overfit_batches=5) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, _ = trainer._reset_eval_dataloader(split, model=model) - if split == RunningStage.VALIDATING: - assert loader_num_batches[0] == 0 - else: - assert loader_num_batches[0] == len(test_loader) - - # ------------------------------------------------------ - 
# test limit_xxx_batches as percent AND int - # ------------------------------------------------------ - if split == RunningStage.VALIDATING: - trainer = Trainer(limit_val_batches=0.1) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == int(0.1 * len(val_loader)) - - trainer = Trainer(limit_val_batches=10) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == 10 - else: - trainer = Trainer(limit_test_batches=0.1) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == int(0.1 * len(test_loader)) - - trainer = Trainer(limit_test_batches=10) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == 10 From 2c44c2b2e339273342651cf38f4c678b0783a5a5 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 1 Dec 2021 22:20:28 +0530 Subject: [PATCH 2/9] update docs --- docs/source/common/debugging.rst | 4 ++-- docs/source/common/trainer.rst | 4 ++-- docs/source/guides/speed.rst | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/common/debugging.rst b/docs/source/common/debugging.rst index a28bc299d0e95..ea0d286fc0abb 100644 --- a/docs/source/common/debugging.rst +++ b/docs/source/common/debugging.rst @@ -79,10 +79,10 @@ argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) .. testcode:: - # use only 1% of training data (and use the same training dataloader (with shuffle off) in val and test) + # use only 1% of training data (and turn-off validation) trainer = Trainer(overfit_batches=0.01) - # similar, but with a fixed 10 batches no matter the size of the dataset + # similar, but with a fixed 10 batches trainer = Trainer(overfit_batches=10) With this flag, the train, val, and test sets will all be the same train set. We will also replace the sampler diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index e79b5e90484d0..a17167cbf14b5 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1074,7 +1074,7 @@ overfit_batches | -Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. +Uses this much data of the training set. If nonzero, will turn-off validation. If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. Useful for quickly debugging or trying to overfit on purpose. @@ -1084,7 +1084,7 @@ Useful for quickly debugging or trying to overfit on purpose. 
# default used by the Trainer trainer = Trainer(overfit_batches=0.0) - # use only 1% of the train set (and use the train set for val and test) + # use only 1% of the train set trainer = Trainer(overfit_batches=0.01) # overfit on 10 of the same batches diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst index 3b5a9baaeec9d..ed8bbd805df1d 100644 --- a/docs/source/guides/speed.rst +++ b/docs/source/guides/speed.rst @@ -336,7 +336,7 @@ If you don't want to check 100% of the training/validation/test set set these fl If you also pass ``shuffle=True`` to the dataloader, a different random subset of your dataset will be used for each epoch; otherwise the same subset will be used for all epochs. -.. note:: ``limit_train_batches``, ``limit_val_batches`` and ``limit_test_batches`` will be overwritten by ``overfit_batches`` if ``overfit_batches`` > 0. ``limit_val_batches`` will be ignored if ``fast_dev_run=True``. +.. note:: ``limit_train_batches`` will be overwritten by ``overfit_batches`` if ``overfit_batches > 0`` and will turn-off validation. .. note:: If you set ``limit_val_batches=0``, validation will be disabled. From 4df09a10ed1f90e3761c63060d753f4bedb48f30 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 1 Dec 2021 23:14:02 +0530 Subject: [PATCH 3/9] rm EvalModelTemplate --- tests/trainer/flags/test_limit_batches.py | 75 +++++++++++++++++++++ tests/trainer/flags/test_overfit_batches.py | 44 +++--------- 2 files changed, 85 insertions(+), 34 deletions(-) create mode 100644 tests/trainer/flags/test_limit_batches.py diff --git a/tests/trainer/flags/test_limit_batches.py b/tests/trainer/flags/test_limit_batches.py new file mode 100644 index 0000000000000..e5b07b2cb395c --- /dev/null +++ b/tests/trainer/flags/test_limit_batches.py @@ -0,0 +1,75 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from pytorch_lightning import Trainer +from pytorch_lightning.trainer.states import RunningStage +from tests.helpers.boring_model import BoringModel + + +def test_num_dataloader_batches(tmpdir): + """Tests that the correct number of batches are allocated.""" + # when we have fewer batches in the dataloader we should use those instead of the limit + model = BoringModel() + trainer = Trainer(limit_val_batches=100, limit_train_batches=100, max_epochs=1, default_root_dir=tmpdir) + trainer.fit(model) + + assert len(model.train_dataloader()) == 64 + assert len(model.val_dataloader()) == 64 + assert isinstance(trainer.num_val_batches, list) + assert trainer.num_val_batches[0] == 64 + assert trainer.num_training_batches == 64 + + # when we have more batches in the dataloader we should limit them + model = BoringModel() + trainer = Trainer(limit_val_batches=7, limit_train_batches=7, max_epochs=1, default_root_dir=tmpdir) + trainer.fit(model) + + assert len(model.train_dataloader()) == 64 + assert len(model.val_dataloader()) == 64 + assert isinstance(trainer.num_val_batches, list) + assert trainer.num_val_batches[0] == 7 + assert trainer.num_training_batches == 7 + + +@pytest.mark.parametrize( + ["stage", "mode"], + [ + (RunningStage.VALIDATING, "val"), + (RunningStage.TESTING, "test"), + (RunningStage.PREDICTING, "predict"), + ], +) +def test_eval_limit_batches(stage, mode): + limit_eval_batches = f"limit_{mode}_batches" + dl_hook = f"{mode}_dataloader" + model = BoringModel() + eval_loader = getattr(model, dl_hook)() + + limit_batches = 0.1 + trainer = Trainer(**{limit_eval_batches: limit_batches}) + model.trainer = trainer + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(stage, model=model) + assert loader_num_batches[0] == int(limit_batches * len(eval_loader)) + assert len(dataloaders[0]) == len(eval_loader) + + limit_batches = 10 + trainer = Trainer(**{limit_eval_batches: limit_batches}) + model.trainer = trainer + trainer._data_connector.attach_dataloaders(model) + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(stage, model=model) + assert loader_num_batches[0] == limit_batches + assert len(dataloaders[0]) == len(eval_loader) diff --git a/tests/trainer/flags/test_overfit_batches.py b/tests/trainer/flags/test_overfit_batches.py index 750d4f6c29c28..315978399b864 100644 --- a/tests/trainer/flags/test_overfit_batches.py +++ b/tests/trainer/flags/test_overfit_batches.py @@ -15,9 +15,9 @@ import torch from torch.utils.data import DataLoader, RandomSampler, Sampler, SequentialSampler +from legacy.simple_classif_training import ClassifDataModule, ClassificationModel from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import RunningStage -from tests.base.model_template import EvalModelTemplate from tests.helpers.boring_model import BoringModel, RandomDataset @@ -70,32 +70,32 @@ def test_overfit_batch_limits(tmpdir): # ------------------------------------------------------ # Make sure shuffle is correct across loaders initially # ------------------------------------------------------ - model = EvalModelTemplate() - model.train_dataloader() + model = ClassificationModel() + dm = ClassifDataModule() # original train loader which should be replaced in all methods - train_loader = model.train_dataloader() + train_loader = dm.train_dataloader() # make sure the val and tests are not shuffled assert isinstance(train_loader.sampler, RandomSampler) - assert 
isinstance(model.val_dataloader().sampler, SequentialSampler) - assert isinstance(model.test_dataloader().sampler, SequentialSampler) + assert isinstance(dm.val_dataloader().sampler, SequentialSampler) + assert isinstance(dm.test_dataloader().sampler, SequentialSampler) # ------------------------------------------------------ # get the training loader and batch # ------------------------------------------------------ # Create a reference train dataloader without shuffling. - train_loader = DataLoader(model.train_dataloader().dataset, shuffle=False) + train_loader = DataLoader(dm.train_dataloader().dataset, shuffle=False) (xa, ya) = next(iter(train_loader)) - train_loader = DataLoader(model.train_dataloader().dataset, shuffle=True) + train_loader = DataLoader(dm.train_dataloader().dataset, shuffle=True) full_train_samples = len(train_loader) num_train_samples = int(0.11 * full_train_samples) # ------------------------------------------------------ # set VAL and Test loaders # ------------------------------------------------------ - val_loader = DataLoader(model.val_dataloader().dataset, shuffle=False) - test_loader = DataLoader(model.test_dataloader().dataset, shuffle=False) + val_loader = DataLoader(dm.val_dataloader().dataset, shuffle=False) + test_loader = DataLoader(dm.test_dataloader().dataset, shuffle=False) # set the model loaders model.train_dataloader = lambda: train_loader @@ -165,27 +165,3 @@ def test_overfit_batch_limits(tmpdir): assert loader_num_batches[0] == 0 else: assert loader_num_batches[0] == len(test_loader) - - # ------------------------------------------------------ - # test limit_xxx_batches as percent AND int - # ------------------------------------------------------ - if split == RunningStage.VALIDATING: - trainer = Trainer(limit_val_batches=0.1) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == int(0.1 * len(val_loader)) - - trainer = Trainer(limit_val_batches=10) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == 10 - else: - trainer = Trainer(limit_test_batches=0.1) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == int(0.1 * len(test_loader)) - - trainer = Trainer(limit_test_batches=10) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - assert loader_num_batches[0] == 10 From c719cbb3f26ba5c96bca2fe2782b404883e5abd2 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 2 Dec 2021 20:31:27 +0530 Subject: [PATCH 4/9] address reviews --- pytorch_lightning/trainer/data_loading.py | 3 ++- tests/trainer/test_data_loading.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 759b08243a522..8defafaadcd53 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -298,7 +298,8 @@ def _reset_eval_dataloader( if hasattr(loader, "sampler") and not isinstance(loader.sampler, SequentialSampler): rank_zero_warn( f"Your `{mode.dataloader_prefix}_dataloader` has `shuffle=True`," - "it is strongly recommended that you turn this off for val/test/predict dataloaders." 
+ " it is strongly recommended that you turn this off for val/test/predict dataloaders.", + category=PossibleUserWarning, ) if any(dl is None for dl in dataloaders): diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 36261d81df5d6..a11c7395a55f1 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -344,8 +344,8 @@ def test_distributed_sampler_with_overfit_batches(tmpdir): default_root_dir=tmpdir, overfit_batches=1, fast_dev_run=1, - strategy="ddp_find_unused_parameters_false", - num_processes=1, + strategy="ddp", + num_processes=2, ) trainer.fit(model) train_sampler = trainer.train_dataloader.loaders.sampler From 21b3d3e1bd5ddcb7c37e630addd70b3bb84d8631 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 2 Dec 2021 20:32:21 +0530 Subject: [PATCH 5/9] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- docs/source/common/debugging.rst | 2 +- docs/source/common/trainer.rst | 2 +- docs/source/guides/speed.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/common/debugging.rst b/docs/source/common/debugging.rst index ea0d286fc0abb..e8685413c55e3 100644 --- a/docs/source/common/debugging.rst +++ b/docs/source/common/debugging.rst @@ -79,7 +79,7 @@ argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) .. testcode:: - # use only 1% of training data (and turn-off validation) + # use only 1% of training data (and turn off validation) trainer = Trainer(overfit_batches=0.01) # similar, but with a fixed 10 batches diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index a17167cbf14b5..2fe66299880aa 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1074,7 +1074,7 @@ overfit_batches | -Uses this much data of the training set. If nonzero, will turn-off validation. +Uses this much data of the training set. If nonzero, will turn off validation. If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. Useful for quickly debugging or trying to overfit on purpose. diff --git a/docs/source/guides/speed.rst b/docs/source/guides/speed.rst index ed8bbd805df1d..572da420d4280 100644 --- a/docs/source/guides/speed.rst +++ b/docs/source/guides/speed.rst @@ -336,7 +336,7 @@ If you don't want to check 100% of the training/validation/test set set these fl If you also pass ``shuffle=True`` to the dataloader, a different random subset of your dataset will be used for each epoch; otherwise the same subset will be used for all epochs. -.. note:: ``limit_train_batches`` will be overwritten by ``overfit_batches`` if ``overfit_batches > 0`` and will turn-off validation. +.. note:: ``limit_train_batches`` will be overwritten by ``overfit_batches`` if ``overfit_batches > 0`` and will turn off validation. .. note:: If you set ``limit_val_batches=0``, validation will be disabled. 
From 66fc21fe219a68870b5d4e3c2a9fd82781e61703 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 2 Dec 2021 23:08:45 +0530 Subject: [PATCH 6/9] Apply suggestions from code review --- tests/trainer/test_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index a11c7395a55f1..02c434cdb0718 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -344,7 +344,7 @@ def test_distributed_sampler_with_overfit_batches(tmpdir): default_root_dir=tmpdir, overfit_batches=1, fast_dev_run=1, - strategy="ddp", + strategy="ddp_spawn", num_processes=2, ) trainer.fit(model) From 1074612f4ab24ca38fb86be47afd64185b85faab Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 2 Dec 2021 23:45:54 +0530 Subject: [PATCH 7/9] update test --- tests/trainer/test_data_loading.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 02c434cdb0718..5021352c6aefb 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -347,7 +347,10 @@ def test_distributed_sampler_with_overfit_batches(tmpdir): strategy="ddp_spawn", num_processes=2, ) - trainer.fit(model) + model.trainer = trainer + trainer.model = model + trainer._data_connector.attach_dataloaders(model) + trainer.reset_train_dataloader() train_sampler = trainer.train_dataloader.loaders.sampler assert isinstance(train_sampler, DistributedSampler) assert train_sampler.shuffle is False From 832fdf1008e4b16304b12787e35314369f026988 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 2 Dec 2021 23:47:17 +0530 Subject: [PATCH 8/9] cleanup --- tests/trainer/test_data_loading.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 5021352c6aefb..6fc9d6756f3cb 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -338,12 +338,10 @@ def test_pre_made_batches(): @RunIf(skip_windows=True) -def test_distributed_sampler_with_overfit_batches(tmpdir): +def test_distributed_sampler_with_overfit_batches(): model = BoringModel() trainer = Trainer( - default_root_dir=tmpdir, overfit_batches=1, - fast_dev_run=1, strategy="ddp_spawn", num_processes=2, ) From 33e78025b6741eb1d9a4fb4e41d405b38383a1e3 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Fri, 3 Dec 2021 16:31:24 +0530 Subject: [PATCH 9/9] address reviews --- legacy/simple_classif_training.py | 8 ++ tests/trainer/flags/test_limit_batches.py | 15 +-- tests/trainer/flags/test_overfit_batches.py | 118 ++++++++------------ tests/trainer/test_data_loading.py | 20 +--- 4 files changed, 58 insertions(+), 103 deletions(-) diff --git a/legacy/simple_classif_training.py b/legacy/simple_classif_training.py index 5189d0c1819b6..39362e9ef58ce 100644 --- a/legacy/simple_classif_training.py +++ b/legacy/simple_classif_training.py @@ -55,6 +55,9 @@ def _split_data(self): self.x_train, self.x_test, self.y_train, self.y_test = train_test_split( self._x, self._y, test_size=0.20, random_state=42 ) + self.x_train, self.x_predict, self.y_train, self.y_predict = train_test_split( + self._x, self._y, test_size=0.20, random_state=42 + ) self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split( self.x_train, self.y_train, test_size=0.40, random_state=42 ) @@ -76,6 +79,11 @@ def test_dataloader(self): SklearnDataset(self.x_test, self.y_test, self._x_type, 
self._y_type), batch_size=self.batch_size ) + def predict_dataloader(self): + return DataLoader( + SklearnDataset(self.x_predict, self.y_predict, self._x_type, self._y_type), batch_size=self.batch_size + ) + class ClassifDataModule(SklearnDataModule): def __init__(self, num_features=24, length=6000, num_classes=3, batch_size=128): diff --git a/tests/trainer/flags/test_limit_batches.py b/tests/trainer/flags/test_limit_batches.py index e5b07b2cb395c..99c3f6a4daf42 100644 --- a/tests/trainer/flags/test_limit_batches.py +++ b/tests/trainer/flags/test_limit_batches.py @@ -52,24 +52,17 @@ def test_num_dataloader_batches(tmpdir): (RunningStage.PREDICTING, "predict"), ], ) -def test_eval_limit_batches(stage, mode): +@pytest.mark.parametrize("limit_batches", [0.1, 10]) +def test_eval_limit_batches(stage, mode, limit_batches): limit_eval_batches = f"limit_{mode}_batches" dl_hook = f"{mode}_dataloader" model = BoringModel() eval_loader = getattr(model, dl_hook)() - limit_batches = 0.1 trainer = Trainer(**{limit_eval_batches: limit_batches}) model.trainer = trainer trainer._data_connector.attach_dataloaders(model) loader_num_batches, dataloaders = trainer._reset_eval_dataloader(stage, model=model) - assert loader_num_batches[0] == int(limit_batches * len(eval_loader)) - assert len(dataloaders[0]) == len(eval_loader) - - limit_batches = 10 - trainer = Trainer(**{limit_eval_batches: limit_batches}) - model.trainer = trainer - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(stage, model=model) - assert loader_num_batches[0] == limit_batches + expected_batches = int(limit_batches * len(eval_loader)) if isinstance(limit_batches, float) else limit_batches + assert loader_num_batches[0] == expected_batches assert len(dataloaders[0]) == len(eval_loader) diff --git a/tests/trainer/flags/test_overfit_batches.py b/tests/trainer/flags/test_overfit_batches.py index 315978399b864..9e9bbe9cb06e2 100644 --- a/tests/trainer/flags/test_overfit_batches.py +++ b/tests/trainer/flags/test_overfit_batches.py @@ -13,12 +13,13 @@ # limitations under the License. 
import pytest import torch -from torch.utils.data import DataLoader, RandomSampler, Sampler, SequentialSampler +from torch.utils.data import DataLoader, DistributedSampler, RandomSampler, Sampler, SequentialSampler from legacy.simple_classif_training import ClassifDataModule, ClassificationModel from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import RunningStage from tests.helpers.boring_model import BoringModel, RandomDataset +from tests.helpers.runif import RunIf @pytest.mark.parametrize("overfit_batches", [1, 2, 0.1, 0.25, 1.0]) @@ -66,102 +67,73 @@ def train_dataloader(self): assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler) -def test_overfit_batch_limits(tmpdir): - # ------------------------------------------------------ - # Make sure shuffle is correct across loaders initially - # ------------------------------------------------------ +@pytest.mark.parametrize( + "stage,mode", + [(RunningStage.VALIDATING, "val"), (RunningStage.TESTING, "test"), (RunningStage.PREDICTING, "predict")], +) +@pytest.mark.parametrize("overfit_batches", [0.11, 4]) +def test_overfit_batch_limits_eval(stage, mode, overfit_batches): + model = ClassificationModel() + dm = ClassifDataModule() + eval_loader = getattr(dm, f"{mode}_dataloader")() + trainer = Trainer(overfit_batches=overfit_batches) + model.trainer = trainer + trainer._data_connector.attach_datamodule(model, datamodule=dm) + + loader_num_batches, dataloaders = trainer._reset_eval_dataloader(stage, model=model) + if stage == RunningStage.VALIDATING: + assert loader_num_batches[0] == 0 + else: + assert loader_num_batches[0] == len(eval_loader) + assert isinstance(dataloaders[0].sampler, SequentialSampler) + + +@pytest.mark.parametrize("overfit_batches", [0.11, 4]) +def test_overfit_batch_limits_train(overfit_batches): model = ClassificationModel() dm = ClassifDataModule() # original train loader which should be replaced in all methods train_loader = dm.train_dataloader() - - # make sure the val and tests are not shuffled assert isinstance(train_loader.sampler, RandomSampler) - assert isinstance(dm.val_dataloader().sampler, SequentialSampler) - assert isinstance(dm.test_dataloader().sampler, SequentialSampler) - # ------------------------------------------------------ - # get the training loader and batch - # ------------------------------------------------------ # Create a reference train dataloader without shuffling. 
train_loader = DataLoader(dm.train_dataloader().dataset, shuffle=False) (xa, ya) = next(iter(train_loader)) train_loader = DataLoader(dm.train_dataloader().dataset, shuffle=True) full_train_samples = len(train_loader) - num_train_samples = int(0.11 * full_train_samples) - - # ------------------------------------------------------ - # set VAL and Test loaders - # ------------------------------------------------------ - val_loader = DataLoader(dm.val_dataloader().dataset, shuffle=False) - test_loader = DataLoader(dm.test_dataloader().dataset, shuffle=False) # set the model loaders model.train_dataloader = lambda: train_loader - model.val_dataloader = lambda: val_loader - model.test_dataloader = lambda: test_loader - # ------------------------------------------------------ # test train loader applies correct limits - # ------------------------------------------------------ - trainer = Trainer(overfit_batches=4) + trainer = Trainer(overfit_batches=overfit_batches) model.trainer = trainer trainer._data_connector.attach_dataloaders(model=model) trainer.reset_train_dataloader(model) - assert trainer.num_training_batches == 4 + expected_batches = ( + int(overfit_batches * full_train_samples) if isinstance(overfit_batches, float) else overfit_batches + ) + assert trainer.num_training_batches == expected_batches # make sure the loaders are the same (xb, yb) = next(iter(trainer.train_dataloader)) assert torch.eq(xa, xb).all() assert torch.eq(ya, yb).all() - trainer = Trainer(overfit_batches=0.11) - model.trainer = trainer - trainer._data_connector.attach_dataloaders(model=model) - trainer.reset_train_dataloader(model) - # The dataloader should have been overwritten with a Sequential sampler. - assert trainer.train_dataloader is not train_loader - assert trainer.num_training_batches == num_train_samples - - # make sure the loaders are the same - (xb, yb) = next(iter(trainer.train_dataloader)) - assert torch.eq(xa, xb).all() - assert torch.eq(ya, yb).all() - # ------------------------------------------------------ - # run tests for both val and test - # ------------------------------------------------------ - for split in (RunningStage.VALIDATING, RunningStage.TESTING): - - # ------------------------------------------------------ - # test overfit_batches as percent - # ------------------------------------------------------ - trainer = Trainer(overfit_batches=0.11) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, _ = trainer._reset_eval_dataloader(split, model=model) - if split == RunningStage.VALIDATING: - assert loader_num_batches[0] == 0 - else: - assert loader_num_batches[0] == len(test_loader) - - # ------------------------------------------------------ - # test overfit_batches as int - # ------------------------------------------------------ - trainer = Trainer(overfit_batches=1) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model) - if split == RunningStage.VALIDATING: - assert loader_num_batches[0] == 0 - else: - assert loader_num_batches[0] == len(test_loader) - # make sure we turned off shuffle for the user - assert isinstance(dataloaders[0].sampler, SequentialSampler) - - trainer = Trainer(overfit_batches=5) - trainer._data_connector.attach_dataloaders(model) - loader_num_batches, _ = trainer._reset_eval_dataloader(split, model=model) - if split == RunningStage.VALIDATING: - assert loader_num_batches[0] == 0 - else: - assert loader_num_batches[0] == len(test_loader) 
+@RunIf(skip_windows=True) +def test_distributed_sampler_with_overfit_batches(): + model = BoringModel() + trainer = Trainer( + overfit_batches=1, + strategy="ddp_spawn", + num_processes=2, + ) + model.trainer = trainer + trainer.model = model + trainer._data_connector.attach_dataloaders(model) + trainer.reset_train_dataloader() + train_sampler = trainer.train_dataloader.loaders.sampler + assert isinstance(train_sampler, DistributedSampler) + assert train_sampler.shuffle is False diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 6fc9d6756f3cb..e191f681d209d 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -16,8 +16,7 @@ from re import escape import pytest -from torch.utils.data import DataLoader, DistributedSampler -from torch.utils.data.sampler import BatchSampler, Sampler, SequentialSampler +from torch.utils.data import BatchSampler, DataLoader, DistributedSampler, Sampler, SequentialSampler from pytorch_lightning import Trainer from pytorch_lightning.utilities.data import _update_dataloader @@ -335,20 +334,3 @@ def test_pre_made_batches(): loader = DataLoader(RandomDataset(32, 10), batch_size=None) trainer = Trainer(fast_dev_run=1) trainer.predict(LoaderTestModel(), loader) - - -@RunIf(skip_windows=True) -def test_distributed_sampler_with_overfit_batches(): - model = BoringModel() - trainer = Trainer( - overfit_batches=1, - strategy="ddp_spawn", - num_processes=2, - ) - model.trainer = trainer - trainer.model = model - trainer._data_connector.attach_dataloaders(model) - trainer.reset_train_dataloader() - train_sampler = trainer.train_dataloader.loaders.sampler - assert isinstance(train_sampler, DistributedSampler) - assert train_sampler.shuffle is False
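
Taken together, the tests in their final state pin down the new behavior: the train sampler is swapped for a sequential (or non-shuffling distributed) sampler, and the eval dataloaders keep their own data, with validation batch counts forced to zero. A quick local check, loosely following ``test_overfit_batches.py`` as modified above (the ``tests.helpers.boring_model`` import path and the ``trainer.train_dataloader.loaders`` attribute are specific to the Lightning version this series targets):

    from torch.utils.data import DataLoader
    from torch.utils.data.sampler import SequentialSampler

    from pytorch_lightning import Trainer
    from tests.helpers.boring_model import BoringModel, RandomDataset


    class OverfitModel(BoringModel):
        def train_dataloader(self):
            # shuffle=True would normally produce a RandomSampler
            return DataLoader(RandomDataset(32, 64), batch_size=2, shuffle=True)


    model = OverfitModel()
    trainer = Trainer(overfit_batches=2, max_epochs=1)
    trainer.fit(model)

    # when overfitting, Lightning replaces the random sampler with a sequential one
    assert isinstance(trainer.train_dataloader.loaders.sampler, SequentialSampler)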