
Commit 88a00d0

add docs and tests
1 parent 7b19a02 commit 88a00d0

File tree: 3 files changed (+104, -16 lines)


pytorch_lightning/callbacks/batch_size_finder.py

Lines changed: 46 additions & 7 deletions
@@ -48,10 +48,42 @@ def __init__(
         batch_arg_name="batch_size",
         early_exit=False,
     ):
+        """Callback that tries to find the largest batch size for a given model that does not give an out of memory
+        (OOM) error. It works with both training and evaluation. All you need to do is add it as a callback inside
+        Trainer and call ``trainer.fit/validate/test/predict()``. Internally it calls the respective step function
+        ``steps_per_trial`` times for each batch size until one of the batch sizes generates an OOM error.
 
+        Args:
+            mode: search strategy to update the batch size:
+
+                - ``'power'`` (default): Keep multiplying the batch size by 2, until we get an OOM error.
+                - ``'binsearch'``: Initially keep multiplying by 2 and after encountering an OOM error
+                  do a binary search between the last successful batch size and the batch size that failed.
+
+            steps_per_trial: number of steps to run with a given batch size.
+                Ideally 1 should be enough to test if an OOM error occurs,
+                however in practice a few are needed.
+
+            init_val: initial batch size to start the search with.
+
+            max_trials: max number of increases in batch size done before
+                the algorithm is terminated.
+
+            batch_arg_name: name of the attribute that stores the batch size.
+                It is expected that the user has provided a model or datamodule that has a hyperparameter
+                with that name. We will look for this attribute name in the following places:
+
+                - ``model``
+                - ``model.hparams``
+                - ``trainer.datamodule`` (the datamodule passed to the tune method)
+
+            early_exit: whether to continue with the training/evaluation or stop after
+                an optimal batch size has been found.
+        """
+        supported_modes = ("power", "binsearch")
         mode = mode.lower()
-        if mode not in ("power", "binsearch"):
-            raise MisconfigurationException("`mode` should be either 'power' or 'binsearch'")
+        if mode not in supported_modes:
+            raise MisconfigurationException(f"`mode` should be one of {supported_modes}")
 
         self.mode = mode
         self.steps_per_trial = steps_per_trial
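
For context, a minimal usage sketch of the callback this docstring describes. The DemoModel module here is hypothetical (any LightningModule whose dataloaders read a batch_size attribute works); the import path and constructor arguments are the ones added in this commit:

import torch
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks.batch_size_finder import BatchSizeFinder


class DemoModel(LightningModule):
    def __init__(self, batch_size=2):
        super().__init__()
        self.batch_size = batch_size  # attribute the finder will tune (matches batch_arg_name)
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch[0]).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        return DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=self.batch_size)


# double the batch size until OOM, trying at most 4 candidates, 3 steps each
finder = BatchSizeFinder(mode="power", steps_per_trial=3, max_trials=4, batch_arg_name="batch_size")
trainer = Trainer(callbacks=[finder], max_epochs=1)
model = DemoModel()
trainer.fit(model)  # validate/test/predict work the same way
print(model.batch_size)  # largest batch size that ran without OOM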
@@ -121,6 +153,10 @@ def scale_batch_size(self, trainer, pl_module):
         if fs.exists(save_path):
             fs.rm(save_path)
 
+        # global step and current epoch are incremented before being saved in the checkpoint
+        trainer.fit_loop.global_step -= 1
+        trainer.fit_loop.current_epoch -= 1
+
         self._restore_params(trainer)
 
         if trainer.progress_bar_callback:
@@ -165,7 +201,7 @@ def _run_binary_scaling(self, trainer, pl_module, new_size):
         while True:
             garbage_collection_cuda()
             try:
-                # Try fit
+                # run loop
                 self._try_loop_run(trainer)
                 count += 1
                 if count > self.max_trials:
@@ -217,7 +253,7 @@ def _try_loop_run(self, trainer):
         elif trainer.state.fn == TrainerFn.PREDICTING:
             loop = trainer.predict_loop
 
-        loop.load_state_dict(deepcopy(self._dumped_params["loop_state_dict"]))
+        loop.load_state_dict(deepcopy(self._dumped_params["loop_state_dict"]), force_load_progress=True)
         loop.run()
 
     @staticmethod
@@ -292,16 +328,16 @@ def _restore_params(self, trainer):
             loop = trainer.predict_loop
             trainer.limit_predict_batches = self._dumped_params["limit_predict_batches"]
 
-        loop.load_state_dict(deepcopy(self._dumped_params["loop_state_dict"]))
+        loop.load_state_dict(deepcopy(self._dumped_params["loop_state_dict"]), force_load_progress=True)
         if "loop_verbose" in self._dumped_params:
             loop.verbose = self._dumped_params["loop_verbose"]
 
     def pre_early_exit(self, trainer):
         if trainer.fast_dev_run:
             return
 
+        # this is required to stop the respective loops
         if trainer.state.fn == TrainerFn.FITTING:
-            trainer.should_stop = True
             self._dumped_params["num_training_batches"] = trainer.num_training_batches
             trainer.num_training_batches = 0
         elif trainer.state.fn == TrainerFn.VALIDATING:
@@ -318,6 +354,7 @@ def post_early_exit(self, trainer):
         if trainer.fast_dev_run:
             return
 
+        # restore the state used to stop the respective loop
         if trainer.state.fn == TrainerFn.FITTING:
             trainer.num_training_batches = self._dumped_params["num_training_batches"]
             loop = trainer.fit_loop
@@ -331,7 +368,7 @@ def post_early_exit(self, trainer):
             trainer.num_predict_batches = self._dumped_params["num_predict_batches"]
             loop = trainer.predict_loop
 
-        loop.load_state_dict(self._dumped_params["loop_state_dict"])
+        loop.load_state_dict(self._dumped_params["loop_state_dict"], force_load_progress=True)
         trainer.callbacks = [cb for cb in trainer.callbacks if not isinstance(cb, BatchSizeFinder)]
 
     def on_fit_start(self, trainer, pl_module):
@@ -346,6 +383,8 @@ def on_validation_start(self, trainer, pl_module):
         if trainer.sanity_checking or trainer.state.fn != TrainerFn.VALIDATING:
             return
 
+        self.scale_batch_size(trainer, pl_module)
+
         if self.early_exit:
             self.pre_early_exit(trainer)
         else:
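
For intuition, here is a standalone sketch of the two search strategies the new docstring describes. This is not the callback's actual implementation; run_trial is a hypothetical stand-in that runs steps_per_trial steps and returns True if the batch size fit in memory:

def find_batch_size(run_trial, init_val=2, max_trials=25, mode="power"):
    """Return the largest size for which run_trial(size) is True (no OOM)."""
    low, high = None, None  # last size that fit, first size that failed
    size = init_val
    for _ in range(max_trials):
        if run_trial(size):
            low = size
            if high is None:
                size *= 2                 # 'power' phase: keep doubling
            else:
                size = (low + high) // 2  # 'binsearch' phase: move back up
        else:
            if mode == "power":
                break                     # 'power' stops at the first OOM
            high = size
            size = (low + high) // 2 if low else max(size // 2, 1)
        if high is not None and high - (low or 0) <= 1:
            break
    return low

With a memory limit of 100, find_batch_size(lambda s: s <= 100, mode="power") returns 64, while mode="binsearch" narrows down to 100 for the same trial budget, which is why the docstring calls binary search out as a separate mode.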

pytorch_lightning/loops/base.py

Lines changed: 14 additions & 6 deletions
@@ -276,10 +276,9 @@ def state_dict(self, destination: Optional[Dict] = None, prefix: str = "") -> Dict:
         destination[prefix + "state_dict"] = self.on_save_checkpoint()
 
         # do not get the mode from `self.trainer` because it might not have been attached yet
-        ft_enabled = _FaultTolerantMode.detect_current_mode().is_enabled
         for k, v in self.__dict__.items():
             key = prefix + k
-            if ft_enabled and isinstance(v, BaseProgress):
+            if isinstance(v, BaseProgress):
                 destination[key] = v.state_dict()
             elif isinstance(v, Loop):
                 v.state_dict(destination, key + ".")
@@ -296,21 +295,30 @@ def load_state_dict(
         state_dict: Dict,
         prefix: str = "",
         metrics: Optional[Dict[str, Metric]] = None,
+        force_load_progress: bool = False,
     ) -> None:
         """Loads the state of this loop and all its children."""
-        self._load_from_state_dict(state_dict.copy(), prefix, metrics)
+        self._load_from_state_dict(state_dict.copy(), prefix, metrics, force_load_progress)
         for k, v in self.__dict__.items():
             if isinstance(v, Loop):
-                v.load_state_dict(state_dict.copy(), prefix + k + ".")
+                v.load_state_dict(state_dict.copy(), prefix + k + ".", force_load_progress=force_load_progress)
+
+    def _load_from_state_dict(
+        self,
+        state_dict: Dict,
+        prefix: str,
+        metrics: Optional[Dict[str, Metric]] = None,
+        force_load_progress: bool = False,
+    ) -> None:
+        load_progress = _FaultTolerantMode.detect_current_mode().is_enabled or force_load_progress
 
-    def _load_from_state_dict(self, state_dict: Dict, prefix: str, metrics: Optional[Dict[str, Metric]] = None) -> None:
         for k, v in self.__dict__.items():
             key = prefix + k
             if key not in state_dict:
                 # no state for this object, maybe we are loading an old checkpoint
                 continue
 
-            if isinstance(v, BaseProgress):
+            if load_progress and isinstance(v, BaseProgress):
                 v.load_state_dict(state_dict[key])
             elif (
                 isinstance(v, _ResultCollection)
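
The gating pattern this hunk introduces, shown in isolation: progress counters are restored only when fault tolerance is enabled or the caller forces it (as BatchSizeFinder now does). A minimal sketch with a hypothetical Progress class standing in for BaseProgress:

from typing import Dict


class Progress:
    """Hypothetical stand-in for a BaseProgress counter."""

    def __init__(self) -> None:
        self.completed = 0

    def load_state_dict(self, state: Dict) -> None:
        self.completed = state["completed"]


def load_loop_state(loop, state_dict: Dict, ft_enabled: bool, force_load_progress: bool = False) -> None:
    # progress counters are skipped unless fault tolerance is on
    # or a caller explicitly forces the load
    load_progress = ft_enabled or force_load_progress
    for key, value in vars(loop).items():
        if load_progress and isinstance(value, Progress) and key in state_dict:
            value.load_state_dict(state_dict[key])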

tests/tuner/test_scale_batch_size.py

Lines changed: 44 additions & 3 deletions
@@ -20,6 +20,7 @@
 
 import tests.helpers.utils as tutils
 from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks.batch_size_finder import BatchSizeFinder
 from pytorch_lightning.tuner.tuning import Tuner
 from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -49,6 +50,12 @@ def train_dataloader(self):
     def val_dataloader(self):
         return DataLoader(RandomDataset(32, 64), batch_size=getattr(self, "batch_size", 1))
 
+    def test_dataloader(self):
+        return DataLoader(RandomDataset(32, 64), batch_size=getattr(self, "batch_size", 1))
+
+    def predict_dataloader(self):
+        return DataLoader(RandomDataset(32, 64), batch_size=getattr(self, "batch_size", 1))
+
 
 @pytest.mark.parametrize(["model_bs", "dm_bs"], [(2, -1), (2, 2), (2, None), (None, 2), (16, 16)])
 def test_scale_batch_size_method_with_model_or_datamodule(tmpdir, model_bs, dm_bs):
@@ -133,7 +140,7 @@ def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg):
     after_batch_size = model.batch_size
     assert before_batch_size != after_batch_size, "Batch size was not altered after running auto scaling of batch size"
 
-    assert not os.path.exists(tmpdir / "scale_batch_size_temp_model.ckpt")
+    assert not any(f for f in os.listdir(tmpdir) if f.startswith(".scale_batch_size_temp_model"))
 
 
 @RunIf(min_gpus=1)
@@ -275,9 +282,9 @@ def __init__(self):
         auto_scale_batch_size="ThisModeDoesNotExist",
     )
 
-    with pytest.raises(MisconfigurationException, match="should be either 'power' or 'binsearch'"):
+    with pytest.raises(MisconfigurationException, match="should be one of"):
         trainer.tune(model)
-    with pytest.raises(MisconfigurationException, match="should be either 'power' or 'binsearch'"):
+    with pytest.raises(MisconfigurationException, match="should be one of"):
         trainer.tuner.scale_batch_size(model, mode="ThisModeDoesNotExist")
 
 
@@ -292,3 +299,37 @@ def test_dataloader_reset_with_scale_batch_size(tmpdir, scale_method):
 
     assert trainer.train_dataloader.loaders.batch_size == new_batch_size
     assert trainer.val_dataloaders[0].batch_size == new_batch_size
+
+
+@pytest.mark.parametrize("trainer_fn", ["fit", "validate", "test", "predict"])
+@pytest.mark.parametrize("early_exit", [False])
+# @pytest.mark.parametrize('early_exit', [True, False])
+def test_batch_size_finder_callback(tmpdir, trainer_fn, early_exit):
+    """Test batch size finder callback with different trainer methods."""
+    tutils.reset_seed()
+    before_batch_size = 2
+    model = BatchSizeModel(batch_size=before_batch_size)
+    batch_size_finder = BatchSizeFinder(max_trials=4, batch_arg_name="batch_size", early_exit=early_exit)
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=2, callbacks=[batch_size_finder])
+    fn = getattr(trainer, trainer_fn)
+    fn(model)
+    after_batch_size = model.batch_size
+    loop = getattr(trainer, f"{trainer_fn}_loop")
+
+    if early_exit:
+        assert trainer.global_step == 0
+        assert trainer.current_epoch == 0
+        if trainer_fn != "fit":
+            assert loop.dataloader_progress.current.completed == 0
+            assert loop.epoch_loop.batch_progress.current.completed == 0
+    else:
+        if trainer_fn == "fit":
+            assert trainer.global_step == 4
+            assert trainer.current_epoch == 1
+        else:
+            assert trainer.global_step == 0
+            assert loop.dataloader_progress.current.completed == 1
+            assert loop.epoch_loop.batch_progress.current.completed == 2
+
+    assert before_batch_size != after_batch_size, "Batch size was not altered after running auto scaling of batch size"
+    assert not any(f for f in os.listdir(tmpdir) if f.startswith(".scale_batch_size_temp_model"))
