
Commit f5e9a5a

keep tune and remove early_exit
1 parent 9c6bcc1 commit f5e9a5a

4 files changed (+116, -149 lines)

pytorch_lightning/callbacks/batch_size_finder.py (+7, -84)
@@ -29,6 +29,7 @@
 from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.loggers.base import DummyLogger
 from pytorch_lightning.trainer.states import TrainerFn
+from pytorch_lightning.tuner.tuning import _TunerExitException
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.data import has_len_all_ranks
 from pytorch_lightning.utilities.distributed import rank_zero_info
@@ -46,7 +47,6 @@ def __init__(
         init_val=2,
         max_trials=25,
         batch_arg_name="batch_size",
-        early_exit=False,
     ):
         """Callback try to find the largest batch size for a given model that does not give an out of memory (OOM)
         error. It works with both training and evalation. All you need to do is add it as a callback inside Trainer
@@ -56,7 +56,7 @@ def __init__(
         Args:
             mode: search strategy to update the batch size:

-                - ``'power'`` (default): Keep multiplying the batch size by 2, until we get an OOM error.
+                - ``'power'``: Keep multiplying the batch size by 2, until we get an OOM error.
                 - ``'binsearch'``: Initially keep multiplying by 2 and after encountering an OOM error
                     do a binary search between the last successful batch size and the batch size that failed.
@@ -76,9 +76,6 @@ def __init__(
                 - ``model``
                 - ``model.hparams``
                 - ``trainer.datamodule`` (the datamodule passed to the tune method)
-
-            early_exit: whether to continue with the training/evaluation or stop after
-                an optimal batch size has been found.
         """
         supported_modes = ("power", "binsearch")
         mode = mode.lower()
@@ -91,7 +88,8 @@ def __init__(
         self.max_trials = max_trials
         self.batch_arg_name = batch_arg_name
         self.optimal_batch_size = init_val
-        self.early_exit = early_exit
+
+        self._early_exit = False

     def scale_batch_size(self, trainer, pl_module):
         if trainer.fast_dev_run:
@@ -165,6 +163,9 @@ def scale_batch_size(self, trainer, pl_module):
             print(f"new batch size: {new_size}")
         self.optimal_batch_size = new_size

+        if self._early_exit:
+            raise _TunerExitException()
+
     def _run_power_scaling(self, trainer, pl_module, new_size):
         """Batch scaling mode where the size is doubled at each iteration until an OOM error is encountered."""
         for _ in range(self.max_trials):
@@ -332,99 +333,21 @@ def _restore_params(self, trainer):
         if "loop_verbose" in self._dumped_params:
             loop.verbose = self._dumped_params["loop_verbose"]

-    def pre_early_exit(self, trainer):
-        if trainer.fast_dev_run:
-            return
-
-        # this is required to stop the respective loops
-        if trainer.state.fn == TrainerFn.FITTING:
-            self._dumped_params["num_training_batches"] = trainer.num_training_batches
-            trainer.num_training_batches = 0
-        elif trainer.state.fn == TrainerFn.VALIDATING:
-            self._dumped_params["num_val_batches"] = trainer.num_val_batches
-            trainer.num_val_batches = [0]
-        elif trainer.state.fn == TrainerFn.TESTING:
-            self._dumped_params["num_test_batches"] = trainer.num_test_batches
-            trainer.num_test_batches = [0]
-        elif trainer.state.fn == TrainerFn.PREDICTING:
-            self._dumped_params["num_predict_batches"] = trainer.num_predict_batches
-            trainer.num_predict_batches = [0]
-
-    def post_early_exit(self, trainer):
-        if trainer.fast_dev_run:
-            return
-
-        # restore the state used to stop the respective loop
-        if trainer.state.fn == TrainerFn.FITTING:
-            trainer.num_training_batches = self._dumped_params["num_training_batches"]
-            loop = trainer.fit_loop
-        if trainer.state.fn == TrainerFn.VALIDATING:
-            trainer.num_val_batches = self._dumped_params["num_val_batches"]
-            loop = trainer.validate_loop
-        if trainer.state.fn == TrainerFn.TESTING:
-            trainer.num_test_batches = self._dumped_params["num_test_batches"]
-            loop = trainer.test_loop
-        if trainer.state.fn == TrainerFn.PREDICTING:
-            trainer.num_predict_batches = self._dumped_params["num_predict_batches"]
-            loop = trainer.predict_loop
-
-        loop.load_state_dict(self._dumped_params["loop_state_dict"], force_load_progress=True)
-        trainer.callbacks = [cb for cb in trainer.callbacks if not isinstance(cb, BatchSizeFinder)]
-
     def on_fit_start(self, trainer, pl_module):
         self.scale_batch_size(trainer, pl_module)

-        if self.early_exit:
-            self.pre_early_exit(trainer)
-        else:
-            trainer.callbacks = [cb for cb in trainer.callbacks if not isinstance(cb, BatchSizeFinder)]
-
     def on_validation_start(self, trainer, pl_module):
         if trainer.sanity_checking or trainer.state.fn != TrainerFn.VALIDATING:
             return

         self.scale_batch_size(trainer, pl_module)

-        if self.early_exit:
-            self.pre_early_exit(trainer)
-        else:
-            trainer.callbacks = [cb for cb in trainer.callbacks if not isinstance(cb, BatchSizeFinder)]
-
     def on_test_start(self, trainer, pl_module):
         self.scale_batch_size(trainer, pl_module)

-        if self.early_exit:
-            self.pre_early_exit(trainer)
-        else:
-            trainer.callbacks = [cb for cb in trainer.callbacks if not isinstance(cb, BatchSizeFinder)]
-
     def on_predict_start(self, trainer, pl_module):
         self.scale_batch_size(trainer, pl_module)

-        if self.early_exit:
-            self.pre_early_exit(trainer)
-        else:
-            trainer.callbacks = [cb for cb in trainer.callbacks if not isinstance(cb, BatchSizeFinder)]
-
-    def on_fit_end(self, trainer, pl_module):
-        if self.early_exit:
-            self.post_early_exit(trainer)
-
-    def on_validation_end(self, trainer, pl_module):
-        if trainer.sanity_checking or trainer.state.fn != TrainerFn.VALIDATING:
-            return
-
-        if self.early_exit:
-            self.post_early_exit(trainer)
-
-    def on_test_end(self, trainer, pl_module):
-        if self.early_exit:
-            self.post_early_exit(trainer)
-
-    def on_predict_end(self, trainer, pl_module):
-        if self.early_exit:
-            self.post_early_exit(trainer)
-
     def _adjust_batch_size(
         self,
         trainer: "pl.Trainer",

pytorch_lightning/trainer/connectors/callback_connector.py (+3, -3)
@@ -306,6 +306,6 @@ def _reorder_callbacks(callbacks: List[Callback]) -> List[Callback]:
         checkpoints = [c for c in callbacks if isinstance(c, ModelCheckpoint)]
         not_checkpoints = [c for c in callbacks if not isinstance(c, ModelCheckpoint)]
         callbacks = not_checkpoints + checkpoints
-        batch_size_finder_callback = [c for c in callbacks if isinstance(c, BatchSizeFinder)]
-        other_callbacks = [c for c in callbacks if not isinstance(c, BatchSizeFinder)]
-        return batch_size_finder_callback + other_callbacks
+        tuner_callbacks = [c for c in callbacks if isinstance(c, BatchSizeFinder)]
+        non_tuner_callbacks = [c for c in callbacks if not isinstance(c, BatchSizeFinder)]
+        return tuner_callbacks + non_tuner_callbacks
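Note: the rename above is cosmetic; the ordering rule itself is unchanged. As an illustration of that rule (assumed behaviour inferred from the code above; the `BatchSizeFinder` import path is inferred from the file path in this commit), tuner callbacks are moved to the front so their hooks fire before any other callback, while `ModelCheckpoint` instances stay at the end:

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.batch_size_finder import BatchSizeFinder

# User-supplied order:
callbacks = [ModelCheckpoint(), EarlyStopping(monitor="val_loss"), BatchSizeFinder()]
# Expected order after _reorder_callbacks: [BatchSizeFinder, EarlyStopping, ModelCheckpoint]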

pytorch_lightning/trainer/trainer.py (+23, -24)
@@ -71,7 +71,7 @@
 from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin
 from pytorch_lightning.trainer.states import RunningStage, TrainerFn, TrainerState, TrainerStatus
 from pytorch_lightning.tuner.lr_finder import _LRFinder
-from pytorch_lightning.tuner.tuning import Tuner
+from pytorch_lightning.tuner.tuning import _TunerExitException, Tuner
 from pytorch_lightning.utilities import (
     _AcceleratorType,
     _IPU_AVAILABLE,
@@ -678,6 +678,19 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
                 return spawn_output.trainer_results
             else:
                 return trainer_fn(*args, **kwargs)
+
+        except _TunerExitException as exception:
+            self.state.status = TrainerStatus.FINISHED
+            if distributed_available() and self.world_size > 1:
+                # try syncing remaing processes, kill otherwise
+                self.strategy.reconciliate_processes(traceback.format_exc())
+            self._on_exception()
+            # reset bookkeeping
+            self.state.stage = None
+            self._call_callback_hooks("on_exception", exception)
+            # shutdown workers
+            self._data_connector.teardown()
+
         # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7
         except KeyboardInterrupt as exception:
             rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
@@ -1027,9 +1040,11 @@ def tune(
         model: "pl.LightningModule",
         train_dataloaders: Optional[Union[TRAIN_DATALOADERS, LightningDataModule]] = None,
         val_dataloaders: Optional[EVAL_DATALOADERS] = None,
+        dataloaders: Optional[EVAL_DATALOADERS] = None,
         datamodule: Optional[LightningDataModule] = None,
         scale_batch_size_kwargs: Optional[Dict[str, Any]] = None,
         lr_find_kwargs: Optional[Dict[str, Any]] = None,
+        method="fit",
     ) -> Dict[str, Optional[Union[int, _LRFinder]]]:
         r"""
         Runs routines to tune hyperparameters before training.
@@ -1043,44 +1058,28 @@ def tune(

             val_dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them specifying validation samples.

+            dataloaders: A :class:`torch.utils.data.DataLoader` or a sequence of them specifying val/test/predict
+                samples used for running tuner on validation/testing/prediction.
+
             datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`.

             scale_batch_size_kwargs: Arguments for :func:`~pytorch_lightning.tuner.batch_size_scaling.scale_batch_size`

             lr_find_kwargs: Arguments for :func:`~pytorch_lightning.tuner.lr_finder.lr_find`
-        """
-        Trainer._log_api_event("tune")
-        self.state.fn = TrainerFn.TUNING
-        self.state.status = TrainerStatus.RUNNING
-        self.tuning = True
-
-        # if a datamodule comes in as the second arg, then fix it for the user
-        if isinstance(train_dataloaders, LightningDataModule):
-            datamodule = train_dataloaders
-            train_dataloaders = None
-        # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
-        if (train_dataloaders is not None or val_dataloaders is not None) and datamodule is not None:
-            raise MisconfigurationException(
-                "You cannot pass `train_dataloader` or `val_dataloaders` to `trainer.tune(datamodule=...)`"
-            )
-
-        # links data to the trainer
-        self._data_connector.attach_data(
-            model, train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, datamodule=datamodule
-        )

+            method: Method to run tuner on. It can be ``'fit', 'validate', 'test', 'predict'``
+        """
         result = self.tuner._tune(
             model,
             train_dataloaders,
             val_dataloaders,
+            dataloaders,
             datamodule,
             scale_batch_size_kwargs=scale_batch_size_kwargs,
             lr_find_kwargs=lr_find_kwargs,
+            method=method,
         )

-        assert self.state.stopped
-        self.tuning = False
-
         return result

     def _restore_modules_and_callbacks(self, checkpoint_path: Optional[_PATH] = None) -> None:
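Note: the datamodule validation and data-attachment logic is no longer done in `Trainer.tune` itself; the method now just forwards its arguments to `self.tuner._tune`. A minimal sketch of the extended call shape (illustrative; `model`, `train_loader`, and `val_loader` are placeholders for an existing `LightningModule` and dataloaders):

import pytorch_lightning as pl

trainer = pl.Trainer()

# Unchanged: tune against the fit stage (the default `method="fit"`).
trainer.tune(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

# New in this commit: run the tuner against another stage by combining the
# stage-agnostic `dataloaders` argument with `method`.
trainer.tune(model, dataloaders=val_loader, method="validate")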
