Add BatchSizeFinder callback #11089
Merged
Commits (54 total; changes shown from 38 commits):

- f704d45 add BatchSizeFinderCallback callback (rohitgr7)
- 9ed344d temp rm from init (rohitgr7)
- b8e4b56 skip with lr_finder tests (rohitgr7)
- b4f6dea restore loops and intergrate early exit (rohitgr7)
- 5da0dd2 enable fast_dev_run test (rohitgr7)
- b30a207 add docs and tests (rohitgr7)
- dd909bf keep tune and remove early_exit (rohitgr7)
- edfe220 add more tests (rohitgr7)
- 6b99b72 patch lr finder (rohitgr7)
- 9f79123 disable skip (rohitgr7)
- 1361559 force_save and fix test (rohitgr7)
- 7fe3106 mypy and circular import fix (rohitgr7)
- 9097648 fix mypy (rohitgr7)
- 8d9008a fix (rohitgr7)
- 4405236 updates (rohitgr7)
- a71b953 rebase (rohitgr7)
- e7431c9 address reviews (rohitgr7)
- cd5c36f add more exceptions for unsupported functionalities (rohitgr7)
- 0312c97 move exception to setup (rohitgr7)
- 677c156 chlog (rohitgr7)
- 5783fbf unit test (rohitgr7)
- 566ee98 address reviews (rohitgr7)
- 847f971 Apply suggestions from code review (rohitgr7)
- ce4ee66 update (rohitgr7)
- e099004 update (rohitgr7)
- ff0b227 mypy (rohitgr7)
- 58575d6 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- 7d91fce Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- 7922e2d fix (rohitgr7)
- e294c42 use it as a util func (rohitgr7)
- 0127310 Merge remote-tracking branch 'origin/master' into ref/bs_finder_tuner (rohitgr7)
- 1531483 license (rohitgr7)
- 3dc10dc Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- deb1424 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- 61e74f4 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- f7ca854 mypy (rohitgr7)
- 269a5b6 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- c1dd622 mypy (rohitgr7)
- 795bd28 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- cb1c872 review (rohitgr7)
- 361fe6c fix (rohitgr7)
- 84f2200 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- 2952700 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- c1d8315 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 0b1fd82 fix (rohitgr7)
- 7ef720f Merge branch 'master' into ref/bs_finder_tuner (otaj)
- efc7bf1 updates (rohitgr7)
- c23169a updates (rohitgr7)
- 8cfb386 Merge branch 'master' into ref/bs_finder_tuner (rohitgr7)
- df4e2c1 fix import (rohitgr7)
- aa09a60 Protect callback attrs (carmocca)
- c0118da don't reset val dataloader (rohitgr7)
- 560bbd9 Merge remote-tracking branch 'origin/master' into ref/bs_finder_tuner (rohitgr7)
- 5418f7f update test (rohitgr7)
New file (146 lines added):

```python
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
BatchSizeFinder
===============

Finds optimal batch size
"""

from typing import Optional

import pytorch_lightning as pl
from pytorch_lightning.callbacks.callback import Callback
from pytorch_lightning.tuner.batch_size_scaling import scale_batch_size
from pytorch_lightning.utilities.exceptions import _TunerExitException, MisconfigurationException
from pytorch_lightning.utilities.parsing import lightning_hasattr
from pytorch_lightning.utilities.rank_zero import rank_zero_warn


class BatchSizeFinder(Callback):
    SUPPORTED_MODES = ("power", "binsearch")

    def __init__(
        self,
        mode: str = "power",
        steps_per_trial: int = 3,
        init_val: int = 2,
        max_trials: int = 25,
        batch_arg_name: str = "batch_size",
    ) -> None:
        """The ``BatchSizeFinder`` callback tries to find the largest batch size for a given model that does not
        give an out-of-memory (OOM) error. It works with both training and evaluation. All you need to do is add
        it as a callback inside ``Trainer`` and call ``trainer.fit/validate/test/predict()``. Internally, it calls
        the respective step function ``steps_per_trial`` times for each batch size until one of the batch sizes
        generates an OOM error.

        Args:
            mode: search strategy to update the batch size:

                - ``'power'``: Keep multiplying the batch size by 2 until we get an OOM error.
                - ``'binsearch'``: Initially keep multiplying by 2, and after encountering an OOM error
                  do a binary search between the last successful batch size and the batch size that failed.

            steps_per_trial: number of steps to run with a given batch size.
                Ideally 1 should be enough to test if an OOM error occurs,
                however in practice a few are needed.

            init_val: initial batch size to start the search with.

            max_trials: maximum number of batch size increases attempted before the
                algorithm is terminated.

            batch_arg_name: name of the attribute that stores the batch size.
                It is expected that the user has provided a model or datamodule that has a hyperparameter
                with that name. We will look for this attribute name in the following places:

                - ``model``
                - ``model.hparams``
                - ``trainer.datamodule`` (the datamodule passed to the tune method)
        """
        # TODO: Add input validation.
        mode = mode.lower()
        if mode not in self.SUPPORTED_MODES:
            raise MisconfigurationException(f"`mode` should be either of {self.SUPPORTED_MODES}")

        self.mode = mode
        self.steps_per_trial = steps_per_trial
        self.init_val = init_val
        self.max_trials = max_trials
        self.batch_arg_name = batch_arg_name
        self.optimal_batch_size = init_val
        self._early_exit = False

    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None:
        if trainer._accelerator_connector.is_distributed:
            raise MisconfigurationException("Batch size finder is not supported with distributed strategies.")

        running_stage = trainer.state.stage
        assert running_stage is not None
        dl_source = getattr(trainer._data_connector, f"_{running_stage.dataloader_prefix}_dataloader_source")

        # TODO: check if this can be enabled (#4040)
        if not trainer._data_connector._train_dataloader_source.is_module():
            raise MisconfigurationException(
                "Batch size finder cannot be used with dataloaders passed directly to `.fit()`. Please disable"
                " the feature or incorporate the dataloader into your LightningModule or LightningDataModule."
            )

        # TODO: Add support for multiple eval dataloaders
        if stage != "fit":
            dataloaders = dl_source.dataloader()
            if isinstance(dataloaders, list) and len(dataloaders) > 1:
                raise MisconfigurationException(
                    f"Batch size finder cannot be used with multiple {running_stage.dataloader_prefix} dataloaders."
                )

        if not lightning_hasattr(pl_module, self.batch_arg_name):
            raise MisconfigurationException(
                f"Field {self.batch_arg_name} not found in both `model` and `model.hparams`"
            )

        if (
            hasattr(pl_module, self.batch_arg_name)
            and hasattr(pl_module, "hparams")
            and self.batch_arg_name in pl_module.hparams
        ):
            rank_zero_warn(
                f"Field `model.{self.batch_arg_name}` and `model.hparams.{self.batch_arg_name}` are mutually"
                f" exclusive! `model.{self.batch_arg_name}` will be used as the initial batch size for scaling."
                " If this is not the intended behavior, please remove either one."
            )

    def scale_batch_size(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        new_size = scale_batch_size(
            trainer, pl_module, self.mode, self.steps_per_trial, self.init_val, self.max_trials, self.batch_arg_name
        )

        self.optimal_batch_size = new_size
        if self._early_exit:
            raise _TunerExitException()

    def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        self.scale_batch_size(trainer, pl_module)

    def on_validation_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        if trainer.sanity_checking or trainer.state.fn != "validate":
            return

        self.scale_batch_size(trainer, pl_module)

    def on_test_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        self.scale_batch_size(trainer, pl_module)

    def on_predict_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        self.scale_batch_size(trainer, pl_module)
```
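The docstring describes two search strategies, `'power'` and `'binsearch'`. The following standalone sketch (not part of the PR, and much simpler than Lightning's actual `scale_batch_size` tuner) simulates both: `fits` is a hypothetical stand-in for "running `steps_per_trial` steps at this batch size without an OOM error".

```python
# Toy simulation of the 'power' and 'binsearch' strategies from the docstring.
# `fits(bs)` stands in for a trial run that either succeeds or hits OOM.

def find_batch_size(fits, mode="power", init_val=2, max_trials=25):
    """Return the largest batch size for which `fits(bs)` is True."""
    bs = init_val
    last_good = None
    for _ in range(max_trials):
        if not fits(bs):
            break  # first failing batch size found
        last_good = bs
        bs *= 2  # 'power' phase: keep doubling
    else:
        return last_good  # ran out of trials without a failure
    if mode == "power" or last_good is None:
        return last_good
    # 'binsearch' phase: binary search between last success and first failure
    low, high = last_good, bs
    while low + 1 < high:
        mid = (low + high) // 2
        if fits(mid):
            low = mid
        else:
            high = mid
    return low
```

With a toy memory limit of 100 samples, `'power'` stops at 64 (the last power of two that fits), while `'binsearch'` refines the answer to 100. In actual use, the callback is simply passed to `Trainer(callbacks=[...])` and the search runs when `trainer.fit/validate/test/predict()` is called, as the docstring states.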
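The `batch_arg_name` lookup order the docstring lists (`model`, then `model.hparams`, then `trainer.datamodule`) can be illustrated with a small sketch. This is a hypothetical helper for illustration only, not Lightning's `lightning_hasattr`; the class names are made up.

```python
# Toy illustration of the attribute lookup order for `batch_arg_name`:
# 1) the model itself, 2) model.hparams, 3) the attached datamodule.

def resolve_batch_arg(model, datamodule=None, name="batch_size"):
    if hasattr(model, name):
        return getattr(model, name)
    hparams = getattr(model, "hparams", None)
    if hparams is not None and name in hparams:
        return hparams[name]
    if datamodule is not None and hasattr(datamodule, name):
        return getattr(datamodule, name)
    raise AttributeError(f"no `{name}` found on model, model.hparams, or datamodule")

class ToyModel:  # hypothetical stand-in for a LightningModule
    pass

class ToyDataModule:  # hypothetical stand-in for a LightningDataModule
    pass
```

This also shows why the `setup` hook warns when both `model.batch_size` and `model.hparams.batch_size` exist: the direct attribute shadows the hparams entry, so only one is consulted.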
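The `_early_exit` flag in the diff works by raising `_TunerExitException` after the search, so the surrounding run can stop before any real training happens. A minimal sketch of that control flow, with all names invented for illustration (Lightning's actual loop handling is more involved):

```python
# Toy sketch of the early-exit mechanism: a callback raises a sentinel
# exception that the run loop catches to stop cleanly after tuning.

class TunerExit(Exception):  # stand-in for Lightning's `_TunerExitException`
    pass

class FinderCallback:
    def __init__(self, early_exit=False):
        self.early_exit = early_exit
        self.optimal_batch_size = None

    def on_fit_start(self):
        self.optimal_batch_size = 64  # pretend the search found this value
        if self.early_exit:
            raise TunerExit()

def run_fit(callback):
    """Invoke the callback hook; a TunerExit stops before 'training' begins."""
    try:
        callback.on_fit_start()
    except TunerExit:
        return "exited after tuning"
    return "trained"
```

Either way, the found value remains available on the callback as `optimal_batch_size`, matching the attribute set in `scale_batch_size` above.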