
Commit d46831e

Merge branch 'master' into refactor/loop-restructuring
2 parents 239b8ad + e0f2e04

44 files changed, 552 additions and 394 deletions

.azure-pipelines/gpu-benchmark.yml

Lines changed: 17 additions & 0 deletions
@@ -1,3 +1,20 @@
+# Python package
+# Create and test a Python package on multiple Python versions.
+# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
+# https://docs.microsoft.com/azure/devops/pipelines/languages/python
+
+trigger:
+  tags:
+    include:
+      - '*'
+  branches:
+    include:
+      - "master"
+      - "release/*"
+      - "refs/tags/*"
+
+pr: none
+
 schedules:
 - cron: "0 0 * * *"  # At the end of every day
   displayName: Daily midnight benchmark

.github/CODEOWNERS

Lines changed: 1 addition & 6 deletions
@@ -21,10 +21,10 @@
 # Packages
 /pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11
 /pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11
-/pytorch_lightning/cluster_environments @borda @tchaton @SeanNaren @carmocca @kaushikb11
 /pytorch_lightning/core @tchaton @SeanNaren @borda @carmocca @justusschock @kaushikb11
 /pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11
 /pytorch_lightning/loggers @tchaton @awaelchli @borda
+/pytorch_lightning/loggers/wandb.py @borisdayma
 /pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca
 /pytorch_lightning/overrides @tchaton @SeanNaren @borda
 /pytorch_lightning/plugins @tchaton @SeanNaren @awaelchli @justusschock

@@ -38,11 +38,6 @@
 /pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca
 /pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca

-# Metrics
-/pytorch_lightning/metrics/ @SkafteNicki @ananyahjha93 @justusschock
-/tests/metrics/ @SkafteNicki @ananyahjha93 @justusschock
-/docs/source/metrics.rst @SkafteNicki @ananyahjha93 @justusschock
-
 # API
 /pytorch_lightning/callbacks/base.py @williamfalcon @awaelchli @ananthsub @carmocca
 /pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca

CHANGELOG.md

Lines changed: 19 additions & 2 deletions
@@ -59,6 +59,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
     * Added Fault Tolerant Training to `DataFetcher` ([#8891](https://github.com/PyTorchLightning/pytorch-lightning/pull/8891))
     * Replaced old prefetch iterator with new `DataFetcher` in training loop ([#8953](https://github.com/PyTorchLightning/pytorch-lightning/pull/8953))
     * Added partial support for global random state fault-tolerance in map-style datasets ([#8950](https://github.com/PyTorchLightning/pytorch-lightning/pull/8950))
+    * Converted state to tuple explicitly when setting Python random state ([#9401](https://github.com/PyTorchLightning/pytorch-lightning/pull/9401))
+

 - Checkpoint saving & loading extensibility:
     * Added `CheckpointIO` to expose checkpoint IO from training type plugin ([#8743](https://github.com/PyTorchLightning/pytorch-lightning/pull/8743))

@@ -108,10 +110,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `on_exception` callback hook ([#9183](https://github.com/PyTorchLightning/pytorch-lightning/pull/9183))


-- Add a warning to deepspeed when inferring batch size ([#9221](https://github.com/PyTorchLightning/pytorch-lightning/pull/9221))
+- Added a warning to deepspeed when inferring batch size ([#9221](https://github.com/PyTorchLightning/pytorch-lightning/pull/9221))
+
+
+- Added `inference_mode` for evaluation and prediction ([#8813](https://github.com/PyTorchLightning/pytorch-lightning/pull/8813))


-- Added `inference_mode` for evaluation and prediction ([8813](https://github.com/PyTorchLightning/pytorch-lightning/pull/8813))
+- Added `remove_checkpoint` to `CheckpointIO` plugin by moving the responsibility from `ModelCheckpoint` Callback ([#9373](https://github.com/PyTorchLightning/pytorch-lightning/pull/9373))


 ### Changed

@@ -177,6 +182,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `DataModule` properties: `train_transforms`, `val_transforms`, `test_transforms`, `size`, `dims` ([#8851](https://github.com/PyTorchLightning/pytorch-lightning/pull/8851))


+- Deprecated `LightningModule.get_progress_bar_dict` and `Trainer.progress_bar_dict` in favor of `pytorch_lightning.callbacks.progress.base.get_standard_metrics` and `ProgressBarBase.get_metrics` ([#8985](https://github.com/PyTorchLightning/pytorch-lightning/pull/8985))
+
+
 - Deprecated `prepare_data_per_node` flag on Trainer and set it as a property of `DataHooks`, accessible in the `LightningModule` and `LightningDataModule` ([#8958](https://github.com/PyTorchLightning/pytorch-lightning/pull/8958))


@@ -323,9 +331,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed incorrect main progress bar indicator when resuming training mid-epoch ([#9310](https://github.com/PyTorchLightning/pytorch-lightning/pull/9310))


+- Fixed logging of nan parameters ([#9364](https://github.com/PyTorchLightning/pytorch-lightning/pull/9364))
+
+
 - Fixed `replace_sampler` missing the batch size under specific conditions ([#9367](https://github.com/PyTorchLightning/pytorch-lightning/pull/9367))


+- Fixed bug where the training step output needed to be `deepcopy`-ed ([#9349](https://github.com/PyTorchLightning/pytorch-lightning/pull/9349))
+
+
+- Fixed freeing data iterators in loop `on_run_end` ([#9386](https://github.com/PyTorchLightning/pytorch-lightning/pull/9386))
+
+
 ## [1.4.5] - 2021-08-31

 - Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))

CITATION.cff

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ title: "PyTorch Lightning"
 abstract: "The lightweight PyTorch wrapper for high-performance AI research. Scale your models, not the boilerplate."
 date-released: 2019-03-30
 authors:
-  - family-names: "William"
-    given-names: "Falcon"
+  - family-names: "Falcon"
+    given-names: "William"
   - name: "The PyTorch Lightning team"
 version: 1.4
 doi: 10.5281/zenodo.3828935

docs/source/common/lightning_module.rst

Lines changed: 0 additions & 6 deletions
@@ -1242,12 +1242,6 @@ backward
 .. automethod:: pytorch_lightning.core.lightning.LightningModule.backward
     :noindex:

-get_progress_bar_dict
-~~~~~~~~~~~~~~~~~~~~~
-
-.. automethod:: pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict
-    :noindex:
-
 on_before_backward
 ~~~~~~~~~~~~~~~~~~
docs/source/extensions/logging.rst

Lines changed: 3 additions & 3 deletions
@@ -245,13 +245,13 @@ Modifying the progress bar

 The progress bar by default already includes the training loss and version number of the experiment
 if you are using a logger. These defaults can be customized by overriding the
-:func:`~pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict` hook in your module.
+:func:`~pytorch_lightning.callbacks.base.ProgressBarBase.get_metrics` hook in your module.

 .. code-block:: python

-    def get_progress_bar_dict(self):
+    def get_metrics(self):
         # don't show the version number
-        items = super().get_progress_bar_dict()
+        items = super().get_metrics()
         items.pop("v_num", None)
         return items
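With the new API the override point lives on the progress bar callback rather than on the LightningModule. A minimal end-to-end sketch of that workflow, not part of this commit, and assuming the tqdm-based bar is exported as `pytorch_lightning.callbacks.ProgressBar` in this development version:

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ProgressBar  # assumed export of the tqdm-based bar


class NoVersionProgressBar(ProgressBar):
    def get_metrics(self, trainer, pl_module):
        # drop the experiment version number from the bar
        items = super().get_metrics(trainer, pl_module)
        items.pop("v_num", None)
        return items


trainer = Trainer(callbacks=[NoVersionProgressBar()])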

pytorch_lightning/accelerators/accelerator.py

Lines changed: 4 additions & 33 deletions
@@ -173,15 +173,7 @@ def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dat
     def training_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> STEP_OUTPUT:
         """The actual training step.

-        Args:
-            step_kwargs: the arguments for the models training step. Can consist of the following:
-
-                - batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
-                  The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
-                - batch_idx (int): Integer displaying index of this batch
-                - optimizer_idx (int): When using multiple optimizers, this argument will also be present.
-                - hiddens(:class:`~torch.Tensor`): Passed in if
-                  :paramref:`~pytorch_lightning.core.lightning.LightningModule.truncated_bptt_steps` > 0.
+        See :meth:`~pytorch_lightning.core.lightning.LightningModule.training_step` for more details
         """
         with self.precision_plugin.train_step_context():
             return self.training_type_plugin.training_step(*step_kwargs.values())

@@ -192,44 +184,23 @@ def post_training_step(self) -> None:
     def validation_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> Optional[STEP_OUTPUT]:
         """The actual validation step.

-        Args:
-            step_kwargs: the arguments for the models validation step. Can consist of the following:
-
-                - batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
-                  The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
-                - batch_idx (int): The index of this batch
-                - dataloader_idx (int): The index of the dataloader that produced this batch
-                  (only if multiple val dataloaders used)
+        See :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_step` for more details
         """
         with self.precision_plugin.val_step_context():
             return self.training_type_plugin.validation_step(*step_kwargs.values())

     def test_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> Optional[STEP_OUTPUT]:
         """The actual test step.

-        Args:
-            step_kwargs: the arguments for the models test step. Can consist of the following:
-
-                - batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
-                  The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
-                - batch_idx (int): The index of this batch.
-                - dataloader_idx (int): The index of the dataloader that produced this batch
-                  (only if multiple test dataloaders used).
+        See :meth:`~pytorch_lightning.core.lightning.LightningModule.test_step` for more details
         """
         with self.precision_plugin.test_step_context():
             return self.training_type_plugin.test_step(*step_kwargs.values())

     def predict_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> STEP_OUTPUT:
         """The actual predict step.

-        Args:
-            step_kwargs: the arguments for the models predict step. Can consist of the following:
-
-                - batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
-                  The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
-                - batch_idx (int): The index of this batch.
-                - dataloader_idx (int): The index of the dataloader that produced this batch
-                  (only if multiple predict dataloaders used).
+        See :meth:`~pytorch_lightning.core.lightning.LightningModule.predict_step` for more details
         """
         with self.precision_plugin.predict_step_context():
             return self.training_type_plugin.predict_step(*step_kwargs.values())
pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 8 additions & 26 deletions
@@ -486,19 +486,6 @@ def __init_triggers(
     def every_n_epochs(self) -> Optional[int]:
         return self._every_n_epochs

-    def _del_model(self, trainer: "pl.Trainer", filepath: str) -> None:
-        if trainer.should_rank_save_checkpoint and self._fs.exists(filepath):
-            self._fs.rm(filepath, recursive=True)
-            log.debug(f"Removed checkpoint: {filepath}")
-
-    def _save_model(self, trainer: "pl.Trainer", filepath: str) -> None:
-        # make paths
-        if trainer.should_rank_save_checkpoint:
-            self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)
-
-        # delegate the saving to the trainer
-        trainer.save_checkpoint(filepath, self.save_weights_only)
-
     def check_monitor_top_k(self, trainer: "pl.Trainer", current: Optional[torch.Tensor] = None) -> bool:
         if current is None:
             return False

@@ -671,10 +658,10 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[
         filepath = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST, monitor_candidates)
         filepath = os.path.join(self.dirpath, f"{filepath}{self.FILE_EXTENSION}")

-        self._save_model(trainer, filepath)
+        trainer.save_checkpoint(filepath, self.save_weights_only)

-        if self.last_model_path and self.last_model_path != filepath and trainer.should_rank_save_checkpoint:
-            self._del_model(trainer, self.last_model_path)
+        if self.last_model_path and self.last_model_path != filepath:
+            trainer.training_type_plugin.remove_checkpoint(self.last_model_path)

         self.last_model_path = filepath

@@ -696,15 +683,10 @@ def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidate
             return

         filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer)
-        self._save_model(trainer, filepath)
+        trainer.save_checkpoint(filepath, self.save_weights_only)

-        if (
-            self.save_top_k == 1
-            and self.best_model_path
-            and self.best_model_path != filepath
-            and trainer.should_rank_save_checkpoint
-        ):
-            self._del_model(trainer, self.best_model_path)
+        if self.save_top_k == 1 and self.best_model_path and self.best_model_path != filepath:
+            trainer.training_type_plugin.remove_checkpoint(self.best_model_path)

         self.best_model_path = filepath

@@ -748,10 +730,10 @@ def _update_best_and_save(
             f"Epoch {epoch:d}, global step {step:d}: {self.monitor} reached {current:0.5f}"
             f' (best {self.best_model_score:0.5f}), saving model to "{filepath}" as top {k}'
         )
-        self._save_model(trainer, filepath)
+        trainer.save_checkpoint(filepath, self.save_weights_only)

         if del_filepath is not None and filepath != del_filepath:
-            self._del_model(trainer, del_filepath)
+            trainer.training_type_plugin.remove_checkpoint(del_filepath)

     def to_yaml(self, filepath: Optional[Union[str, Path]] = None) -> None:
         """Saves the `best_k_models` dict containing the checkpoint paths with the corresponding scores to a YAML

pytorch_lightning/callbacks/progress/base.py

Lines changed: 71 additions & 0 deletions
@@ -11,7 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Dict, Union
+
+import pytorch_lightning as pl
 from pytorch_lightning.callbacks import Callback
+from pytorch_lightning.utilities import rank_zero_warn


 class ProgressBarBase(Callback):

@@ -177,3 +181,70 @@ def on_predict_epoch_start(self, trainer, pl_module):

     def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
         self._predict_batch_idx += 1
+
+    def get_metrics(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> Dict[str, Union[int, str]]:
+        r"""
+        Combines progress bar metrics collected from the trainer with standard metrics from get_standard_metrics.
+        Implement this to override the items displayed in the progress bar.
+
+        Here is an example of how to override the defaults:
+
+        .. code-block:: python
+
+            def get_metrics(self, trainer, model):
+                # don't show the version number
+                items = super().get_metrics(trainer, model)
+                items.pop("v_num", None)
+                return items
+
+        Return:
+            Dictionary with the items to be displayed in the progress bar.
+        """
+        standard_metrics = pl_module.get_progress_bar_dict()
+        pbar_metrics = trainer.progress_bar_metrics
+        duplicates = list(standard_metrics.keys() & pbar_metrics.keys())
+        if duplicates:
+            rank_zero_warn(
+                f"The progress bar already tracks a metric with the name(s) '{', '.join(duplicates)}' and"
+                f" `self.log('{duplicates[0]}', ..., prog_bar=True)` will overwrite this value. "
+                " If this is undesired, change the name or override `get_metrics()` in the progress bar callback.",
+                UserWarning,
+            )
+
+        return {**standard_metrics, **pbar_metrics}
+
+
+def get_standard_metrics(trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> Dict[str, Union[int, str]]:
+    r"""
+    Returns several standard metrics displayed in the progress bar, including the average loss value,
+    split index of BPTT (if used) and the version of the experiment when using a logger.
+
+    .. code-block::
+
+        Epoch 1: 4%|▎ | 40/1095 [00:03<01:37, 10.84it/s, loss=4.501, v_num=10]
+
+    Return:
+        Dictionary with the standard metrics to be displayed in the progress bar.
+    """
+    # call .item() only once but store elements without graphs
+    running_train_loss = trainer.fit_loop.running_loss.mean()
+    avg_training_loss = None
+    if running_train_loss is not None:
+        avg_training_loss = running_train_loss.cpu().item()
+    elif pl_module.automatic_optimization:
+        avg_training_loss = float("NaN")
+
+    items_dict = {}
+    if avg_training_loss is not None:
+        items_dict["loss"] = f"{avg_training_loss:.3g}"
+
+    if pl_module.truncated_bptt_steps > 0:
+        items_dict["split_idx"] = trainer.fit_loop.split_idx
+
+    if trainer.logger is not None and trainer.logger.version is not None:
+        version = trainer.logger.version
+        # show last 4 places of long version strings
+        version = version[-4:] if isinstance(version, str) else version
+        items_dict["v_num"] = version
+
+    return items_dict
pytorch_lightning/callbacks/progress/rich_progress.py

Lines changed: 10 additions & 3 deletions
@@ -46,8 +46,9 @@ def render(self, task) -> RenderableType:
 class MetricsTextColumn(ProgressColumn):
     """A column containing text."""

-    def __init__(self, trainer, stage):
+    def __init__(self, trainer, pl_module, stage):
         self._trainer = trainer
+        self._pl_module = pl_module
         self._stage = stage
         self._tasks = {}
         self._current_task_id = 0

@@ -64,7 +65,13 @@ def render(self, task) -> Text:
         if self._trainer.training and task.id != self._current_task_id:
             return self._tasks[task.id]
         _text = ""
-        for k, v in self._trainer.progress_bar_dict.items():
+        # TODO(@daniellepintz): make this code cleaner
+        progress_bar_callback = getattr(self._trainer, "progress_bar_callback", None)
+        if progress_bar_callback:
+            metrics = self._trainer.progress_bar_callback.get_metrics(self._trainer, self._pl_module)
+        else:
+            metrics = self._trainer.progress_bar_metrics
+        for k, v in metrics.items():
             _text += f"{k}: {round(v, 3) if isinstance(v, float) else v} "
         text = Text.from_markup(_text, style=None, justify="left")
         return text

@@ -163,7 +170,7 @@ def setup(self, trainer, pl_module, stage):
             "[",
             CustomTimeColumn(),
             ProcessingSpeedColumn(),
-            MetricsTextColumn(trainer, stage),
+            MetricsTextColumn(trainer, pl_module, stage),
             "]",
             console=self.console,
             refresh_per_second=self.refresh_rate,