
Commit 3d490cd

Merge branch 'master' into ci/minimize-docker
committed (2 parents: 4193f24 + 5262b63)

File tree

25 files changed: +397 / -163 lines

.azure-pipelines/gpu-tests.yml

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ jobs:

   - bash: |
       python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
-      pip install fairscale>=0.3.4
+      pip install fairscale==0.4.0
       pip install deepspeed==0.5.4
       pip install . --requirement requirements/devel.txt
       pip list
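
The `python -c` one-liner above is dense; unrolled into ordinary Python it does nothing more than strip the `horovod` entry from the extras file before installation. This is a readability sketch only, not part of the diff:

    # Equivalent of the CI one-liner: drop any `horovod` line from requirements/extra.txt.
    fname = "requirements/extra.txt"
    with open(fname) as f:
        lines = [line for line in f.readlines() if "horovod" not in line]
    with open(fname, "w") as f:
        f.writelines(lines)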

CHANGELOG.md

Lines changed: 15 additions & 2 deletions

@@ -220,7 +220,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 * Implemented `DeepSpeedPlugin._setup_model_and_optimizers` ([#10009](https://github.com/PyTorchLightning/pytorch-lightning/pull/10009), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064))
 * Implemented `{DDPShardedPlugin,DDPShardedSpawnPlugin}._setup_model_and_optimizers` ([#10028](https://github.com/PyTorchLightning/pytorch-lightning/pull/10028), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064))
 * Added optional `model` argument to the `optimizer_step` methods in accelerators and plugins ([#10023](https://github.com/PyTorchLightning/pytorch-lightning/pull/10023))
-
+* Updated precision attributes in `DeepSpeedPlugin` ([#10164](https://github.com/PyTorchLightning/pytorch-lightning/pull/10164))
+* Added the ability to return a result from rank 0 in `DDPSpawnPlugin.spawn` ([#10162](https://github.com/PyTorchLightning/pytorch-lightning/pull/10162))


 - Added `XLACheckpointIO` plugin ([#9972](https://github.com/PyTorchLightning/pytorch-lightning/pull/9972))

@@ -343,6 +344,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Moved the `optimizer_step` and `clip_gradients` hook from the `Accelerator` and `TrainingTypePlugin` into the `PrecisionPlugin` ([#10143](https://github.com/PyTorchLightning/pytorch-lightning/pull/10143), [#10029](https://github.com/PyTorchLightning/pytorch-lightning/pull/10029))


+- `NativeMixedPrecisionPlugin` and its subclasses now take an optional `GradScaler` instance ([#10055](https://github.com/PyTorchLightning/pytorch-lightning/pull/10055))
+
+
 - Updated several places in the loops and trainer to access `training_type_plugin` directly instead of `accelerator` ([#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901))


@@ -444,10 +448,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `ClusterEnvironment.creates_children()` in favor of `ClusterEnvironment.creates_processes_externally` (property) ([#10106](https://github.com/PyTorchLightning/pytorch-lightning/pull/10106))


-
 - Deprecated `PrecisionPlugin.master_params()` in favor of `PrecisionPlugin.main_params()` ([#10105](https://github.com/PyTorchLightning/pytorch-lightning/pull/10105))


+- Deprecated `lr_sch_names` from `LearningRateMonitor` ([#10066](https://github.com/PyTorchLightning/pytorch-lightning/pull/10066))
+
+
 ### Removed

 - Removed deprecated `metrics` ([#8586](https://github.com/PyTorchLightning/pytorch-lightning/pull/8586/))

@@ -656,9 +662,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed undesired side effects being caused by `Trainer` patching dataloader methods on the `LightningModule` ([#9764](https://github.com/PyTorchLightning/pytorch-lightning/pull/9764))


+- Fixed monitor value in `ModelCheckpoint` getting moved to the wrong device in a special case where it becomes NaN ([#10118](https://github.com/PyTorchLightning/pytorch-lightning/pull/10118))
+
+
 - Fixed creation of `dirpath` in `BaseProfiler` if it doesn't exist ([#10073](https://github.com/PyTorchLightning/pytorch-lightning/pull/10073))


+- Fixed an issue with `pl.utilities.seed.reset_seed` converting the `PL_SEED_WORKERS` environment variable to `bool` ([#10099](https://github.com/PyTorchLightning/pytorch-lightning/pull/10099))
+
+
+
 ## [1.4.9] - 2021-09-30

 - Fixed `lr_find` to generate same results on multiple calls ([#9704](https://github.com/PyTorchLightning/pytorch-lightning/pull/9704))

dockers/base-cuda/Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -108,11 +108,11 @@ RUN \

 RUN \
     # install FairScale
-    pip install fairscale>=0.3.4
+    pip install fairscale==0.4.0

 RUN \
     # install DeepSpeed
-    pip install deepspeed==0.4.0
+    pip install deepspeed==0.5.4

 RUN \
     # Show what we have
docs/source/advanced/plugins_registry.rst (new file)

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+Training Type Plugins Registry
+==============================
+
+.. warning:: The Plugins Registry is experimental and subject to change.
+
+Lightning includes a registry that holds information about Training Type plugins and allows for the registration of new custom plugins.
+
+The Plugins are assigned strings that identify them, such as "ddp", "deepspeed_stage_2_offload", and so on.
+It also returns the optional description and parameters for initialising the Plugin that were defined during registration.
+
+
+.. code-block:: python
+
+    # Training with the DDP Plugin with `find_unused_parameters` as False
+    trainer = Trainer(strategy="ddp_find_unused_parameters_false", accelerator="gpu", devices=4)
+
+    # Training with DeepSpeed ZeRO Stage 3 and CPU Offload
+    trainer = Trainer(strategy="deepspeed_stage_3_offload", accelerator="gpu", devices=3)
+
+    # Training with the TPU Spawn Plugin with `debug` as True
+    trainer = Trainer(strategy="tpu_spawn_debug", accelerator="tpu", devices=8)
+
+
+Additionally, you can pass your custom registered training type plugins to the ``strategy`` argument.
+
+.. code-block:: python
+
+    from pytorch_lightning.plugins import DDPPlugin, TrainingTypePluginsRegistry, CheckpointIO
+
+
+    class CustomCheckpointIO(CheckpointIO):
+        def save_checkpoint(self, checkpoint: Dict[str, Any], path: Union[str, Path]) -> None:
+            ...
+
+        def load_checkpoint(self, path: Union[str, Path]) -> Dict[str, Any]:
+            ...
+
+
+    custom_checkpoint_io = CustomCheckpointIO()
+
+    # Register the DDP Plugin with your custom CheckpointIO plugin
+    TrainingTypePluginsRegistry.register(
+        "ddp_custom_checkpoint_io",
+        DDPPlugin,
+        description="DDP Plugin with custom checkpoint io plugin",
+        checkpoint_io=custom_checkpoint_io,
+    )
+
+    trainer = Trainer(strategy="ddp_custom_checkpoint_io", accelerator="gpu", devices=2)
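
As a rough companion to the new file above (not part of the diff): once a name has been registered it can be inspected and resolved through the registry object itself. This is a minimal sketch that assumes the registry exposes `available_plugins()` and a `get()` that instantiates the plugin from the stored class and init parameters.

    from pytorch_lightning.plugins import TrainingTypePluginsRegistry

    # Assumed API: list every registered name, including the custom one added above.
    print(TrainingTypePluginsRegistry.available_plugins())

    # Assumed API: `get()` builds the plugin from the stored class and parameters,
    # so the returned DDPPlugin already carries the custom `checkpoint_io`.
    plugin = TrainingTypePluginsRegistry.get("ddp_custom_checkpoint_io")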

docs/source/extensions/accelerators.rst

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ One to handle differences from the training routine and one to handle different
     from pytorch_lightning.plugins import NativeMixedPrecisionPlugin, DDPPlugin

     accelerator = GPUAccelerator(
-        precision_plugin=NativeMixedPrecisionPlugin(),
+        precision_plugin=NativeMixedPrecisionPlugin(16, "cuda"),
         training_type_plugin=DDPPlugin(),
     )
     trainer = Trainer(accelerator=accelerator)

docs/source/index.rst

Lines changed: 1 addition & 0 deletions

@@ -66,6 +66,7 @@ PyTorch Lightning
    advanced/checkpoint_io
    common/optimizers
    advanced/profiler
+   advanced/plugins_registry
    advanced/sequences
    common/single_gpu
    advanced/training_tricks

docs/source/starter/new-project.rst

Lines changed: 38 additions & 16 deletions

@@ -134,6 +134,7 @@ Under the hood a LightningModule is still just a :class:`torch.nn.Module` that g
 - The Train loop
 - The Validation loop
 - The Test loop
+- The Prediction loop
 - The Model or system of Models
 - The Optimizer

@@ -181,7 +182,7 @@ More details in :doc:`lightning module <../common/lightning_module>` docs.
 Step 2: Fit with Lightning Trainer
 **********************************

-First, define the data however you want. Lightning just needs a :class:`~torch.utils.data.DataLoader` for the train/val/test splits.
+First, define the data however you want. Lightning just needs a :class:`~torch.utils.data.DataLoader` for the train/val/test/predict splits.

 .. code-block:: python

@@ -258,7 +259,8 @@ Turn off automatic optimization and you control the train loop!


     def training_step(self, batch, batch_idx):
-        # access your optimizers with use_pl_optimizer=False. Default is True
+        # access your optimizers with use_pl_optimizer=False. Default is True,
+        # setting use_pl_optimizer=True will maintain plugin/precision support
         opt_a, opt_b = self.optimizers(use_pl_optimizer=True)

         loss_a = self.generator(batch)

@@ -321,7 +323,7 @@ You can also add a forward method to do predictions however you want.


     autoencoder = LitAutoEncoder()
-    autoencoder = autoencoder(torch.rand(1, 28 * 28))
+    embedding = autoencoder(torch.rand(1, 28 * 28))


 .. code-block:: python

@@ -371,9 +373,9 @@ a forward method or trace only the sub-models you need.

 --------------------

-Using CPUs/GPUs/TPUs
-====================
-It's trivial to use CPUs, GPUs or TPUs in Lightning. There's **NO NEED** to change your code, simply change the :class:`~pytorch_lightning.trainer.Trainer` options.
+Using CPUs/GPUs/TPUs/IPUs
+=========================
+It's trivial to use CPUs, GPUs, TPUs or IPUs in Lightning. There's **NO NEED** to change your code, simply change the :class:`~pytorch_lightning.trainer.Trainer` options.

 .. testcode::

@@ -423,6 +425,11 @@ Without changing a SINGLE line of your code, you can now do the following with t
     # using only half the training data and checking validation every quarter of a training epoch
     trainer = pl.Trainer(tpu_cores=8, precision=16, limit_train_batches=0.5, val_check_interval=0.25)

+.. code-block:: python
+
+    # Train on IPUs
+    trainer = pl.Trainer(ipus=8)
+
 -----------

 Checkpoints

@@ -449,7 +456,7 @@ If you prefer to do it manually, here's the equivalent

 Data flow
 =========
-Each loop (training, validation, test) has three hooks you can implement:
+Each loop (training, validation, test, predict) has three hooks you can implement:

 - x_step
 - x_step_end

@@ -474,8 +481,8 @@ The equivalent in Lightning is:
         return prediction


-    def training_epoch_end(self, training_step_outputs):
-        for prediction in predictions:
+    def training_epoch_end(self, outs):
+        for out in outs:
             ...

 In the event that you use DP or DDP2 distributed modes (ie: split a batch across GPUs),

@@ -508,9 +515,9 @@ The lightning equivalent is:
     def training_step_end(self, losses):
         gpu_0_loss = losses[0]
         gpu_1_loss = losses[1]
-        return (gpu_0_loss + gpu_1_loss) * 1 / 2
+        return (gpu_0_loss + gpu_1_loss) / 2

-.. tip:: The validation and test loops have the same structure.
+.. tip:: The validation, test and prediction loops have the same structure.

 -----------------

@@ -648,8 +655,10 @@ Make your data code reusable by organizing it into a :class:`~pytorch_lightning.
         if stage in (None, "fit"):
             mnist_train = MNIST(os.getcwd(), train=True, transform=transform)
             self.mnist_train, self.mnist_val = random_split(mnist_train, [55000, 5000])
-        if stage == (None, "test"):
+        if stage == "test":
             self.mnist_test = MNIST(os.getcwd(), train=False, transform=transform)
+        if stage == "predict":
+            self.mnist_predict = MNIST(os.getcwd(), train=False, transform=transform)

     # return the dataloader for each split
     def train_dataloader(self):

@@ -664,6 +673,10 @@ Make your data code reusable by organizing it into a :class:`~pytorch_lightning.
         mnist_test = DataLoader(self.mnist_test, batch_size=self.batch_size)
         return mnist_test

+    def predict_dataloader(self):
+        mnist_predict = DataLoader(self.mnist_predict, batch_size=self.batch_size)
+        return mnist_predict
+
 :class:`~pytorch_lightning.core.datamodule.LightningDataModule` is designed to enable sharing and reusing data splits
 and transforms across different projects. It encapsulates all the steps needed to process data: downloading,
 tokenizing, processing etc.

@@ -681,11 +694,17 @@ the :class:`~pytorch_lightning.trainer.Trainer`:

     # train
     trainer = pl.Trainer()
-    trainer.fit(model, dm)
+    trainer.fit(model, datamodule=dm)
+
+    # validate
+    trainer.validate(datamodule=dm)

     # test
     trainer.test(datamodule=dm)

+    # predict
+    predictions = trainer.predict(datamodule=dm)
+
 DataModules are specifically useful for building models based on data. Read more on :doc:`datamodules <../extensions/datamodules>`.

 ------

@@ -701,15 +720,18 @@ Lightning has many tools for debugging. Here is an example of just a few of them

 .. testcode::

-    # Automatically overfit the sane batch of your model for a sanity test
+    # Automatically overfit the same batch of your model for a sanity test
     trainer = Trainer(overfit_batches=1)

 .. testcode::

-    # unit test all the code- hits every line of your code once to see if you have bugs,
+    # unit test all the code - hits every line of your code once to see if you have bugs,
     # instead of waiting hours to crash on validation
     trainer = Trainer(fast_dev_run=True)

+    # unit test all the code - hits every line of your code with 4 batches
+    trainer = Trainer(fast_dev_run=4)
+
 .. testcode::

     # train only 20% of an epoch

@@ -739,7 +761,7 @@ Once you define and train your first Lightning model, you might want to try othe
 - :doc:`Automatically find a good learning rate <../advanced/lr_finder>`
 - :ref:`Load checkpoints directly from S3 <common/weights_loading:Checkpoint Loading>`
 - :doc:`Scale to massive compute clusters <../clouds/cluster>`
-- :doc:`Use multiple dataloaders per train/val/test loop <../guides/data>`
+- :doc:`Use multiple dataloaders per train/val/test/predict loop <../guides/data>`
 - :ref:`Use multiple optimizers to do reinforcement learning or even GANs <common/optimizers:Use multiple optimizers (like GANs)>`

 Or read our :doc:`Guide <../starter/introduction_guide>` to learn more!
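
The hunks above thread the new prediction loop through the starter guide: a predict split in `setup`, a `predict_dataloader`, and `trainer.predict`. The loop itself drives the `predict_step` hook on the LightningModule; below is a minimal, self-contained sketch of that flow. `TinyModel`, its layer sizes, and the random data are invented for illustration and are not part of this diff.

    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    import pytorch_lightning as pl


    class TinyModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(32, 2)

        def forward(self, x):
            return self.layer(x)

        def predict_step(self, batch, batch_idx, dataloader_idx=None):
            # the prediction loop calls this hook for every batch of the predict dataloader
            (x,) = batch
            return self(x)


    predict_loader = DataLoader(TensorDataset(torch.randn(8, 32)), batch_size=4)
    trainer = pl.Trainer()
    # returns a list with one output tensor per batch
    predictions = trainer.predict(TinyModel(), dataloaders=predict_loader)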

pytorch_lightning/callbacks/lr_monitor.py

Lines changed: 13 additions & 2 deletions

@@ -28,6 +28,7 @@
 import pytorch_lightning as pl
 from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities.distributed import rank_zero_deprecation
 from pytorch_lightning.utilities.exceptions import MisconfigurationException


@@ -93,7 +94,7 @@ def __init__(self, logging_interval: Optional[str] = None, log_momentum: bool =
         self.logging_interval = logging_interval
         self.log_momentum = log_momentum
         self.lrs: Dict[str, List[float]] = {}
-        self.lr_sch_names: List[str] = []
+        self._lr_sch_names: List[str] = []

     def on_train_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None:
         """Called before training, determines unique names for all lr schedulers in the case of multiple of the

@@ -334,6 +335,16 @@ def _check_duplicates_and_update_name(
         name_list = [self._add_suffix(name, param_groups, i) for i in range(len(param_groups))]

         if add_lr_sch_names:
-            self.lr_sch_names.append(name)
+            self._lr_sch_names.append(name)

         return name_list
+
+    @property
+    def lr_sch_names(self) -> List[str]:
+        # TODO remove `lr_sch_names` and `add_lr_sch_names` argument in v1.7.0
+        rank_zero_deprecation(
+            "`LearningRateMonitor.lr_sch_names` has been deprecated in v1.5 and will be removed in 1.7."
+            " Consider accessing them using `LearningRateMonitor.lrs.keys()` which will return"
+            " the names of all the optimizers, even those without a scheduler."
+        )
+        return self._lr_sch_names
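
For user code, the deprecation above amounts to a one-line migration; a minimal sketch (note that `lrs` is only populated once the monitor starts logging learning rates during training):

    from pytorch_lightning.callbacks import LearningRateMonitor

    lr_monitor = LearningRateMonitor(logging_interval="epoch")

    # Deprecated by this change: still works, but now emits a deprecation warning.
    names = lr_monitor.lr_sch_names

    # Replacement suggested by the warning text: the keys of the tracked learning rates.
    names = list(lr_monitor.lrs.keys())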

pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 1 addition & 1 deletion

@@ -700,7 +700,7 @@ def _update_best_and_save(

         # do not save nan, replace with +/- inf
         if isinstance(current, torch.Tensor) and torch.isnan(current):
-            current = torch.tensor(float("inf" if self.mode == "min" else "-inf"))
+            current = torch.tensor(float("inf" if self.mode == "min" else "-inf"), device=current.device)

         filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer, del_filepath)
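A small illustration of why the `device=` argument matters here (standalone sketch, not library code): without it, the replacement tensor is created on the CPU even when the monitored value lives on a GPU, so later comparisons against other checkpoint candidates would mix devices.

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    current = torch.tensor(float("nan"), device=device)  # stand-in for a NaN monitor value

    if torch.isnan(current):
        # mirrors the fix: keep the +inf replacement on the same device as the original value
        current = torch.tensor(float("inf"), device=current.device)

    assert current.device.type == device
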
pytorch_lightning/plugins/precision/deepspeed_precision.py

Lines changed: 5 additions & 1 deletion

@@ -21,9 +21,13 @@
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
 from pytorch_lightning.utilities import GradClipAlgorithmType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.warnings import WarningCache

+if _DEEPSPEED_AVAILABLE:
+    from deepspeed import DeepSpeedEngine
+
 warning_cache = WarningCache()


@@ -40,7 +44,7 @@ def backward(self, model: "pl.LightningModule", closure_loss: Tensor, *args: Any
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles"
                 " the backward logic internally."
             )
-        deepspeed_engine = model.trainer.model
+        deepspeed_engine: DeepSpeedEngine = model.trainer.model
        deepspeed_engine.backward(closure_loss, *args, **kwargs)

     def _run_backward(self, tensor: Tensor, model: Module, *args: Any, **kwargs: Any) -> None:
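
The guarded import plus annotation above is a common optional-dependency pattern; the sketch below restates it outside the library (the helper and function names are invented for illustration). The key point is that a local variable annotation is never evaluated at runtime (PEP 526), so annotating with `DeepSpeedEngine` stays safe even when DeepSpeed is not installed and the import is skipped.

    from importlib.util import find_spec

    # simplified stand-in for pytorch_lightning.utilities.imports._DEEPSPEED_AVAILABLE
    _DEEPSPEED_AVAILABLE = find_spec("deepspeed") is not None

    if _DEEPSPEED_AVAILABLE:
        from deepspeed import DeepSpeedEngine


    def run_backward(engine_like, loss):
        # a local annotation is not evaluated, so this line cannot raise NameError
        # when `DeepSpeedEngine` was never imported; it only documents the expected type
        engine: DeepSpeedEngine = engine_like
        engine.backward(loss)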

pytorch_lightning/plugins/precision/fully_sharded_native_amp.py

Lines changed: 1 addition & 3 deletions

@@ -18,9 +18,7 @@


 class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin):
-    """Mixed Precision for Full Sharded Training."""
-
-    precision = "mixed"
+    """Native AMP for Fully Sharded Training."""

     def clip_grad_by_norm(self, *_: Any, **__: Any) -> None:
         # see https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html