diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index f009ea1b9bb0b..8f49b3346009d 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -51,7 +51,7 @@ jobs: - bash: | python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install fairscale>=0.3.4 - pip install "deepspeed==0.4.3" # FIXME: bug with >= 0.4.4 + pip install deepspeed==0.5.4 pip install . --requirement requirements/devel.txt pip list displayName: 'Install dependencies' @@ -106,10 +106,10 @@ jobs: set -e python -m pytest pl_examples -v --maxfail=2 --durations=0 bash pl_examples/run_examples.sh --trainer.gpus=1 - bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=ddp - bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=ddp --trainer.precision=16 - bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=dp - bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=dp --trainer.precision=16 + bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=ddp + bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=ddp --trainer.precision=16 + bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=dp + bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=dp --trainer.precision=16 env: PL_USE_MOCKED_MNIST: "1" displayName: 'Testing: examples' diff --git a/.gitignore b/.gitignore index 6ad0671fb3306..7b1247433e7b4 100644 --- a/.gitignore +++ b/.gitignore @@ -156,3 +156,4 @@ cifar-10-batches-py *.pt # ctags tags +.tags diff --git a/CHANGELOG.md b/CHANGELOG.md index 70044b87791f6..06d3f824e8470 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unReleased] - 2021-MM-DD +## [1.5.0] - 2021-MM-DD ### Added +- Added support for monitoring the learning rate without schedulers in `LearningRateMonitor` ([#9786](https://github.com/PyTorchLightning/pytorch-lightning/issues/9786)) + + - Register `ShardedTensor` state dict hooks in `LightningModule.__init__` if the pytorch version supports `ShardedTensor` ([#8944](https://github.com/PyTorchLightning/pytorch-lightning/pull/8944)) @@ -163,6 +166,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a warning when an unknown key is encountered in optimizer configuration, and when `OneCycleLR` is used with `"interval": "epoch"` ([#9666](https://github.com/PyTorchLightning/pytorch-lightning/pull/9666)) +- Added `DeviceStatsMonitor` callback ([#9712](https://github.com/PyTorchLightning/pytorch-lightning/pull/9712)) + + - Added `enable_progress_bar` to Trainer constructor ([#9664](https://github.com/PyTorchLightning/pytorch-lightning/pull/9664)) @@ -175,13 +181,36 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Enabled automatic parameters tying for TPUs ([#9525](https://github.com/PyTorchLightning/pytorch-lightning/pull/9525)) +- Raise a `MisconfigurationException` when trainer functions are called with `ckpt_path="best"` but `checkpoint_callback` isn't configured ([#9841](https://github.com/PyTorchLightning/pytorch-lightning/pull/9841)) + + - Added support for `torch.autograd.set_detect_anomaly` through `Trainer` constructor argument `detect_anomaly` ([#9848](https://github.com/PyTorchLightning/pytorch-lightning/pull/9848)) +- Added a `len` method to `LightningDataModule` ([#9895](https://github.com/PyTorchLightning/pytorch-lightning/pull/9895)) + + +- Added `enable_model_summary` flag to Trainer ([#9699](https://github.com/PyTorchLightning/pytorch-lightning/pull/9699)) + + +- Added `strategy` argument to Trainer ([#8597](https://github.com/PyTorchLightning/pytorch-lightning/pull/8597)) + + +- Added `kfold` example for loop customization ([#9965](https://github.com/PyTorchLightning/pytorch-lightning/pull/9965)) + + +- LightningLite: + * Added `PrecisionPlugin.forward_context`, making it the default implementation for all `{train,val,test,predict}_step_context()` methods ([#9988](https://github.com/PyTorchLightning/pytorch-lightning/pull/9988)) + + ### Changed +- Setting `Trainer(accelerator="ddp_cpu")` now does not spawn a subprocess if `num_processes` is kept `1` along with `num_nodes > 1` ([#9603](https://github.com/PyTorchLightning/pytorch-lightning/pull/9603)). + + - Module imports are now catching `ModuleNotFoundError` instead of `ImportError` ([#9867](https://github.com/PyTorchLightning/pytorch-lightning/pull/9867)) + - `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with new [neptune-client](https://github.com/neptune-ai/neptune-client) API ([#6867](https://github.com/PyTorchLightning/pytorch-lightning/pull/6867)). Old [neptune-client](https://github.com/neptune-ai/neptune-client) API is supported by `NeptuneClient` from [neptune-contrib](https://github.com/neptune-ai/neptune-contrib) repo. @@ -257,6 +286,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed `HorovodPlugin.all_gather` to return a `torch.Tensor` instead of a list ([#9696](https://github.com/PyTorchLightning/pytorch-lightning/pull/9696)) +- Changed Trainer connectors to be protected attributes: + * Configuration Validator ([#9779](https://github.com/PyTorchLightning/pytorch-lightning/pull/9779)) + + - Restore `current_epoch` and `global_step` irrespective of trainer task ([#9413](https://github.com/PyTorchLightning/pytorch-lightning/pull/9413)) @@ -269,8 +302,25 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Update the logic to check for accumulation steps with deepspeed ([#9826](https://github.com/PyTorchLightning/pytorch-lightning/pull/9826)) +- `pytorch_lightning.utilities.grads.grad_norm` now raises an exception if parameter `norm_type <= 0` ([#9765](https://github.com/PyTorchLightning/pytorch-lightning/pull/9765)) + + + +- Updated error message for interactive incompatible plugins ([#9896](https://github.com/PyTorchLightning/pytorch-lightning/pull/9896)) + + +- Updated several places in the loops and trainer to access `training_type_plugin` directly instead of `accelerator` ([#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901)) + + + ### Deprecated +- Deprecated trainer argument `terminate_on_nan` in favour of `detect_anomaly`([#9175](https://github.com/PyTorchLightning/pytorch-lightning/pull/9175)) + + +- Deprecated `Trainer.terminate_on_nan` public attribute access ([#9849](https://github.com/PyTorchLightning/pytorch-lightning/pull/9849)) + + - Deprecated `LightningModule.summarize()` in favor of `pytorch_lightning.utilities.model_summary.summarize()` @@ -310,7 +360,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated passing `progress_bar_refresh_rate` to the `Trainer` constructor in favor of adding the `ProgressBar` callback with `refresh_rate` directly to the list of callbacks, or passing `enable_progress_bar=False` to disable the progress bar ([#9616](https://github.com/PyTorchLightning/pytorch-lightning/pull/9616)) -- Deprecate `LightningDistributed` and move the broadcast logic to `DDPPlugin` and `DDPSpawnPlugin` directly ([#9691](https://github.com/PyTorchLightning/pytorch-lightning/pull/9691)) +- Deprecated `LightningDistributed` and move the broadcast logic to `DDPPlugin` and `DDPSpawnPlugin` directly ([#9691](https://github.com/PyTorchLightning/pytorch-lightning/pull/9691)) - Deprecated passing `stochastic_weight_avg` from the `Trainer` constructor in favor of adding the `StochasticWeightAveraging` callback directly to the list of callbacks ([#8989](https://github.com/PyTorchLightning/pytorch-lightning/pull/8989)) @@ -319,12 +369,23 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated Accelerator collective API `barrier`, `broadcast`, and `all_gather`, call `TrainingTypePlugin` collective API directly ([#9677](https://github.com/PyTorchLightning/pytorch-lightning/pull/9677)) +- Deprecated `checkpoint_callback` from the `Trainer` constructor in favour of `enable_checkpointing` ([#9754](https://github.com/PyTorchLightning/pytorch-lightning/pull/9754)) + + - Deprecated the `LightningModule.on_post_move_to_device` method ([#9525](https://github.com/PyTorchLightning/pytorch-lightning/pull/9525)) - Deprecated `pytorch_lightning.core.decorators.parameter_validation` in favor of `pytorch_lightning.utilities.parameter_tying.set_shared_parameters` ([#9525](https://github.com/PyTorchLightning/pytorch-lightning/pull/9525)) +- Deprecated passing `weights_summary` to the `Trainer` constructor in favor of adding the `ModelSummary` callback with `max_depth` directly to the list of callbacks ([#9699](https://github.com/PyTorchLightning/pytorch-lightning/pull/9699)) + + +- Deprecated `log_gpu_memory`, `gpu_metrics`, and util funcs in favor of `DeviceStatsMonitor` callback ([#9921](https://github.com/PyTorchLightning/pytorch-lightning/pull/9921)) + + +- Deprecated `GPUStatsMonitor` and `XLAStatsMonitor` in favor of `DeviceStatsMonitor` callback ([#9924](https://github.com/PyTorchLightning/pytorch-lightning/pull/9924)) + ### Removed - Removed deprecated `metrics` ([#8586](https://github.com/PyTorchLightning/pytorch-lightning/pull/8586/)) @@ -423,9 +484,24 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed `call_configure_sharded_model_hook` property from `Accelerator` and `TrainingTypePlugin` ([#9612](https://github.com/PyTorchLightning/pytorch-lightning/pull/9612)) +- Removed deprecated trainer flag `Trainer.distributed_backend` in favor of `Trainer.accelerator` ([#9246](https://github.com/PyTorchLightning/pytorch-lightning/pull/9246)) + + - Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/PyTorchLightning/pytorch-lightning/pull/9495)) +- Removed a redundant warning with `ModelCheckpoint(monitor=None)` callback ([#9875](https://github.com/PyTorchLightning/pytorch-lightning/pull/9875)) + + +- Removed `epoch` from `trainer.logged_metrics` ([#9904](https://github.com/PyTorchLightning/pytorch-lightning/pull/9904)) + + +- Removed `should_rank_save_checkpoint` property from Trainer ([#9433](https://github.com/PyTorchLightning/pytorch-lightning/pull/9433)) + + ### Fixed @@ -450,6 +526,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `BasePredictionWriter` not returning the batch_indices in a non-distributed setting ([#9432](https://github.com/PyTorchLightning/pytorch-lightning/pull/9432)) +- Fixed an error when running in XLA environments with no TPU attached ([#9572](https://github.com/PyTorchLightning/pytorch-lightning/pull/9572)) + + - Fixed check on torchmetrics logged whose `compute()` output is a multielement tensor ([#9582](https://github.com/PyTorchLightning/pytorch-lightning/pull/9582)) @@ -468,17 +547,35 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `broadcast` in `DDPPlugin` and ``DDPSpawnPlugin` to respect the `src` input ([#9691](https://github.com/PyTorchLightning/pytorch-lightning/pull/9691)) +- Fixed `self.log(on_epoch=True)` for the `on_batch_start` and `on_train_batch_start` hooks ([#9780](https://github.com/PyTorchLightning/pytorch-lightning/pull/9780)) + + - Fixed restoring training state during `trainer.fit` only ([#9413](https://github.com/PyTorchLightning/pytorch-lightning/pull/9413)) - Fixed DeepSpeed and Lightning both calling the scheduler ([#9788](https://github.com/PyTorchLightning/pytorch-lightning/pull/9788)) + - Fixed missing arguments when saving hyperparameters from the parent class but not from the child class ([#9800](https://github.com/PyTorchLightning/pytorch-lightning/pull/9800)) +- Fixed DeepSpeed GPU device IDs ([#9847](https://github.com/PyTorchLightning/pytorch-lightning/pull/9847)) + + - Reset `val_dataloader` in `tuner/batch_size_scaling` ([#9857](https://github.com/PyTorchLightning/pytorch-lightning/pull/9857)) +- Fixed use of `LightningCLI` in computer_vision_fine_tuning.py example ([#9934](https://github.com/PyTorchLightning/pytorch-lightning/pull/9934)) + + +- Fixed issue with non-init dataclass fields in `apply_to_collection` ([#9963](https://github.com/PyTorchLightning/pytorch-lightning/issues/9963)) + +- Reset `val_dataloader` in `tuner/batch_size_scaling` for binsearch ([#9975](https://github.com/PyTorchLightning/pytorch-lightning/pull/9975)) + + +- Fixed logic to check for spawn in dataloader `TrainerDataLoadingMixin._worker_check` ([#9902](https://github.com/PyTorchLightning/pytorch-lightning/pull/9902)) + + ## [1.4.9] - 2021-09-30 - Fixed `lr_find` to generate same results on multiple calls ([#9704](https://github.com/PyTorchLightning/pytorch-lightning/pull/9704)) diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py index e9442dd26e65b..2144be39394cb 100644 --- a/benchmarks/test_basic_parity.py +++ b/benchmarks/test_basic_parity.py @@ -159,7 +159,7 @@ def lightning_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10): # as the first run is skipped, no need to run it long max_epochs=num_epochs if idx > 0 else 1, enable_progress_bar=False, - weights_summary=None, + enable_model_summary=False, gpus=1 if device_type == "cuda" else 0, checkpoint_callback=False, logger=False, diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index b6bcb658dcde9..ade0a055d27c2 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -137,7 +137,7 @@ def plugin_parity_test( ddp_model = model_cls() use_cuda = gpus > 0 - trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, accelerator="ddp_spawn") + trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, strategy="ddp_spawn") max_memory_ddp, ddp_time = record_ddp_fit_model_stats(trainer=trainer, model=ddp_model, use_cuda=use_cuda) @@ -145,7 +145,7 @@ def plugin_parity_test( seed_everything(seed) custom_plugin_model = model_cls() - trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, accelerator="ddp_sharded_spawn") + trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, strategy="ddp_sharded_spawn") assert isinstance(trainer.training_type_plugin, DDPSpawnShardedPlugin) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 
4a3b9728221a7..55454e7cac0a2 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -35,7 +35,8 @@ local tputests = base.BaseTest { coverage run --source=pytorch_lightning -m pytest -v --capture=no \ tests/profiler/test_xla_profiler.py \ pytorch_lightning/utilities/xla_device.py \ - tests/accelerators/test_tpu_backend.py \ + tests/accelerators/test_tpu.py \ + tests/callbacks/test_device_stats_monitor.py \ tests/models/test_tpu.py test_exit_code=$? echo "\n||| END PYTEST LOGS |||\n" diff --git a/docs/source/advanced/multi_gpu.rst b/docs/source/advanced/multi_gpu.rst index ee689e16112c1..653906d4fb68b 100644 --- a/docs/source/advanced/multi_gpu.rst +++ b/docs/source/advanced/multi_gpu.rst @@ -611,28 +611,34 @@ Let's say you have a batch size of 7 in your dataloader. def train_dataloader(self): return Dataset(..., batch_size=7) -In DDP or Horovod your effective batch size will be 7 * gpus * num_nodes. +In DDP, DDP_SPAWN, Deepspeed, DDP_SHARDED, or Horovod, your effective batch size will be 7 * gpus * num_nodes. .. code-block:: python # effective batch size = 7 * 8 Trainer(gpus=8, accelerator="ddp") + Trainer(gpus=8, accelerator="ddp_spawn") + Trainer(gpus=8, accelerator="ddp_sharded") Trainer(gpus=8, accelerator="horovod") # effective batch size = 7 * 8 * 10 Trainer(gpus=8, num_nodes=10, accelerator="ddp") + Trainer(gpus=8, num_nodes=10, accelerator="ddp_spawn") + Trainer(gpus=8, num_nodes=10, accelerator="ddp_sharded") Trainer(gpus=8, num_nodes=10, accelerator="horovod") -In DDP2, your effective batch size will be 7 * num_nodes. +In DDP2 or DP, your effective batch size will be 7 * num_nodes. The reason is that the full batch is visible to all GPUs on the node when using DDP2. .. code-block:: python # effective batch size = 7 Trainer(gpus=8, accelerator="ddp2") + Trainer(gpus=8, accelerator="dp") # effective batch size = 7 * 10 Trainer(gpus=8, num_nodes=10, accelerator="ddp2") + Trainer(gpus=8, num_nodes=10, accelerator="dp") .. note:: Huge batch sizes are actually really bad for convergence. Check out: diff --git a/docs/source/advanced/sequences.rst b/docs/source/advanced/sequences.rst index 8e50de49933eb..2d8d770cbb850 100644 --- a/docs/source/advanced/sequences.rst +++ b/docs/source/advanced/sequences.rst @@ -1,6 +1,6 @@ Sequential Data -================ +=============== Truncated Backpropagation Through Time -------------------------------------- diff --git a/docs/source/api_references.rst b/docs/source/api_references.rst index df70b2b0a3944..7bc4d8b460e8d 100644 --- a/docs/source/api_references.rst +++ b/docs/source/api_references.rst @@ -67,6 +67,71 @@ Loggers API test_tube wandb +Loop API +-------- + +Base Classes +^^^^^^^^^^^^ + +.. currentmodule:: pytorch_lightning.loops + +.. autosummary:: + :toctree: api + :nosignatures: + :template: classtemplate.rst + + ~base.Loop + ~dataloader.dataloader_loop.DataLoaderLoop + + +Default Loop Implementations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Training +"""""""" + +.. currentmodule:: pytorch_lightning.loops + +.. autosummary:: + :toctree: api + :nosignatures: + :template: classtemplate.rst + + FitLoop + ~epoch.TrainingEpochLoop + ~batch.TrainingBatchLoop + ~optimization.OptimizerLoop + ~optimization.ManualOptimization + + +Validation and Testing +"""""""""""""""""""""" + +.. currentmodule:: pytorch_lightning.loops + +.. autosummary:: + :toctree: api + :nosignatures: + :template: classtemplate.rst + + ~dataloader.EvaluationLoop + ~epoch.EvaluationEpochLoop + + +Prediction +"""""""""" + +..
currentmodule:: pytorch_lightning.loops + +.. autosummary:: + :toctree: api + :nosignatures: + :template: classtemplate.rst + + ~dataloader.PredictionLoop + ~epoch.PredictionEpochLoop + + Plugins API ----------- diff --git a/docs/source/clouds/cluster.rst b/docs/source/clouds/cluster.rst index c7a8b71f26d0b..f75a735bb809f 100644 --- a/docs/source/clouds/cluster.rst +++ b/docs/source/clouds/cluster.rst @@ -11,11 +11,13 @@ In this guide, we cover 1. General purpose cluster (not managed) -2. SLURM cluster +2. Using `Torch Distributed Run `__ -3. Custom cluster environment +3. SLURM cluster -4. General tips for multi-node training +4. Custom cluster environment + +5. General tips for multi-node training -------- @@ -39,6 +41,7 @@ PyTorch Lightning follows the design of `PyTorch distributed communication packa - *WORLD_SIZE* - required; how many nodes are in the cluster - *NODE_RANK* - required; id of the node in the cluster +.. _training_script_setup: Training script setup --------------------- @@ -66,12 +69,45 @@ This means that you need to: 3. Run the script on each node. --------- +---------- + +.. _torch_distributed_run: + +2. Torch Distributed Run +======================== + +`Torch Distributed Run `__ provides helper functions to set up distributed environment variables from the `PyTorch distributed communication package `__ that need to be defined on each node. + +Once the script is set up as described in :ref:`training_script_setup`, you can run the command below across your nodes to start multi-node training. + +Like a custom cluster, you have to ensure that there is network connectivity between the nodes with firewall rules that allow traffic flow on a specified *MASTER_PORT*. + +Finally, you'll need to decide which node you'd like to be the master node (*MASTER_ADDR*), and the ranks of each node (*NODE_RANK*). + +For example: + +* *MASTER_ADDR* 10.10.10.16 +* *MASTER_PORT* 29500 +* *NODE_RANK* 0 for the first node, 1 for the second node + +Run the command below with the appropriate variables set on each node. + +.. code-block:: bash + + python -m torch.distributed.run + --nnodes=2 # number of nodes you'd like to run with + --master_addr <MASTER_ADDR> + --master_port <MASTER_PORT> + --node_rank <NODE_RANK> + train.py (--arg1 ... train script args...) + +.. note:: + ``torch.distributed.run`` assumes that you'd like to spawn a process per GPU if GPU devices are found on the node. This can be adjusted with ``--nproc_per_node``. .. _slurm: -2. SLURM managed cluster +3. SLURM managed cluster ======================== Lightning automates the details behind training on a SLURM-powered cluster. In contrast to the general purpose @@ -239,7 +275,7 @@ The other option is that you generate scripts on your own via a bash command or .. _custom-cluster: -3. Custom cluster +4. Custom cluster ================= Lightning provides an interface for providing your own definition of a cluster environment. It mainly consists of @@ -282,7 +318,7 @@ and node rank (node id). Here is an example of a custom ---------- -4. General tips for multi-node training +5.
General tips for multi-node training ======================================= Debugging flags diff --git a/docs/source/common/debugging.rst b/docs/source/common/debugging.rst index 7a11863c0e1bf..6e5a721dd092a 100644 --- a/docs/source/common/debugging.rst +++ b/docs/source/common/debugging.rst @@ -95,11 +95,14 @@ Print a summary of your LightningModule --------------------------------------- Whenever the ``.fit()`` function gets called, the Trainer will print the weights summary for the LightningModule. By default it only prints the top-level modules. If you want to show all submodules in your network, use the -`'full'` option: +``max_depth`` option: .. testcode:: - trainer = Trainer(weights_summary="full") + from pytorch_lightning.callbacks import ModelSummary + + trainer = Trainer(callbacks=[ModelSummary(max_depth=-1)]) + You can also display the intermediate input- and output sizes of all your layers by setting the ``example_input_array`` attribute in your LightningModule. It will print a table like this @@ -115,8 +118,9 @@ You can also display the intermediate input- and output sizes of all your layers when you call ``.fit()`` on the Trainer. This can help you find bugs in the composition of your layers. See Also: - - :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_summary` Trainer argument - - :class:`~pytorch_lightning.core.memory.ModelSummary` + - :class:`~pytorch_lightning.callbacks.model_summary.ModelSummary` + - :func:`~pytorch_lightning.utilities.model_summary.summarize` + - :class:`~pytorch_lightning.utilities.model_summary.ModelSummary` ---------------- diff --git a/docs/source/common/hyperparameters.rst b/docs/source/common/hyperparameters.rst index 1781a26a9189f..41a99e022ae95 100644 --- a/docs/source/common/hyperparameters.rst +++ b/docs/source/common/hyperparameters.rst @@ -201,7 +201,7 @@ To recap, add ALL possible trainer flags to the argparser and init the ``Trainer trainer = Trainer.from_argparse_args(hparams) # or if you need to pass in callbacks - trainer = Trainer.from_argparse_args(hparams, checkpoint_callback=..., callbacks=[...]) + trainer = Trainer.from_argparse_args(hparams, enable_checkpointing=..., callbacks=[...]) ---------- diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index ba2694286739e..6ee0ebe7b1110 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -1195,6 +1195,7 @@ for more information. on_after_backward() on_before_optimizer_step() + configure_gradient_clipping() optimizer_step() on_train_batch_end() @@ -1452,6 +1453,12 @@ on_before_optimizer_step .. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_before_optimizer_step :noindex: +configure_gradient_clipping +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.configure_gradient_clipping + :noindex: + optimizer_step ~~~~~~~~~~~~~~ diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 39a583d9c94d8..0405b9a4365af 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -69,7 +69,7 @@ Here is a minimal example of manual optimization. Gradient accumulation --------------------- You can accumulate gradients over batches similarly to -:attr:`~pytorch_lightning.trainer.Trainer.accumulate_grad_batches` of automatic optimization. +:attr:`~pytorch_lightning.trainer.trainer.Trainer.accumulate_grad_batches` of automatic optimization. 
To perform gradient accumulation with one optimizer, you can do as such. .. testcode:: python @@ -516,3 +516,47 @@ to perform a step, Lightning won't be able to support accelerators and precision ): optimizer = optimizer.optimizer optimizer.step(closure=optimizer_closure) + +----- + +Configure gradient clipping +--------------------------- +To configure custom gradient clipping, consider overriding +the :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_gradient_clipping` method. +Attributes :attr:`~pytorch_lightning.trainer.trainer.Trainer.gradient_clip_val` and +:attr:`~pytorch_lightning.trainer.trainer.Trainer.gradient_clip_algorithm` will be passed in the respective +arguments here and Lightning will handle gradient clipping for you. In case you want to set +different values for your arguments of your choice and let Lightning handle the gradient clipping, you can +use the inbuilt :meth:`~pytorch_lightning.core.lightning.LightningModule.clip_gradients` method and pass +the arguments along with your optimizer. + +.. note:: + Make sure to not override :meth:`~pytorch_lightning.core.lightning.LightningModule.clip_gradients` + method. If you want to customize gradient clipping, consider using + :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_gradient_clipping` method. + +For example, here we will apply gradient clipping only to the gradients associated with optimizer A. + +.. testcode:: python + + def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_val, gradient_clip_algorithm): + if optimizer_idx == 0: + # Lightning will handle the gradient clipping + self.clip_gradients( + optimizer, gradient_clip_val=gradient_clip_val, gradient_clip_algorithm=gradient_clip_algorithm + ) + +Here we configure gradient clipping differently for optimizer B. + +.. testcode:: python + + def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_val, gradient_clip_algorithm): + if optimizer_idx == 0: + # Lightning will handle the gradient clipping + self.clip_gradients( + optimizer, gradient_clip_val=gradient_clip_val, gradient_clip_algorithm=gradient_clip_algorithm + ) + elif optimizer_idx == 1: + self.clip_gradients( + optimizer, gradient_clip_val=gradient_clip_val * 2, gradient_clip_algorithm=gradient_clip_algorithm + ) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index e8f78864b1ddf..f8d815432a41c 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -216,7 +216,7 @@ accelerator | -The accelerator backend to use (previously known as distributed_backend). +The accelerator backend to use: - (``'dp'``) is DataParallel (split batch among GPUs of same machine) - (``'ddp'``) is DistributedDataParallel (each gpu on each node trains, and syncs grads) @@ -528,6 +528,34 @@ Example:: checkpoint_callback ^^^^^^^^^^^^^^^^^^^ +Deprecated: This has been deprecated in v1.5 and will be removed in v1.7. Please use ``enable_checkpointing`` instead. + +default_root_dir +^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Default path for logs and weights when no logger or +:class:`pytorch_lightning.callbacks.ModelCheckpoint` callback passed. On +certain clusters you might want to separate where logs and checkpoints are +stored. If you don't then use this argument for convenience. Paths can be local +paths or remote paths such as `s3://bucket/path` or 'hdfs://path/'. Credentials +will need to be set up to use remote filepaths. + +.. 
testcode:: + + # default used by the Trainer + trainer = Trainer(default_root_dir=os.getcwd()) + +enable_checkpointing +^^^^^^^^^^^^^^^^^^^^ + .. raw:: html
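As a minimal usage sketch (illustrative only, not part of the diff, assuming PyTorch Lightning 1.5), the renamed Trainer arguments touched above fit together like this; the values shown are placeholders:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import DeviceStatsMonitor, ModelSummary

    # strategy replaces accelerator="ddp"/"dp" for selecting the distributed mode,
    # enable_checkpointing replaces the deprecated checkpoint_callback flag,
    # ModelSummary(max_depth=...) replaces weights_summary,
    # detect_anomaly replaces the deprecated terminate_on_nan flag, and
    # DeviceStatsMonitor replaces GPUStatsMonitor / XLAStatsMonitor.
    trainer = Trainer(
        gpus=2,
        strategy="ddp",
        enable_checkpointing=True,
        detect_anomaly=True,
        callbacks=[DeviceStatsMonitor(), ModelSummary(max_depth=-1)],
    )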