From f7175c488935bef766255deb8c8d9bd42afa6e86 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:42:05 +0200 Subject: [PATCH 001/167] Add hpu accelerator support Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 50 ++++++++++++ .../simple_mnist/ops_bf16_mnist.txt | 2 + .../simple_mnist/ops_fp32_mnist.txt | 1 + pytorch_lightning/accelerators/__init__.py | 1 + pytorch_lightning/accelerators/accelerator.py | 1 + pytorch_lightning/accelerators/hpu.py | 63 +++++++++++++++ pytorch_lightning/core/lightning.py | 10 +++ pytorch_lightning/plugins/__init__.py | 4 + .../plugins/precision/hpu_precision.py | 51 ++++++++++++ .../plugins/training_type/__init__.py | 1 + .../plugins/training_type/single_hpu.py | 24 ++++++ pytorch_lightning/strategies/__init__.py | 1 + pytorch_lightning/strategies/ddp.py | 9 ++- pytorch_lightning/strategies/hpu.py | 76 ++++++++++++++++++ .../connectors/accelerator_connector.py | 80 ++++++++++++++++++- pytorch_lightning/trainer/trainer.py | 32 +++++++- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/argparse.py | 2 +- pytorch_lightning/utilities/distributed.py | 12 ++- pytorch_lightning/utilities/enums.py | 1 + pytorch_lightning/utilities/imports.py | 2 + 21 files changed, 415 insertions(+), 9 deletions(-) create mode 100644 pl_examples/hpu_examples/simple_mnist/mnist.py create mode 100644 pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt create mode 100644 pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt create mode 100644 pytorch_lightning/accelerators/hpu.py create mode 100644 pytorch_lightning/plugins/precision/hpu_precision.py create mode 100644 pytorch_lightning/plugins/training_type/single_hpu.py create mode 100644 pytorch_lightning/strategies/hpu.py diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py new file mode 100644 index 0000000000000..a8e864acdd6af --- /dev/null +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -0,0 +1,50 @@ +import os + +import torch +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchvision.datasets import MNIST +from torchvision import transforms +import pytorch_lightning as pl +import sys + +import habana_frameworks.torch.core as htcore + +class MNISTModel(pl.LightningModule): + + def __init__(self): + super(MNISTModel, self).__init__() + self.l1 = torch.nn.Linear(28 * 28, 10) + + def forward(self, x): + return torch.relu(self.l1(x.view(x.size(0), -1))) + + def training_step(self, batch, batch_nb): + x, y = batch + loss = F.cross_entropy(self(x), y) + return loss + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.02) + +# Init our model +mnist_model = MNISTModel() + +# Init DataLoader from MNIST Dataset +train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) +train_loader = DataLoader(train_ds, batch_size=32) + +# TBD: import these keys from hmp +hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] +hmp_params = dict.fromkeys(hmp_keys) +hmp_params["level"] = "O1" +hmp_params["verbose"] = False +hmp_params["bf16_ops"] = "./pytorch-lightning-fork/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" +hmp_params["fp32_ops"] = "./pytorch-lightning-fork/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" + +# Initialize a trainer +trainer = pl.Trainer(hpus=1, max_epochs=1, precision=16, hmp_params=hmp_params) + +# Train the model ⚡ +trainer.fit(mnist_model, 
train_loader) diff --git a/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt b/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt new file mode 100644 index 0000000000000..21dfc7eb22855 --- /dev/null +++ b/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt @@ -0,0 +1,2 @@ +linear +relu \ No newline at end of file diff --git a/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt b/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt new file mode 100644 index 0000000000000..11322c514abd9 --- /dev/null +++ b/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt @@ -0,0 +1 @@ +cross_entropy \ No newline at end of file diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 1c9e0024f39bd..e6fb6e3c84a9f 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -15,3 +15,4 @@ from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401 diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 724b5b6f244c1..617c4bd193ce6 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -28,6 +28,7 @@ class Accelerator(ABC): - GPU - TPU - IPU + - HPU """ def setup_environment(self, root_device: torch.device) -> None: diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py new file mode 100644 index 0000000000000..f91b2afe3273b --- /dev/null +++ b/pytorch_lightning/accelerators/hpu.py @@ -0,0 +1,63 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os +from typing import Any + +import torch + +import pytorch_lightning as pl +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.plugins import DataParallelPlugin +from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin +from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin +from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from typing import Any, Dict, Union + +_log = logging.getLogger(__name__) + + + +class HPUAccelerator(Accelerator): + """ Accelerator for HPU devices. """ + + def setup(self, trainer: "pl.Trainer") -> None: + """ + Raises: + ValueError: + If the precision or training type plugin are unsupported. + """ + if not isinstance(self.precision_plugin, HPUPrecisionPlugin): + # this configuration should have been avoided in the accelerator connector + raise ValueError( + f"The `HPUAccelerator` can only be used with a `HPUPrecisionPlugin`, found: {self.precision_plugin}." 
+ ) + if not isinstance(self.training_type_plugin, (HPUPlugin, DDPPlugin)): + raise ValueError( + "The `HPUAccelerator` can only be used with a `HPUPlugin` or `DDPPlugin," + f" found {self.training_type_plugin}." + ) + return super().setup(trainer) + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + """HPU device stats aren't supported yet.""" + return {} + + @staticmethod + def auto_device_count() -> int: + """Get the devices when set to auto.""" + # TBD: make this configurable + return 8 + \ No newline at end of file diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 908bc9ab9056e..b8592fb974665 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -223,6 +223,14 @@ def on_gpu(self): """ return self.device.type == "cuda" + @property + def on_hpu(self): + """ + True if your model is currently running on HPUs. + Useful to set flags around the LightningModule for different CPU vs GPU vs HPU behavior. + """ + return self.device.type == "hpu" + @property def automatic_optimization(self) -> bool: """If set to ``False`` you are responsible for calling ``.backward()``, ``.step()``, ``.zero_grad()``.""" @@ -1537,6 +1545,7 @@ def optimizer_step( optimizer_idx: int = 0, optimizer_closure: Optional[Callable[[], Any]] = None, on_tpu: bool = False, + on_hpu: bool = None, using_native_amp: bool = False, using_lbfgs: bool = False, ) -> None: @@ -1555,6 +1564,7 @@ def optimizer_step( optimizer_closure: Closure for all optimizers. This closure must be executed as it includes the calls to ``training_step()``, ``optimizer.zero_grad()``, and ``backward()``. on_tpu: ``True`` if TPU backward is required + on_hpu: ``True`` if HPU backward is required using_native_amp: ``True`` if using native amp using_lbfgs: True if the matching optimizer is :class:`torch.optim.LBFGS` diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 02f8493d7b97c..5a75407465042 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin +from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin @@ -22,6 +23,7 @@ from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.plugins.training_type.ipu import IPUPlugin +from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin @@ -50,6 +52,8 @@ "HorovodPlugin", "IPUPlugin", "IPUPrecisionPlugin", + "HPUPlugin", + "HPUPrecisionPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", diff --git 
a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py new file mode 100644 index 0000000000000..e7130beff9740 --- /dev/null +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -0,0 +1,51 @@ +# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# + +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Any, List, Tuple + +import torch.nn as nn +from torch.optim import Optimizer + +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin + +from habana_frameworks.torch.hpex import hmp + +class HPUPrecisionPlugin(PrecisionPlugin): + """Plugin that enables bfloats/floats on HPUs""" + + def __init__(self, precision: int, hmp_params :[]) -> None: + super().__init__() + self.precision = precision + if hmp_params is not None: + hmp_opt_level = hmp_params["level"] + hmp_bf16 = hmp_params["bf16_ops"] + hmp_fp32 = hmp_params["fp32_ops"] + hmp_verbose = hmp_params["verbose"] + hmp.convert(opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, + fp32_file_path=hmp_fp32, isVerbose=hmp_verbose) + + def connect( + self, model: nn.Module, optimizers: List[Optimizer], lr_schedulers: List[Any] + ) -> Tuple[nn.Module, List[Optimizer], List[Any]]: + return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers) diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index f7bee339ef95a..760312df346c5 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -11,5 +11,6 @@ from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 diff --git a/pytorch_lightning/plugins/training_type/single_hpu.py b/pytorch_lightning/plugins/training_type/single_hpu.py new file mode 100644 index 0000000000000..1dae809141a26 --- /dev/null +++ b/pytorch_lightning/plugins/training_type/single_hpu.py @@ -0,0 +1,24 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pytorch_lightning.strategies import HPUStrategy +from pytorch_lightning.utilities import rank_zero_deprecation + + +class HPUPlugin(HPUStrategy): + def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] + rank_zero_deprecation( + "The `pl.plugins.training_type.hpu.HPUPlugin` is deprecated in v1.6 and will be removed in." + " v1.8. Use `pl.strategies.hpu.HPUStrategy` instead." + ) + super().__init__(*args, **kwargs) diff --git a/pytorch_lightning/strategies/__init__.py b/pytorch_lightning/strategies/__init__.py index 205d4acb8c115..90f7b8ce585e7 100644 --- a/pytorch_lightning/strategies/__init__.py +++ b/pytorch_lightning/strategies/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 +from pytorch_lightning.strategies.hpu import HPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 from pytorch_lightning.strategies.sharded import DDPShardedStrategy # noqa: F401 from pytorch_lightning.strategies.sharded_spawn import DDPSpawnShardedStrategy # noqa: F401 diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index d92b12627e6ec..082d5eef6d7c1 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -351,7 +351,7 @@ def configure_ddp(self) -> None: self._register_ddp_hooks() def determine_ddp_device_ids(self): - if self.root_device.type == "cpu": + if self.root_device.type == "cpu" or self.root_device.type == "hpu": return None return [self.root_device.index] @@ -496,6 +496,13 @@ def reconciliate_processes(self, trace: str) -> None: shutil.rmtree(sync_dir) raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") + def on_save(self, checkpoint: Dict[str, Union[Any, torch.Tensor]]) -> Dict[str, Union[Any, torch.Tensor]]: + if self.root_device.type == "hpu" and self.cluster_environment.global_rank() == 0: + from pytorch_lightning.utilities.apply_func import move_data_to_device + return move_data_to_device(checkpoint, torch.device("cpu")) + else: + return checkpoint + def teardown(self) -> None: log.detail(f"{self.__class__.__name__}: tearing down DDP plugin") super().teardown() diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py new file mode 100644 index 0000000000000..4e9e7b97865d5 --- /dev/null +++ b/pytorch_lightning/strategies/hpu.py @@ -0,0 +1,76 @@ +# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# + +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import os +from typing import Any, Dict, Optional + +import pytorch_lightning as pl +from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.strategies.single_device import SingleDeviceStrategy +from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities import _HPU_AVAILABLE, find_shared_parameters, set_shared_parameters +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.types import _PATH + +class HPUStrategy(SingleDeviceStrategy): + + def __init__( + self, + device: int, + checkpoint_io: Optional[CheckpointIO] = None, + debug: bool = False, + ): + + device = torch.device("hpu") + checkpoint_io = checkpoint_io + super().__init__(device, checkpoint_io=checkpoint_io) + + self.debug = debug + + @property + def is_distributed(self) -> bool: + return False + + def setup(self, trainer: "pl.Trainer") -> None: + shared_params = find_shared_parameters(self.model) + self.model_to_device() + if is_overridden("on_post_move_to_device", self.lightning_module): + self.model.on_post_move_to_device() + else: + set_shared_parameters(self.model, shared_params) + + def model_to_device(self) -> None: + self.model.to(self.root_device) + + @property + def on_hpu(self) -> bool: + return True + + def pre_dispatch(self) -> None: + if isinstance(self.device, int): + self.device = torch.device(self.device) + + def on_save(self, checkpoint: dict) -> dict: + return move_data_to_device(checkpoint, torch.device("cpu")) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index fd65975618f02..88a91b0273ae0 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -24,6 +24,7 @@ from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator +from pytorch_lightning.accelerators.hpu import HPUAccelerator from pytorch_lightning.plugins import ( ApexMixedPrecisionPlugin, CheckpointIO, @@ -36,6 +37,7 @@ ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, TPUPrecisionPlugin, + HPUPrecisionPlugin, ) from pytorch_lightning.plugins.environments import ( BaguaEnvironment, @@ -63,6 +65,7 @@ Strategy, StrategyRegistry, TPUSpawnStrategy, + HPUStrategy, ) from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, AMPType, device_parser from pytorch_lightning.utilities.enums import PrecisionType @@ -72,6 +75,7 @@ _IPU_AVAILABLE, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, + _HPU_AVAILABLE, ) from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn @@ -92,6 +96,7 @@ def __init__( strategy: Optional[Union[str, 
Strategy]], gpus, gpu_ids, + hpus, num_nodes, sync_batchnorm, benchmark, @@ -100,6 +105,7 @@ def __init__( precision, amp_type, amp_level, + hmp_params, plugins, ): # initialization @@ -120,6 +126,7 @@ def __init__( self.parallel_device_ids = gpu_ids self.tpu_cores = tpu_cores self.ipus = ipus + self.hpus = hpus self.num_nodes = num_nodes self.sync_batchnorm = sync_batchnorm self.benchmark = benchmark @@ -131,6 +138,7 @@ def __init__( self.precision = precision self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None self.amp_level = amp_level + self.hmp_params = hmp_params self._precision_plugin: Optional[PrecisionPlugin] = None self._strategy: Optional[Strategy] = None @@ -166,6 +174,7 @@ def __init__( self._cluster_environment = self.select_cluster_environment() self.update_device_type_if_ipu_plugin() + self.update_device_type_if_hpu_plugin() self.update_device_type_if_strategy_passed() self._validate_accelerator_type() @@ -202,6 +211,8 @@ def select_accelerator_type(self) -> None: self._accelerator_type = _AcceleratorType.IPU elif self.has_gpu: self._accelerator_type = _AcceleratorType.GPU + elif self.has_hpu: + self._accelerator_type = _AcceleratorType.HPU else: self._set_devices_to_cpu_num_processes() self._accelerator_type = _AcceleratorType.CPU @@ -220,6 +231,11 @@ def select_accelerator_type(self) -> None: msg = "you didn't pass `gpus` to `Trainer`" if torch.cuda.is_available() else "GPUs are not available" raise MisconfigurationException(f"You passed `accelerator='gpu'`, but {msg}.") self._accelerator_type = _AcceleratorType.GPU + elif self.distributed_backend == _AcceleratorType.HPU: + if not self.has_hpu: + msg = "HPUs are not available" if not _HPU_AVAILABLE else "you didn't pass `hpus` to `Trainer`" + raise MisconfigurationException(f"You passed `accelerator='hpu'`, but {msg}.") + self._accelerator_type = _AcceleratorType.HPU elif self.distributed_backend == _AcceleratorType.CPU: self._set_devices_to_cpu_num_processes() self._accelerator_type = _AcceleratorType.CPU @@ -231,7 +247,7 @@ def _validate_accelerator_and_devices(self) -> None: if self.distributed_backend not in self.accelerator_types and self.devices is not None: raise MisconfigurationException( f"You passed `devices={self.devices}` but haven't specified" - " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping," + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu'|'hpu')` for the devices mapping," f" got `accelerator={self.distributed_backend!r}`." 
) @@ -257,6 +273,9 @@ def _warn_if_devices_flag_ignored(self) -> None: elif self.distributed_backend in ("auto", _AcceleratorType.GPU): if self.gpus is not None: rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") + elif self.distributed_backend in ("auto", _AcceleratorType.HPU): + if self.hpus is not None: + rank_zero_warn(f"{devices_warning} `hpus={self.hpus}`") elif self.distributed_backend in ("auto", _AcceleratorType.CPU): if self.num_processes != 1: rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") @@ -270,6 +289,8 @@ def _set_devices_if_none(self) -> None: self.devices = self.ipus elif self._accelerator_type == _AcceleratorType.GPU: self.devices = self.gpus + elif self._accelerator_type == _AcceleratorType.HPU: + self.devices = self.hpus elif self._accelerator_type == _AcceleratorType.CPU: self.devices = self.num_processes @@ -456,6 +477,18 @@ def tpu_id(self) -> Optional[int]: return self.tpu_cores[0] return None + @property + def has_hpu(self) -> bool: + # Here, we are not checking for HPU availability, but instead if User has passed + # `hpus` to Trainer for training. + if self.hpus is not None or isinstance(self._strategy, HPUStrategy): + return True + return self._map_devices_to_accelerator(_AcceleratorType.HPU) + + @property + def use_hpu(self) -> bool: + return self._accelerator_type == _AcceleratorType.HPU and self.has_hpu + @property def has_ipu(self) -> bool: # Here, we are not checking for IPU availability, but instead if User has passed @@ -491,6 +524,11 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool: self.gpus = self.devices self.parallel_device_ids = device_parser.parse_gpu_ids(self.devices) return True + if accelerator == _AcceleratorType.HPU and _HPU_AVAILABLE: + if self.devices == "auto": + self.devices = HPUAccelerator.auto_device_count() + self.hpus = self.devices + return True if accelerator == _AcceleratorType.CPU: if self.devices == "auto": self.devices = CPUAccelerator.auto_device_count() @@ -570,6 +608,14 @@ def num_ipus(self) -> int: return self._strategy.replication_factor return 0 + @property + def num_hpus(self) -> int: + if isinstance(self.hpus, int): + return self.hpus + if isinstance(self._strategy, HPUStrategy): + return self._strategy.replication_factor + return 0 + @property def parallel_devices(self) -> List[Union[torch.device, int]]: if self.use_gpu: @@ -581,6 +627,8 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: devices = list(range(self.tpu_cores)) elif self.use_ipu: devices = list(range(self.num_ipus)) + elif self.use_hpu: + devices = [torch.device("hpu")] * self.num_processes else: devices = [torch.device("cpu")] * self.num_processes return devices @@ -589,7 +637,7 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: def root_gpu(self) -> Optional[int]: return ( self.strategy.root_device.index - if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)) + if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator, HPUAccelerator)) else None ) @@ -639,6 +687,13 @@ def select_precision_plugin(self) -> PrecisionPlugin: ) return TPUBf16PrecisionPlugin() + if self.use_hpu: + if self.precision not in (16, 32): + raise MisconfigurationException( + f"`Trainer(accelerator='hpu', precision={self.precision!r})` is not supported." 
+ ) + return HPUPrecisionPlugin(self.precision, self.hmp_params) + if self._strategy_type == _StrategyType.DEEPSPEED or isinstance(self._strategy, DeepSpeedStrategy): return DeepSpeedPrecisionPlugin(self.precision, self.amp_type, self.amp_level) @@ -752,6 +807,8 @@ def select_strategy(self) -> Strategy: plugin = SingleTPUStrategy(self.tpu_id) elif self.use_ipu: plugin = IPUStrategy(parallel_devices=self.parallel_devices) + elif self.use_hpu: + plugin = HPUStrategy(device=torch.device("hpu")) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu") @@ -794,6 +851,8 @@ def select_accelerator(self) -> Accelerator: acc_cls = TPUAccelerator elif self.use_ipu: acc_cls = IPUAccelerator + elif self.use_hpu: + acc_cls = HPUAccelerator else: acc_cls = CPUAccelerator @@ -832,6 +891,8 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if self.distributed_backend is None: if self.has_horovodrun(): self._set_horovod_backend() + elif self.num_hpus > 1 and not _use_cpu: + self._distrib_type = _StrategyType.DDP elif self.num_gpus == 0 and self.num_nodes > 1: self._strategy_type = _StrategyType.DDP elif self.num_gpus == 0 and self.num_processes > 1: @@ -869,6 +930,8 @@ def set_distributed_mode(self, strategy: Optional[str] = None): self._strategy_type = _StrategyType.TPU_SPAWN elif self.has_ipu and not _use_cpu: self._device_type = _AcceleratorType.IPU + elif self.has_hpu and not _use_cpu: + self._device_type = _AcceleratorType.HPU elif self.distributed_backend and self._strategy_type is None: self._strategy_type = _StrategyType(self.distributed_backend) @@ -877,7 +940,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): _gpu_strategy_types = (_StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, _StrategyType.DDP2) # DP and DDP2 cannot run without GPU - if self.num_gpus == 0 and self._strategy_type in _gpu_strategy_types and not _use_cpu: + if self.num_gpus == 0 and self._strategy_type in _gpu_strategy_types and not _use_cpu and not (self.num_hpus > 1): if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1): if self._strategy_type in (_StrategyType.DP, _StrategyType.DDP2): @@ -902,6 +965,9 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if self._device_type == _AcceleratorType.GPU and self._strategy_type == _StrategyType.DDP2: self.num_processes = self.num_nodes + if self._device_type == _AcceleratorType.HPU and self._strategy_type == _StrategyType.DDP: + self.num_processes = self.num_hpus + # Horovod is an extra case... 
if self.distributed_backend == _StrategyType.HOROVOD: self._set_horovod_backend() @@ -965,6 +1031,10 @@ def update_device_type_if_ipu_plugin(self) -> None: if isinstance(self._strategy, IPUStrategy) and self._device_type != _AcceleratorType.IPU: self._device_type = _AcceleratorType.IPU + def update_device_type_if_hpu_plugin(self) -> None: + if isinstance(self._strategy, HPUStrategy) and self._device_type != _AcceleratorType.HPU: + self._device_type = _AcceleratorType.HPU + def update_device_type_if_strategy_passed(self) -> None: if isinstance(self._strategy_flag, Strategy) or any(isinstance(plug, Strategy) for plug in self.plugins): if self._accelerator_type is not None: @@ -974,6 +1044,8 @@ def update_device_type_if_strategy_passed(self) -> None: self._device_type = _AcceleratorType.TPU elif self.use_gpu: self._device_type = _AcceleratorType.GPU + elif self.use_hpu: + self._device_type = _AcceleratorType.HPU else: if self.has_ipu: self._device_type = _AcceleratorType.IPU @@ -981,6 +1053,8 @@ def update_device_type_if_strategy_passed(self) -> None: self._device_type = _AcceleratorType.TPU elif self.has_gpu: self._device_type = _AcceleratorType.GPU + elif self.has_hpu: + self._device_type = _AcceleratorType.HPU def _set_strategy_type_if_strategy_passed(self): # This is required as when `Strategy` instance is passed to either `strategy` diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f0b56e35e1bf1..5790bc06a6522 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -30,7 +30,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator, IPUAccelerator +from pytorch_lightning.accelerators import Accelerator, IPUAccelerator, HPUAccelerator from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter from pytorch_lightning.core.datamodule import LightningDataModule @@ -79,6 +79,7 @@ _IPU_AVAILABLE, _StrategyType, _TPU_AVAILABLE, + _HPU_AVAILABLE, AMPType, device_parser, GradClipAlgorithmType, @@ -142,6 +143,7 @@ def __init__( devices: Optional[Union[List[int], str, int]] = None, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, + hpus: Optional[int] = None, tpu_cores: Optional[Union[List[int], str, int]] = None, ipus: Optional[int] = None, log_gpu_memory: Optional[str] = None, # TODO: Remove in 1.7 @@ -185,6 +187,7 @@ def __init__( plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]] = None, amp_backend: str = "native", amp_level: Optional[str] = None, + hmp_params:["level", "verbose", "bf16_ops", "fp32_ops"] = None, move_metrics_to_cpu: bool = False, multiple_trainloader_mode: str = "max_size_cycle", stochastic_weight_avg: bool = False, @@ -195,7 +198,7 @@ def __init__( Args: - accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "ipu", "auto") + accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "ipu", "hpu", "auto") as well as custom accelerator instances. .. deprecated:: v1.5 @@ -250,7 +253,7 @@ def __init__( deterministic: If ``True``, sets whether PyTorch operations must use deterministic algorithms. Default: ``False``. - devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`, + devices: Will be mapped to either `gpus`, `tpu_cores`, `hpus`, `num_processes` or `ipus`, based on the accelerator type. 
fast_dev_run: Runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es) @@ -386,6 +389,10 @@ def __init__( ipus: How many IPUs to train on. + hpus: How many HPUs to train on. + + hmp_params: list of habana mixed precision parameters + track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. If using Automatic Mixed Precision (AMP), the gradients will be unscaled before logging them. @@ -443,6 +450,7 @@ def __init__( strategy, gpus, gpu_ids, + hpus, num_nodes, sync_batchnorm, benchmark, @@ -451,6 +459,7 @@ def __init__( precision, amp_backend, amp_level, + hmp_params, plugins, ) self.logger_connector = LoggerConnector(self, log_gpu_memory) @@ -1732,6 +1741,9 @@ def _log_device_info(self) -> None: num_ipus = self.ipus if self.ipus is not None else 0 rank_zero_info(f"IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs") + num_hpus = self.hpus if self.hpus is not None else 0 + rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") + if torch.cuda.is_available() and self._device_type != _AcceleratorType.GPU: rank_zero_warn( "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.", @@ -1754,6 +1766,16 @@ def _log_device_info(self) -> None: " `Trainer(ipus=8)` or script `--ipus=8`." ) + if ( + _HPU_AVAILABLE and self._device_type != _AcceleratorType.HPU + and not isinstance(self.accelerator, HPUAccelerator) + ): + rank_zero_warn( + "HPU available but not used. Set the `hpus` flag in your trainer" + " `Trainer(hpus=8)` or script `--hpus=8`." + ) + + def _on_exception(self) -> None: if not _fault_tolerant_training(): return @@ -2000,6 +2022,10 @@ def ipus(self) -> int: def num_gpus(self) -> int: return self._accelerator_connector.num_gpus + @property + def hpus(self) -> int: + return self._accelerator_connector.num_hpus + @property def devices(self) -> Optional[Union[List[int], str, int]]: return self._accelerator_connector.devices diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 916532d964952..930467e50108d 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -36,6 +36,7 @@ _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, _GROUP_AVAILABLE, _HOROVOD_AVAILABLE, + _HPU_AVAILABLE, _HYDRA_AVAILABLE, _HYDRA_EXPERIMENTAL_AVAILABLE, _IPU_AVAILABLE, diff --git a/pytorch_lightning/utilities/argparse.py b/pytorch_lightning/utilities/argparse.py index 7a33e226bc020..6840b2d874987 100644 --- a/pytorch_lightning/utilities/argparse.py +++ b/pytorch_lightning/utilities/argparse.py @@ -247,7 +247,7 @@ def add_argparse_args( else: use_type = arg_types[0] - if arg == "gpus" or arg == "tpu_cores": + if arg == 'gpus' or arg == 'tpu_cores' or arg == 'hpus': use_type = _gpus_allowed_type # hack for types in (int, float) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 7c8b1162e3cfc..a923323f859a3 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -22,7 +22,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TPU_AVAILABLE +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TPU_AVAILABLE, _HPU_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_debug as new_rank_zero_debug from 
pytorch_lightning.utilities.rank_zero import rank_zero_only # noqa: F401 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation @@ -146,6 +146,9 @@ def forward( gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] + if _HPU_AVAILABLE: + # HPU distributed backend doesn't support int64 tensors + tensor = tensor.int() torch.distributed.all_gather(gathered_tensor, tensor, group=group) gathered_tensor = torch.stack(gathered_tensor, dim=0) @@ -332,6 +335,13 @@ def init_dist_connection( world_size = world_size if world_size is not None else cluster_environment.world_size() os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) + + #TBD: move this to a hpu based ddp plugin + #local rank mapping for device open is needed for hpu devices + if torch_distributed_backend == 'hccl' and _HPU_AVAILABLE: + import habana_frameworks.torch.core.hccl + os.environ["ID"] = str(cluster_environment.local_rank()) + log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index 103fc87ecde1b..1f276ce17d0a3 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -281,6 +281,7 @@ class _AcceleratorType(LightningEnum): GPU = "GPU" IPU = "IPU" TPU = "TPU" + HPU = 'HPU' class _FaultTolerantMode(LightningEnum): diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 2d335450d02e6..9c9703dc6e3c7 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -129,6 +129,8 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: else: _IPU_AVAILABLE = False +from habana_frameworks.torch.utils.library_loader import is_habana_avaialble +_HPU_AVAILABLE = is_habana_avaialble() # experimental feature within PyTorch Lightning. 
def _fault_tolerant_training() -> bool: From 7fb871b398cf6f42a6620afdd1aceeb06ffceff9 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:43:38 +0200 Subject: [PATCH 002/167] Update strategy for optimizer usage Signed-off-by: Jerome --- pl_examples/hpu_examples/simple_mnist/mnist.py | 6 +++--- pytorch_lightning/accelerators/hpu.py | 18 ------------------ pytorch_lightning/strategies/hpu.py | 12 +++++++----- .../connectors/accelerator_connector.py | 2 +- pytorch_lightning/trainer/trainer.py | 2 +- 5 files changed, 12 insertions(+), 28 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index a8e864acdd6af..23ffde1c6ebe8 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -40,11 +40,11 @@ def configure_optimizers(self): hmp_params = dict.fromkeys(hmp_keys) hmp_params["level"] = "O1" hmp_params["verbose"] = False -hmp_params["bf16_ops"] = "./pytorch-lightning-fork/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" -hmp_params["fp32_ops"] = "./pytorch-lightning-fork/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" +hmp_params["bf16_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" +hmp_params["fp32_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" # Initialize a trainer -trainer = pl.Trainer(hpus=1, max_epochs=1, precision=16, hmp_params=hmp_params) +trainer = pl.Trainer(devices=1, max_epochs=1, precision=32, hmp_params=hmp_params, default_root_dir='/tmp/', accelerator="hpu") # Train the model ⚡ trainer.fit(mnist_model, train_loader) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index f91b2afe3273b..5307f99d496fc 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -33,24 +33,6 @@ class HPUAccelerator(Accelerator): """ Accelerator for HPU devices. """ - def setup(self, trainer: "pl.Trainer") -> None: - """ - Raises: - ValueError: - If the precision or training type plugin are unsupported. - """ - if not isinstance(self.precision_plugin, HPUPrecisionPlugin): - # this configuration should have been avoided in the accelerator connector - raise ValueError( - f"The `HPUAccelerator` can only be used with a `HPUPrecisionPlugin`, found: {self.precision_plugin}." - ) - if not isinstance(self.training_type_plugin, (HPUPlugin, DDPPlugin)): - raise ValueError( - "The `HPUAccelerator` can only be used with a `HPUPlugin` or `DDPPlugin," - f" found {self.training_type_plugin}." 
- ) - return super().setup(trainer) - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """HPU device stats aren't supported yet.""" return {} diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 4e9e7b97865d5..13bee2964fc0f 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -54,12 +54,14 @@ def is_distributed(self) -> bool: return False def setup(self, trainer: "pl.Trainer") -> None: - shared_params = find_shared_parameters(self.model) self.model_to_device() - if is_overridden("on_post_move_to_device", self.lightning_module): - self.model.on_post_move_to_device() - else: - set_shared_parameters(self.model, shared_params) + super().setup(trainer) + + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + super().setup_optimizers(trainer) + + if len(self.optimizers) > 1: + raise MisconfigurationException("IPUs currently only support one optimizer.") def model_to_device(self) -> None: self.model.to(self.root_device) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 88a91b0273ae0..ec654a98ef6cf 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -59,13 +59,13 @@ DDPStrategy, DeepSpeedStrategy, HorovodStrategy, + HPUStrategy, IPUStrategy, SingleDeviceStrategy, SingleTPUStrategy, Strategy, StrategyRegistry, TPUSpawnStrategy, - HPUStrategy, ) from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, AMPType, device_parser from pytorch_lightning.utilities.enums import PrecisionType diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5790bc06a6522..33ef1f237177d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -330,7 +330,7 @@ def __init__( plugins: Plugins allow modification of core behavior like ddp and amp, and enable custom lightning plugins. precision: Double precision (64), full precision (32), half precision (16) or bfloat16 precision (bf16). - Can be used on CPU, GPU, TPUs or IPUs. + Can be used on CPU, GPU, TPUs, HPUs or IPUs. max_epochs: Stop training once this number of epochs is reached. Disabled by default (None). If both max_epochs and max_steps are not specified, defaults to ``max_epochs = 1000``. 
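For reference, a minimal usage sketch of the Trainer interface these first two patches expose. This assumes the Habana `habana_frameworks` package and at least one HPU device are available; `model` and `train_loader` stand in for the `MNISTModel` and `DataLoader` built in the example script above, and the two op-list paths are placeholders for the bf16/fp32 text files shipped with that example.

    import pytorch_lightning as pl

    # Habana mixed precision (hmp) settings, mirroring the example script above;
    # the two text files list which ops run in bf16 and which stay in fp32.
    hmp_params = {
        "level": "O1",
        "verbose": False,
        "bf16_ops": "ops_bf16_mnist.txt",  # placeholder path
        "fp32_ops": "ops_fp32_mnist.txt",  # placeholder path
    }

    # After patch 2 the device is selected via accelerator="hpu" plus devices,
    # instead of the hpus=... shortcut from patch 1; precision may be 16 or 32
    # on HPU, per the check added in the accelerator connector.
    trainer = pl.Trainer(accelerator="hpu", devices=1, precision=16,
                         hmp_params=hmp_params, max_epochs=1)
    trainer.fit(model, train_loader)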
From a1a1ca9261d570f6bcc3038068ba075919601112 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:44:23 +0200 Subject: [PATCH 003/167] Add checkpointing support Signed-off-by: Jerome --- pytorch_lightning/plugins/__init__.py | 1 + pytorch_lightning/plugins/io/__init__.py | 1 + pytorch_lightning/plugins/io/hpu_io_plugin.py | 73 +++++++++++++++++++ pytorch_lightning/strategies/hpu.py | 6 +- 4 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 pytorch_lightning/plugins/io/hpu_io_plugin.py diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 5a75407465042..33b670d46a44c 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -4,6 +4,7 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO +from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin diff --git a/pytorch_lightning/plugins/io/__init__.py b/pytorch_lightning/plugins/io/__init__.py index 1b14eee6ec4f2..b12f04d86515c 100644 --- a/pytorch_lightning/plugins/io/__init__.py +++ b/pytorch_lightning/plugins/io/__init__.py @@ -14,3 +14,4 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO # noqa: F401 from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO # noqa: F401 from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO # noqa: F401 +from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO # noqa: F401 \ No newline at end of file diff --git a/pytorch_lightning/plugins/io/hpu_io_plugin.py b/pytorch_lightning/plugins/io/hpu_io_plugin.py new file mode 100644 index 0000000000000..39a1fa376c190 --- /dev/null +++ b/pytorch_lightning/plugins/io/hpu_io_plugin.py @@ -0,0 +1,73 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import torch +from typing import Any, Callable, Dict, Optional + +import pytorch_lightning as pl +from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO +from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.cloud_io import get_filesystem +from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem + + +class HPUCheckpointIO(TorchCheckpointIO): + + def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None: + fs = get_filesystem(path) + fs.makedirs(os.path.dirname(path), exist_ok=True) + + if _HPU_AVAILABLE: + from pytorch_lightning.utilities.apply_func import move_data_to_device + checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) + # write the checkpoint dictionary on the file + atomic_save(checkpoint, path) + + def load_checkpoint( + self, path: _PATH, map_location: Optional[Callable] = lambda storage, loc: storage + ) -> Dict[str, Any]: + """Loads checkpoint using :func:`torch.load`, with additional handling for ``fsspec`` remote loading of + files. + + Args: + path: Path to checkpoint + map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage + locations. + + Returns: The loaded checkpoint. + + Raises: + FileNotFoundError: If ``path`` is not found by the ``fsspec`` filesystem + """ + + # Try to read the checkpoint at `path`. If not exist, do not restore checkpoint. + fs = get_filesystem(path) + if not fs.exists(path): + raise FileNotFoundError(f"Checkpoint at {path} not found. Aborting training.") + + return pl_load(path, map_location=map_location) + + def remove_checkpoint(self, path: _PATH) -> None: + """Remove checkpoint file from the filesystem. 
+ + Args: + path: Path to checkpoint + """ + fs = get_filesystem(path) + if fs.exists(path): + fs.rm(path, recursive=True) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 13bee2964fc0f..2217b8508418b 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -26,7 +26,7 @@ from typing import Any, Dict, Optional import pytorch_lightning as pl -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities import _HPU_AVAILABLE, find_shared_parameters, set_shared_parameters @@ -39,12 +39,12 @@ class HPUStrategy(SingleDeviceStrategy): def __init__( self, device: int, - checkpoint_io: Optional[CheckpointIO] = None, + checkpoint_io: Optional[HPUCheckpointIO] = None, debug: bool = False, ): device = torch.device("hpu") - checkpoint_io = checkpoint_io + checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__(device, checkpoint_io=checkpoint_io) self.debug = debug From 9a6da437be1245549c9801b39fa3f38e3d520a44 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:45:16 +0200 Subject: [PATCH 004/167] Fix distributed support with hpu Signed-off-by: Jerome --- .../overrides/torch_distributed.py | 189 ++++++++++++++++++ pytorch_lightning/strategies/ddp.py | 10 +- .../connectors/accelerator_connector.py | 3 +- 3 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 pytorch_lightning/overrides/torch_distributed.py diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py new file mode 100644 index 0000000000000..e7f094f341701 --- /dev/null +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -0,0 +1,189 @@ +import os +import io +import logging +import os +import pickle + +import torch +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.utilities import _HPU_AVAILABLE +from torch._C._distributed_c10d import ( + ProcessGroup, +) + +_HCCL_AVAILABLE = True + +_pickler = pickle.Pickler +_unpickler = pickle.Unpickler + + +logger = logging.getLogger(__name__) + +if torch.distributed.is_available(): + from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember + +# The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py` + +# https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L256 +def _rank_not_in_group(group: ProcessGroup): + """ + Helper that checks if the current process's rank is not in a given group. + """ + if group is None: + return False + return group == GroupMember.NON_GROUP_MEMBER + +#Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1518 +def _object_to_tensor(obj): + f = io.BytesIO() + _pickler(f).dump(obj) + byte_storage = torch.ByteStorage.from_buffer(f.getvalue()) # type: ignore[attr-defined] + # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype. + # Otherwise, it will casue 100X slowdown. 
+ # See: https://github.com/pytorch/pytorch/issues/65696 + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size + +#Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1530 +def _tensor_to_object(tensor, tensor_size): + buf = tensor.numpy().tobytes()[:tensor_size] + return _unpickler(io.BytesIO(buf)).load() + + +#Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1729 +def _broadcast_object_list(object_list, src=0, group=None, device=None): + """ + Broadcasts picklable objects in ``object_list`` to the whole group. Similar + to :func:`broadcast`, but Python objects can be passed in. + Note that all objects in ``object_list`` must be picklable in order to be + broadcasted. + + Args: + object_list (List[Any]): List of input objects to broadcast. + Each object must be picklable. Only objects on the ``src`` rank will + be broadcast, but each rank must provide lists of equal sizes. + src (int): Source rank from which to broadcast ``object_list``. + group: (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Default is ``None``. + device (``torch.device``, optional): If not None, the objects are + serialized and converted to tensors which are moved to the + ``device`` before broadcasting. Default is ``None``. + + Returns: + ``None``. If rank is part of the group, ``object_list`` will contain the + broadcasted objects from ``src`` rank. + + .. note:: For NCCL-based processed groups, internal tensor representations + of objects must be moved to the GPU device before communication takes + place. In this case, the device used is given by + ``torch.cuda.current_device()`` and it is the user's responsiblity to + ensure that this is set so that each rank has an individual GPU, via + ``torch.cuda.set_device()``. + + .. note:: Note that this API differs slightly from the :func:`all_gather` + collective since it does not provide an ``async_op`` handle and thus + will be a blocking call. + + .. warning:: + :func:`broadcast_object_list` uses ``pickle`` module implicitly, which + is known to be insecure. It is possible to construct malicious pickle + data which will execute arbitrary code during unpickling. Only call this + function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> if dist.get_rank() == 0: + >>> # Assumes world_size of 3. + >>> objects = ["foo", 12, {1: 2}] # any picklable object + >>> else: + >>> objects = [None, None, None] + >>> # Assumes backend is not NCCL + >>> device = torch.device("cpu") + >>> dist.broadcast_object_list(objects, src=0, device=device) + >>> broadcast_objects + ['foo', 12, {1: 2}] + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long) + + # Current device selection. + # To preserve backwards compatibility, ``device`` is default to ``None`` + # in which case we run current logic of device selection, i.e. + # ``current_device`` is CUDA if backend is NCCL otherwise CPU device. 
In the + # case it is not ``None`` we move the size and object tensors to be + # broadcasted to this device. + group_backend = get_backend(group) + is_nccl_backend = group_backend == Backend.NCCL + dist_backend = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND") + is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) + current_device = None + if device is not None: + if is_nccl_backend and device.type != "cuda": + raise ValueError("device type must be cuda for nccl backend") + current_device = device + else: + current_device = torch.device("cpu") + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in + # docstring. We cannot simply use my_rank since rank == device is + # not necessarily true. + current_device = torch.device("cuda", torch.cuda.current_device()) + if is_nccl_backend: + object_sizes_tensor = object_sizes_tensor.to(current_device) + + elif is_hpu_backend: + current_device = torch.device("hpu") + #Workaround: HPU doesn't not support long tensors for collectives + object_sizes_tensor = object_sizes_tensor.int() + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.empty( + torch.sum(object_sizes_tensor).int().item(), # type: ignore[arg-type] + dtype=torch.uint8, + ) + + if is_nccl_backend: + object_tensor = object_tensor.to(current_device) + elif is_hpu_backend: + object_tensor = object_tensor.to(current_device) + + broadcast(object_tensor, src=src, group=group) + # Deserialize objects using their stored sizes. + offset = 0 + if my_rank != src: + for i, obj_size in enumerate(object_sizes_tensor): + obj_view = object_tensor[offset : offset + obj_size] + obj_view = obj_view.type(torch.uint8) + if obj_view.device != torch.device("cpu"): + obj_view = obj_view.cpu() + offset += obj_size + object_list[i] = _tensor_to_object(obj_view, obj_size) + +if not torch.distributed.is_available(): + # avoid failures on early PyTorch versions for Windows where + # not all functions used in `broadcast_object_list` are available. 
+ def _broadcast_noop(obj, *_, **__): + return obj + + broadcast_object_list = _broadcast_noop +elif _TORCH_GREATER_EQUAL_1_8 and not _HPU_AVAILABLE: + from torch.distributed.distributed_c10d import broadcast_object_list +else: + broadcast_object_list = _broadcast_object_list \ No newline at end of file diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 082d5eef6d7c1..c904b66cebbe1 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -31,11 +31,13 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl +from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.trainer.states import TrainerFn @@ -94,7 +96,7 @@ def __init__( accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, - checkpoint_io=checkpoint_io, + checkpoint_io=checkpoint_io or HPUCheckpointIO(), precision_plugin=precision_plugin, ) log.detail(f"{self.__class__.__name__}: initializing DDP plugin") @@ -367,7 +369,11 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + if self.root_device.type == "hpu": + broadcast_object_list(obj, src, group=_group.WORLD) + else: + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + return obj[0] def pre_backward(self, closure_loss: torch.Tensor) -> None: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index ec654a98ef6cf..4e78fa2b20edf 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -892,7 +892,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if self.has_horovodrun(): self._set_horovod_backend() elif self.num_hpus > 1 and not _use_cpu: - self._distrib_type = _StrategyType.DDP + self.distributed_backend = _StrategyType.DDP elif self.num_gpus == 0 and self.num_nodes > 1: self._strategy_type = _StrategyType.DDP elif self.num_gpus == 0 and self.num_processes > 1: @@ -932,6 +932,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): self._device_type = _AcceleratorType.IPU elif self.has_hpu and not _use_cpu: self._device_type = _AcceleratorType.HPU + self._strategy_type = _StrategyType.DDP elif self.distributed_backend and self._strategy_type is None: self._strategy_type = _StrategyType(self.distributed_backend) From 3e76db948789d41f8c9008f8bb016c8f357bf96d Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:46:32 +0200 Subject: [PATCH 005/167] Enable usage of static_graph with hpu Signed-off-by: Jerome --- pytorch_lightning/strategies/ddp.py | 14 ++++++++++++++ pytorch_lightning/utilities/distributed.py | 4 ++++ 2 files changed, 18 insertions(+) 
diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index c904b66cebbe1..6fcf8144b7974 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -284,6 +284,18 @@ def pre_configure_ddp(self): ) self._ddp_kwargs["find_unused_parameters"] = True + if self.root_device.type == "hpu": + self._static_graph = False + static_graph = self._ddp_kwargs.get("static_graph") + if static_graph == True: + #when _set_static_graph() is called find_unused_parameters does not have any significance. + #Resetting the value of find_unused_parameters to False which is the default value to DDP + self._ddp_kwargs["find_unused_parameters"] = False + self._static_graph = True + if static_graph is not None: + #DDP does not accept static_graph as a parameter, hence removing it from the list + del self._ddp_kwargs["static_graph"] + def _register_ddp_hooks(self) -> None: log.detail(f"{self.__class__.__name__}: registering ddp hooks") # In 1.8, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode @@ -350,6 +362,8 @@ def configure_ddp(self) -> None: log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") self.pre_configure_ddp() self.model = self._setup_model(LightningDistributedModule(self.model)) + if self.root_device.type == "hpu" and self._static_graph == True: + self._model._set_static_graph() self._register_ddp_hooks() def determine_ddp_device_ids(self): diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index a923323f859a3..219adadf60c4d 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -125,6 +125,10 @@ def sync_ddp( else: op = reduce_op + # WA for HPU. 
HPU doesn't support Long types + if _HPU_AVAILABLE: + result = result.float() + # sync all processes before reduction torch.distributed.barrier(group=group) torch.distributed.all_reduce(result, op=op, group=group, async_op=False) From b43d226a6b1312cb51248d6b75dc7d0f0803e21a Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:47:36 +0200 Subject: [PATCH 006/167] Add HPU tests Signed-off-by: Jerome --- pytorch_lightning/lite/lite.py | 1 + .../connectors/accelerator_connector.py | 4 +- tests/accelerators/ops_bf16_mnist.txt | 2 + tests/accelerators/ops_fp32_mnist.txt | 1 + .../test_accelerator_connector.py | 13 +- tests/accelerators/test_common.py | 3 +- tests/accelerators/test_hpu.py | 354 ++++++++++++++++++ tests/conftest.py | 1 + tests/helpers/runif.py | 6 + 9 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 tests/accelerators/ops_bf16_mnist.txt create mode 100644 tests/accelerators/ops_fp32_mnist.txt create mode 100644 tests/accelerators/test_hpu.py diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index fb7cd80e61909..268602d6b85b3 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -86,6 +86,7 @@ def __init__( devices=devices, tpu_cores=tpu_cores, ipus=None, + hpus=None, accelerator=accelerator, strategy=strategy, gpus=gpus, diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4e78fa2b20edf..a9b4a911bbbb8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -612,8 +612,6 @@ def num_ipus(self) -> int: def num_hpus(self) -> int: if isinstance(self.hpus, int): return self.hpus - if isinstance(self._strategy, HPUStrategy): - return self._strategy.replication_factor return 0 @property @@ -688,7 +686,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: return TPUBf16PrecisionPlugin() if self.use_hpu: - if self.precision not in (16, 32): + if self.precision not in (16, "bf16", 32): raise MisconfigurationException( f"`Trainer(accelerator='hpu', precision={self.precision!r})` is not supported." 
) diff --git a/tests/accelerators/ops_bf16_mnist.txt b/tests/accelerators/ops_bf16_mnist.txt new file mode 100644 index 0000000000000..21dfc7eb22855 --- /dev/null +++ b/tests/accelerators/ops_bf16_mnist.txt @@ -0,0 +1,2 @@ +linear +relu \ No newline at end of file diff --git a/tests/accelerators/ops_fp32_mnist.txt b/tests/accelerators/ops_fp32_mnist.txt new file mode 100644 index 0000000000000..11322c514abd9 --- /dev/null +++ b/tests/accelerators/ops_fp32_mnist.txt @@ -0,0 +1 @@ +cross_entropy \ No newline at end of file diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 3e2ec15216841..6de79de66e89c 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -903,11 +903,22 @@ def test_unsupported_ipu_choice(monkeypatch): with pytest.raises(MisconfigurationException, match=r"accelerator='ipu', precision=64\)` is not supported"): Trainer(accelerator="ipu", precision=64) +def test_unsupported_hpu_choice(monkeypatch): + import pytorch_lightning.plugins.training_type.hpu as hpu + import pytorch_lightning.utilities.imports as imports + from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector + + monkeypatch.setattr(imports, "_HPU_AVAILABLE", True) + monkeypatch.setattr(hpu, "_HPU_AVAILABLE", True) + monkeypatch.setattr(AcceleratorConnector, "has_hpu", True) + with pytest.raises(MisconfigurationException, match=r"accelerator='hpu', precision=64\)` is not supported"): + Trainer(accelerator="hpu", precision=64) @mock.patch("torch.cuda.is_available", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) -def test_devices_auto_choice_cpu(is_ipu_available_mock, is_tpu_available_mock, is_gpu_available_mock): +@mock.patch("pytorch_lightning.utilities.imports._HPU_AVAILABLE", return_value=False) +def test_devices_auto_choice_cpu(is_ipu_available_mock, is_tpu_available_mock, is_gpu_available_mock, is_hpu_available_mock): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.devices == 1 assert trainer.num_processes == 1 diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index 18bb04bd0ae17..f19812924ec3b 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -18,7 +18,7 @@ import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator +from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator, HPUAccelerator from pytorch_lightning.utilities.seed import seed_everything from tests.accelerators.test_dp import CustomClassificationModelDP from tests.helpers.boring_model import BoringModel @@ -80,3 +80,4 @@ def test_auto_device_count(device_count_mock): assert GPUAccelerator.auto_device_count() == 2 assert TPUAccelerator.auto_device_count() == 8 assert IPUAccelerator.auto_device_count() == 4 + assert HPUAccelerator.auto_device_count() == 8 diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py new file mode 100644 index 0000000000000..bb10fb611500a --- /dev/null +++ b/tests/accelerators/test_hpu.py @@ -0,0 +1,354 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + +from pytorch_lightning import Callback, seed_everything, Trainer +from pytorch_lightning.accelerators import CPUAccelerator, HPUAccelerator +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins import HPUPlugin, HPUPrecisionPlugin +from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from pytorch_lightning.trainer.supporters import CombinedLoader +from pytorch_lightning.utilities import _HPU_AVAILABLE, DeviceType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers.boring_model import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.runif import RunIf +from tests.helpers.simple_models import ClassificationModel +from pytorch_lightning.callbacks import HPUStatsMonitor + +if _HPU_AVAILABLE: + import habana_frameworks.torch.core as htcore + os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" + +class HPUModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def validation_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def test_step(self, batch, batch_idx): + output = self(batch) + loss = self.loss(batch, output) + return loss + + def training_epoch_end(self, outputs) -> None: + pass + + def validation_epoch_end(self, outputs) -> None: + pass + + def test_epoch_end(self, outputs) -> None: + pass + + +class HPUClassificationModel(ClassificationModel): + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc + + def validation_epoch_end(self, outputs) -> None: + self.log("val_acc", torch.stack(outputs).mean()) + + def test_epoch_end(self, outputs) -> None: + self.log("test_acc", torch.stack(outputs).mean()) + + +@pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") +def test_fail_if_no_hpus(tmpdir): + with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): + Trainer(default_root_dir=tmpdir, hpus=1) + + with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): + Trainer(default_root_dir=tmpdir, hpus=1, accelerator="hpu") + + +@RunIf(hpu=True) +def test_accelerator_selected(tmpdir): + trainer = Trainer(default_root_dir=tmpdir, hpus=1) + assert isinstance(trainer.accelerator, HPUAccelerator) + trainer = 
Trainer(default_root_dir=tmpdir, hpus=1, accelerator="hpu") + assert isinstance(trainer.accelerator, HPUAccelerator) + + +@RunIf(hpu=True) +def test_warning_if_hpus_not_used(tmpdir): + with pytest.warns(UserWarning, match="HPU available but not used. Set the `hpus` flag in your trainer"): + Trainer(default_root_dir=tmpdir) + + +@RunIf(hpu=True) +def test_no_warning_plugin(tmpdir): + with pytest.warns(None) as record: + Trainer(default_root_dir=tmpdir, strategy=HPUPlugin(device=torch.device("hpu"))) + assert len(record) == 0 + + +@RunIf(hpu=True) +@pytest.mark.parametrize("hpus", [1]) +def test_all_stages(tmpdir, hpus): + model = HPUModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=hpus) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + trainer.predict(model) + +@RunIf(hpu=True) +@pytest.mark.parametrize("hpus", [1]) +def test_inference_only(tmpdir, hpus): + model = HPUModel() + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=hpus) + trainer.validate(model) + trainer.test(model) + trainer.predict(model) + +@RunIf(hpu=True) +def test_optimization(tmpdir): + seed_everything(42) + + dm = ClassifDataModule(length=1024) + model = HPUClassificationModel() + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, hpus=1) + + # fit model + trainer.fit(model, dm) + assert trainer.state.finished, f"Training failed with {trainer.state}" + assert dm.trainer is not None + + # validate + result = trainer.validate(datamodule=dm) + assert dm.trainer is not None + assert result[0]["val_acc"] > 0.7 + + # test + result = trainer.test(model, datamodule=dm) + assert dm.trainer is not None + test_result = result[0]["test_acc"] + assert test_result > 0.6 + + # test saved model + model_path = os.path.join(tmpdir, "model.pt") + trainer.save_checkpoint(model_path) + + model = HPUClassificationModel.load_from_checkpoint(model_path) + + trainer = Trainer(default_root_dir=tmpdir, hpus=1) + + result = trainer.test(model, datamodule=dm) + saved_result = result[0]["test_acc"] + assert saved_result == test_result + + +@RunIf(hpu=True) +def test_mixed_precision(tmpdir): + class TestCallback(Callback): + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + assert trainer.accelerator.model.precision == "bf16" + raise SystemExit + + model = HPUModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, precision="bf16", callbacks=TestCallback()) + assert isinstance(trainer.accelerator.precision_plugin, HPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == "bf16" + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(hpu=True) +def test_pure_half_precision(tmpdir): + class TestCallback(Callback): + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + assert trainer.accelerator.model.precision == 16 + for param in trainer.accelerator.model.parameters(): + assert param.dtype == torch.float16 + raise SystemExit + + model = HPUModel() + model = model.half() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, precision=16, callbacks=TestCallback()) + + assert isinstance(trainer.accelerator.training_type_plugin, HPUPlugin) + assert isinstance(trainer.accelerator.precision_plugin, HPUPrecisionPlugin) + assert trainer.accelerator.precision_plugin.precision == 16 + + with pytest.raises(SystemExit): + trainer.fit(model) + +@RunIf(hpu=True) +def test_stages_correct(tmpdir): + """Ensure all stages correctly are 
traced correctly by asserting the output for each stage.""" + + class StageModel(HPUModel): + def training_step(self, batch, batch_idx): + loss = super().training_step(batch, batch_idx) + # tracing requires a loss value that depends on the model. + # force it to be a value but ensure we use the loss. + return (loss - loss) + torch.tensor(1) + + def validation_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(2) + + def test_step(self, batch, batch_idx): + loss = super().validation_step(batch, batch_idx) + return (loss - loss) + torch.tensor(3) + + def predict_step(self, batch, batch_idx, dataloader_idx=None): + output = super().predict_step(batch, batch_idx) + return (output - output) + torch.tensor(4) + + class TestCallback(Callback): + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs["loss"].item() == 1 + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 2 + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert outputs.item() == 3 + + def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + assert torch.all(outputs == 4).item() + + model = StageModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, callbacks=TestCallback()) + trainer.fit(model) + trainer.test(model) + trainer.validate(model) + trainer.predict(model, model.test_dataloader()) + + +@RunIf(hpu=True) +def test_precision_plugin(tmpdir): + """Ensure precision plugin value is set correctly.""" + hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] + hmp_params = dict.fromkeys(hmp_keys) + hmp_params["level"] = "O1" + hmp_params["verbose"] = False + hmp_params["bf16_ops"] = "./ops_bf16_mnist.txt" + hmp_params["fp32_ops"] = "./ops_fp32_mnist.txt" + + plugin = HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) + assert plugin.precision == "bf16" + + +@RunIf(hpu=True) +def test_accelerator_hpu(): + + trainer = Trainer(accelerator="hpu", hpus=1) + + assert trainer._device_type == "hpu" + assert isinstance(trainer.accelerator, HPUAccelerator) + + with pytest.raises( + MisconfigurationException, match="You passed `accelerator='hpu'`, but you didn't pass `hpus` to `Trainer`" + ): + trainer = Trainer(accelerator="hpu") + + trainer = Trainer(accelerator="auto", hpus=8) + + assert trainer._device_type == "hpu" + assert isinstance(trainer.accelerator, HPUAccelerator) + + +@RunIf(hpu=True) +def test_accelerator_cpu_with_hpus_flag(): + + trainer = Trainer(accelerator="cpu", hpus=1) + + assert trainer._device_type == "cpu" + assert isinstance(trainer.accelerator, CPUAccelerator) + + +@RunIf(hpu=True) +def test_accelerator_hpu_with_devices(): + """HPU does not support isinstance(trainer.training_type_plugin, HPUPlugin) yet.""" + + trainer = Trainer(accelerator="hpu", devices=8) + + assert trainer.hpus == 8 + assert isinstance(trainer.accelerator, HPUAccelerator) + + +@RunIf(hpu=True) +def test_accelerator_auto_with_devices_hpu(): + + trainer = Trainer(accelerator="auto", devices=8) + + assert trainer._device_type == "hpu" + assert trainer.hpus == 8 + + +@RunIf(hpu=True) +def test_accelerator_hpu_with_hpus_priority(): + """Test for checking `hpus` flag takes priority over `devices`.""" + + hpus = 8 + with pytest.warns(UserWarning, match="The flag `devices=1` will be ignored,"): + trainer 
= Trainer(accelerator="hpu", devices=1, hpus=hpus) + + assert trainer.hpus == hpus + + +@RunIf(hpu=True) +def test_set_devices_if_none_hpu(): + + trainer = Trainer(accelerator="hpu", hpus=8) + assert trainer.devices == 8 + + +@RunIf(hpu=True) +def test_device_type(tmpdir): + """HPU does not support (trainer.training_type_plugin, HPUPlugin) yet.""" + + trainer = Trainer(hpus=8) + assert trainer._device_type == DeviceType.HPU + assert isinstance(trainer.accelerator, HPUAccelerator) + + +@RunIf(hpu=True) +def test_devices_auto_choice_hpu(): + trainer = Trainer(accelerator="auto", devices="auto") + assert trainer.devices == 8 + assert trainer.hpus == 8 diff --git a/tests/conftest.py b/tests/conftest.py index 8ad7faa3cd769..791ea4bbe29de 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,6 +77,7 @@ def restore_env_variables(): "XRT_HOST_WORLD_SIZE", "XRT_SHARD_ORDINAL", "XRT_SHARD_LOCAL_ORDINAL", + "ID", # set by HPUStrategy, } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 3efc0f18873c7..85bd980da737d 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -27,6 +27,7 @@ _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _HOROVOD_AVAILABLE, + _HPU_AVAILABLE, _IPU_AVAILABLE, _OMEGACONF_AVAILABLE, _RICH_AVAILABLE, @@ -63,6 +64,7 @@ def __new__( amp_apex: bool = False, tpu: bool = False, ipu: bool = False, + hpu: bool = False, horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, @@ -147,6 +149,10 @@ def __new__( conditions.append(not _IPU_AVAILABLE) reasons.append("IPU") + if hpu: + conditions.append(not _HPU_AVAILABLE) + reasons.append("HPU") + if horovod: conditions.append(not _HOROVOD_AVAILABLE) reasons.append("Horovod") From 992093dd5a29ab641277598a10d207b1b00718a5 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:48:20 +0200 Subject: [PATCH 007/167] Add basic hpu_stats monitor Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 5 +- pytorch_lightning/callbacks/__init__.py | 2 + .../callbacks/hpu_stats_monitor.py | 81 +++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 pytorch_lightning/callbacks/hpu_stats_monitor.py diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 23ffde1c6ebe8..d9c4cb63dc557 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -10,6 +10,7 @@ import sys import habana_frameworks.torch.core as htcore +from pytorch_lightning.callbacks import HPUStatsMonitor class MNISTModel(pl.LightningModule): @@ -43,8 +44,10 @@ def configure_optimizers(self): hmp_params["bf16_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" hmp_params["fp32_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" +hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") + # Initialize a trainer -trainer = pl.Trainer(devices=1, max_epochs=1, precision=32, hmp_params=hmp_params, default_root_dir='/tmp/', accelerator="hpu") +trainer = pl.Trainer(devices=1, callbacks=[hpu_stats], max_epochs=1, precision=32, hmp_params=hmp_params, default_root_dir='/tmp/', accelerator="hpu") # Train the model ⚡ trainer.fit(mnist_model, train_loader) diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index f47bc115ece51..a614288c0ac00 100644 --- 
a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -16,6 +16,7 @@ from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor +from pytorch_lightning.callbacks.hpu_stats_monitor import HPUStatsMonitor from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler from pytorch_lightning.callbacks.lambda_function import LambdaCallback from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor @@ -37,6 +38,7 @@ "DeviceStatsMonitor", "EarlyStopping", "GPUStatsMonitor", + "HPUStatsMonitor", "XLAStatsMonitor", "GradientAccumulationScheduler", "LambdaCallback", diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py new file mode 100644 index 0000000000000..b483bf848fa68 --- /dev/null +++ b/pytorch_lightning/callbacks/hpu_stats_monitor.py @@ -0,0 +1,81 @@ +# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# + +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +hpu Stats Monitor +================= + +Monitor and logs hpu stats during training. + +""" +from typing import Any, Dict, List, Optional, Tuple +import torch +import pytorch_lightning as pl +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.utilities import rank_zero_only + +class HPUStatsMonitor(Callback): + """ + Automatically monitors and logs hpu stats during training stage. + + Args: + save_dir: directory to save the logs. + exp_name: name of the experiment. + + Example:: + + >>> from pytorch_lightning import Trainer + >>> from pytorch_lightning.callbacks import HPUStatsMonitor + >>> hpu_stats = HPUStatsMonitor() + >>> trainer = Trainer(hpus=1, callbacks=[hpu_stats]) + + you can also optionally provide save_dir and exp_name in HPUStatsMonitor. + No need to provide logger in Trainer. 
+ + """ + def __init__( + self, + log_save_dir:str = "habana_ptl_logs", + exp_name:str = "default" + ): + super().__init__() + self.log_save_dir = log_save_dir + self.exp_name = exp_name + + def on_init_end(self, trainer: "pl.Trainer") -> None: + from pytorch_lightning import loggers as pl_logger + self.tb_logger = pl_logger.TensorBoardLogger(save_dir=self.log_save_dir, name=self.exp_name) + trainer.logger = self.tb_logger + + def on_before_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", loss: torch.Tensor) -> None: + pl_module.log("Model_Loss", loss, on_step=True, on_epoch=True, enable_graph=False, logger=True) + + def on_train_epoch_end( + self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", unused: Optional = None + ) -> None: + tensor_board = trainer.logger.experiment + dict = vars(pl_module) + modules = dict['_modules'] + for module_name in modules: + tensor_board.add_histogram( module_name + ".weight", modules[module_name].weight, pl_module.current_epoch) + tensor_board.add_histogram(module_name + ".bias", modules[module_name].bias, pl_module.current_epoch) From 943be49df682871262541ab7d480d7b08ca83f9c Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 8 Feb 2022 11:49:17 +0200 Subject: [PATCH 008/167] Code cleanup Signed-off-by: Jerome --- pytorch_lightning/accelerators/hpu.py | 14 +---------- pytorch_lightning/core/lightning.py | 2 -- .../overrides/torch_distributed.py | 1 - pytorch_lightning/plugins/io/hpu_io_plugin.py | 2 ++ .../plugins/precision/hpu_precision.py | 9 ++----- pytorch_lightning/strategies/ddp.py | 7 ------ pytorch_lightning/strategies/hpu.py | 25 +++---------------- pytorch_lightning/utilities/distributed.py | 3 --- 8 files changed, 9 insertions(+), 54 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 5307f99d496fc..46a843cd67850 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -11,24 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging -import os -from typing import Any +from typing import Any import torch - -import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.plugins import DataParallelPlugin -from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin -from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException from typing import Any, Dict, Union -_log = logging.getLogger(__name__) - - class HPUAccelerator(Accelerator): """ Accelerator for HPU devices. """ diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index b8592fb974665..f2614b14fa4ce 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1545,7 +1545,6 @@ def optimizer_step( optimizer_idx: int = 0, optimizer_closure: Optional[Callable[[], Any]] = None, on_tpu: bool = False, - on_hpu: bool = None, using_native_amp: bool = False, using_lbfgs: bool = False, ) -> None: @@ -1564,7 +1563,6 @@ def optimizer_step( optimizer_closure: Closure for all optimizers. This closure must be executed as it includes the calls to ``training_step()``, ``optimizer.zero_grad()``, and ``backward()``. 
on_tpu: ``True`` if TPU backward is required - on_hpu: ``True`` if HPU backward is required using_native_amp: ``True`` if using native amp using_lbfgs: True if the matching optimizer is :class:`torch.optim.LBFGS` diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index e7f094f341701..834271cfddd08 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -11,7 +11,6 @@ ProcessGroup, ) -_HCCL_AVAILABLE = True _pickler = pickle.Pickler _unpickler = pickle.Unpickler diff --git a/pytorch_lightning/plugins/io/hpu_io_plugin.py b/pytorch_lightning/plugins/io/hpu_io_plugin.py index 39a1fa376c190..44012123f2430 100644 --- a/pytorch_lightning/plugins/io/hpu_io_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_io_plugin.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import torch from typing import Any, Callable, Dict, Optional @@ -27,6 +28,7 @@ class HPUCheckpointIO(TorchCheckpointIO): + """CheckpointIO to save checkpoints for HPU training strategies.""" def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None: fs = get_filesystem(path) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index e7130beff9740..cb6075e1ac2ea 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os +from typing import Any, Optional, Sequence from typing import Any, List, Tuple import torch.nn as nn @@ -34,7 +34,7 @@ class HPUPrecisionPlugin(PrecisionPlugin): """Plugin that enables bfloats/floats on HPUs""" - def __init__(self, precision: int, hmp_params :[]) -> None: + def __init__(self, precision: int, hmp_params :Optional[Sequence[Any]] = None) -> None: super().__init__() self.precision = precision if hmp_params is not None: @@ -44,8 +44,3 @@ def __init__(self, precision: int, hmp_params :[]) -> None: hmp_verbose = hmp_params["verbose"] hmp.convert(opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose) - - def connect( - self, model: nn.Module, optimizers: List[Optimizer], lr_schedulers: List[Any] - ) -> Tuple[nn.Module, List[Optimizer], List[Any]]: - return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers) diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 6fcf8144b7974..5ff9842edabaa 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -516,13 +516,6 @@ def reconciliate_processes(self, trace: str) -> None: shutil.rmtree(sync_dir) raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") - def on_save(self, checkpoint: Dict[str, Union[Any, torch.Tensor]]) -> Dict[str, Union[Any, torch.Tensor]]: - if self.root_device.type == "hpu" and self.cluster_environment.global_rank() == 0: - from pytorch_lightning.utilities.apply_func import move_data_to_device - return move_data_to_device(checkpoint, torch.device("cpu")) - else: - return checkpoint - def teardown(self) -> None: log.detail(f"{self.__class__.__name__}: tearing down DDP plugin") super().teardown() diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 2217b8508418b..7191591e3fc50 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -1,13 +1,3 @@ -# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# - # Copyright The PyTorch Lightning team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import torch import os from typing import Any, Dict, Optional @@ -29,29 +20,23 @@ from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities import _HPU_AVAILABLE, find_shared_parameters, set_shared_parameters +from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import _PATH class HPUStrategy(SingleDeviceStrategy): + """Strategy for training on HPU devices.""" def __init__( self, device: int, checkpoint_io: Optional[HPUCheckpointIO] = None, - debug: bool = False, ): device = torch.device("hpu") checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__(device, checkpoint_io=checkpoint_io) - self.debug = debug - - @property - def is_distributed(self) -> bool: - return False def setup(self, trainer: "pl.Trainer") -> None: self.model_to_device() @@ -61,7 +46,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: super().setup_optimizers(trainer) if len(self.optimizers) > 1: - raise MisconfigurationException("IPUs currently only support one optimizer.") + raise MisconfigurationException("HPUs currently only support one optimizer.") def model_to_device(self) -> None: self.model.to(self.root_device) @@ -74,5 +59,3 @@ def pre_dispatch(self) -> None: if isinstance(self.device, int): self.device = torch.device(self.device) - def on_save(self, checkpoint: dict) -> dict: - return move_data_to_device(checkpoint, torch.device("cpu")) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 219adadf60c4d..8c726f8a1c569 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -150,9 +150,6 @@ def forward( gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] - if _HPU_AVAILABLE: - # HPU distributed backend doesn't support int64 tensors - tensor = tensor.int() torch.distributed.all_gather(gathered_tensor, tensor, group=group) gathered_tensor = torch.stack(gathered_tensor, dim=0) From 3015972e420e12ebabe978c2090813575c42ae51 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Feb 2022 11:21:52 +0000 Subject: [PATCH 009/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hpu_examples/simple_mnist/mnist.py | 23 +++++++++---- .../simple_mnist/ops_bf16_mnist.txt | 2 +- .../simple_mnist/ops_fp32_mnist.txt | 2 +- pytorch_lightning/accelerators/__init__.py | 2 +- pytorch_lightning/accelerators/hpu.py | 8 ++--- pytorch_lightning/callbacks/__init__.py | 2 +- .../callbacks/hpu_stats_monitor.py | 21 ++++++------ pytorch_lightning/core/lightning.py | 4 +-- .../overrides/torch_distributed.py | 32 ++++++++----------- pytorch_lightning/plugins/__init__.py | 6 ++-- pytorch_lightning/plugins/io/__init__.py | 2 +- pytorch_lightning/plugins/io/hpu_io_plugin.py | 14 ++++---- .../plugins/precision/hpu_precision.py | 14 ++++---- .../plugins/training_type/__init__.py | 2 +- pytorch_lightning/strategies/__init__.py | 2 +- pytorch_lightning/strategies/ddp.py | 10 +++--- pytorch_lightning/strategies/hpu.py | 8 ++--- .../connectors/accelerator_connector.py | 15 ++++++--- 
pytorch_lightning/trainer/trainer.py | 10 +++--- pytorch_lightning/utilities/argparse.py | 2 +- pytorch_lightning/utilities/distributed.py | 14 +++++--- pytorch_lightning/utilities/enums.py | 2 +- pytorch_lightning/utilities/imports.py | 1 + tests/accelerators/ops_bf16_mnist.txt | 2 +- tests/accelerators/ops_fp32_mnist.txt | 2 +- .../test_accelerator_connector.py | 6 +++- tests/accelerators/test_common.py | 8 ++++- tests/accelerators/test_hpu.py | 7 +++- tests/conftest.py | 2 +- 29 files changed, 128 insertions(+), 97 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index d9c4cb63dc557..d9d12fd93bc2d 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -1,21 +1,21 @@ import os +import sys +import habana_frameworks.torch.core as htcore import torch from torch import nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from torchvision.datasets import MNIST from torchvision import transforms -import pytorch_lightning as pl -import sys +from torchvision.datasets import MNIST -import habana_frameworks.torch.core as htcore +import pytorch_lightning as pl from pytorch_lightning.callbacks import HPUStatsMonitor -class MNISTModel(pl.LightningModule): +class MNISTModel(pl.LightningModule): def __init__(self): - super(MNISTModel, self).__init__() + super().__init__() self.l1 = torch.nn.Linear(28 * 28, 10) def forward(self, x): @@ -29,6 +29,7 @@ def training_step(self, batch, batch_nb): def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.02) + # Init our model mnist_model = MNISTModel() @@ -47,7 +48,15 @@ def configure_optimizers(self): hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") # Initialize a trainer -trainer = pl.Trainer(devices=1, callbacks=[hpu_stats], max_epochs=1, precision=32, hmp_params=hmp_params, default_root_dir='/tmp/', accelerator="hpu") +trainer = pl.Trainer( + devices=1, + callbacks=[hpu_stats], + max_epochs=1, + precision=32, + hmp_params=hmp_params, + default_root_dir="/tmp/", + accelerator="hpu", +) # Train the model ⚡ trainer.fit(mnist_model, train_loader) diff --git a/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt b/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt index 21dfc7eb22855..53ec99c15b4ce 100644 --- a/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt +++ b/pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt @@ -1,2 +1,2 @@ linear -relu \ No newline at end of file +relu diff --git a/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt b/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt index 11322c514abd9..4509b7e58ac29 100644 --- a/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt +++ b/pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt @@ -1 +1 @@ -cross_entropy \ No newline at end of file +cross_entropy diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index e6fb6e3c84a9f..27e580fa5b496 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -13,6 +13,6 @@ from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401 from 
pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401 diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 46a843cd67850..29ee822c5aeb1 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any +from typing import Any, Dict, Union + import torch + from pytorch_lightning.accelerators.accelerator import Accelerator -from typing import Any, Dict, Union class HPUAccelerator(Accelerator): - """ Accelerator for HPU devices. """ + """Accelerator for HPU devices.""" def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """HPU device stats aren't supported yet.""" @@ -30,4 +31,3 @@ def auto_device_count() -> int: """Get the devices when set to auto.""" # TBD: make this configurable return 8 - \ No newline at end of file diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index a614288c0ac00..6cc4e765b70b6 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -16,8 +16,8 @@ from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor -from pytorch_lightning.callbacks.hpu_stats_monitor import HPUStatsMonitor from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler +from pytorch_lightning.callbacks.hpu_stats_monitor import HPUStatsMonitor from pytorch_lightning.callbacks.lambda_function import LambdaCallback from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py index b483bf848fa68..e1b2ed091ba6a 100644 --- a/pytorch_lightning/callbacks/hpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/hpu_stats_monitor.py @@ -29,14 +29,16 @@ """ from typing import Any, Dict, List, Optional, Tuple + import torch + import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities import rank_zero_only + class HPUStatsMonitor(Callback): - """ - Automatically monitors and logs hpu stats during training stage. + """Automatically monitors and logs hpu stats during training stage. Args: save_dir: directory to save the logs. @@ -51,19 +53,16 @@ class HPUStatsMonitor(Callback): you can also optionally provide save_dir and exp_name in HPUStatsMonitor. No need to provide logger in Trainer. 
- """ - def __init__( - self, - log_save_dir:str = "habana_ptl_logs", - exp_name:str = "default" - ): + + def __init__(self, log_save_dir: str = "habana_ptl_logs", exp_name: str = "default"): super().__init__() self.log_save_dir = log_save_dir self.exp_name = exp_name def on_init_end(self, trainer: "pl.Trainer") -> None: from pytorch_lightning import loggers as pl_logger + self.tb_logger = pl_logger.TensorBoardLogger(save_dir=self.log_save_dir, name=self.exp_name) trainer.logger = self.tb_logger @@ -75,7 +74,7 @@ def on_train_epoch_end( ) -> None: tensor_board = trainer.logger.experiment dict = vars(pl_module) - modules = dict['_modules'] + modules = dict["_modules"] for module_name in modules: - tensor_board.add_histogram( module_name + ".weight", modules[module_name].weight, pl_module.current_epoch) - tensor_board.add_histogram(module_name + ".bias", modules[module_name].bias, pl_module.current_epoch) + tensor_board.add_histogram(module_name + ".weight", modules[module_name].weight, pl_module.current_epoch) + tensor_board.add_histogram(module_name + ".bias", modules[module_name].bias, pl_module.current_epoch) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index f2614b14fa4ce..c1dc5ba2137e3 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -225,8 +225,8 @@ def on_gpu(self): @property def on_hpu(self): - """ - True if your model is currently running on HPUs. + """True if your model is currently running on HPUs. + Useful to set flags around the LightningModule for different CPU vs GPU vs HPU behavior. """ return self.device.type == "hpu" diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 834271cfddd08..2ea88bcdf9d75 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -1,16 +1,13 @@ -import os import io import logging import os import pickle import torch -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 -from pytorch_lightning.utilities import _HPU_AVAILABLE -from torch._C._distributed_c10d import ( - ProcessGroup, -) +from torch._C._distributed_c10d import ProcessGroup +from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 _pickler = pickle.Pickler _unpickler = pickle.Unpickler @@ -25,14 +22,13 @@ # https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L256 def _rank_not_in_group(group: ProcessGroup): - """ - Helper that checks if the current process's rank is not in a given group. 
- """ + """Helper that checks if the current process's rank is not in a given group.""" if group is None: return False return group == GroupMember.NON_GROUP_MEMBER -#Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1518 + +# Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1518 def _object_to_tensor(obj): f = io.BytesIO() _pickler(f).dump(obj) @@ -44,18 +40,17 @@ def _object_to_tensor(obj): local_size = torch.LongTensor([byte_tensor.numel()]) return byte_tensor, local_size -#Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1530 + +# Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1530 def _tensor_to_object(tensor, tensor_size): buf = tensor.numpy().tobytes()[:tensor_size] return _unpickler(io.BytesIO(buf)).load() -#Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1729 +# Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1729 def _broadcast_object_list(object_list, src=0, group=None, device=None): - """ - Broadcasts picklable objects in ``object_list`` to the whole group. Similar - to :func:`broadcast`, but Python objects can be passed in. - Note that all objects in ``object_list`` must be picklable in order to be + """Broadcasts picklable objects in ``object_list`` to the whole group. Similar to :func:`broadcast`, but Python + objects can be passed in. Note that all objects in ``object_list`` must be picklable in order to be broadcasted. Args: @@ -142,7 +137,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): elif is_hpu_backend: current_device = torch.device("hpu") - #Workaround: HPU doesn't not support long tensors for collectives + # Workaround: HPU doesn't not support long tensors for collectives object_sizes_tensor = object_sizes_tensor.int() object_sizes_tensor = object_sizes_tensor.to(current_device) @@ -175,6 +170,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): offset += obj_size object_list[i] = _tensor_to_object(obj_view, obj_size) + if not torch.distributed.is_available(): # avoid failures on early PyTorch versions for Windows where # not all functions used in `broadcast_object_list` are available. 
@@ -185,4 +181,4 @@ def _broadcast_noop(obj, *_, **__): elif _TORCH_GREATER_EQUAL_1_8 and not _HPU_AVAILABLE: from torch.distributed.distributed_c10d import broadcast_object_list else: - broadcast_object_list = _broadcast_object_list \ No newline at end of file + broadcast_object_list = _broadcast_object_list diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 33b670d46a44c..ab8dff865ba56 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -2,15 +2,15 @@ from pytorch_lightning.plugins.environments import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin -from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin +from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin @@ -24,11 +24,11 @@ from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.plugins.training_type.ipu import IPUPlugin -from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin +from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin diff --git a/pytorch_lightning/plugins/io/__init__.py b/pytorch_lightning/plugins/io/__init__.py index b12f04d86515c..0671d26a175e1 100644 --- a/pytorch_lightning/plugins/io/__init__.py +++ b/pytorch_lightning/plugins/io/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO # noqa: F401 +from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO # noqa: F401 from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO # noqa: F401 from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO # noqa: F401 -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO # noqa: F401 \ No newline at end of file diff --git a/pytorch_lightning/plugins/io/hpu_io_plugin.py b/pytorch_lightning/plugins/io/hpu_io_plugin.py index 44012123f2430..225b67bd17359 100644 --- a/pytorch_lightning/plugins/io/hpu_io_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_io_plugin.py @@ -13,18 +13,17 @@ # limitations under the License. import os -import torch from typing import Any, Callable, Dict, Optional +import torch + import pytorch_lightning as pl -from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities import _HPU_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.types import _PATH from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem +from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities.types import _PATH class HPUCheckpointIO(TorchCheckpointIO): @@ -36,7 +35,8 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio if _HPU_AVAILABLE: from pytorch_lightning.utilities.apply_func import move_data_to_device - checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) + + checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) # write the checkpoint dictionary on the file atomic_save(checkpoint, path) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index cb6075e1ac2ea..91413aac1352d 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -21,20 +21,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional, Sequence -from typing import Any, List, Tuple +from typing import Any, List, Optional, Sequence, Tuple import torch.nn as nn +from habana_frameworks.torch.hpex import hmp from torch.optim import Optimizer from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from habana_frameworks.torch.hpex import hmp class HPUPrecisionPlugin(PrecisionPlugin): - """Plugin that enables bfloats/floats on HPUs""" + """Plugin that enables bfloats/floats on HPUs.""" - def __init__(self, precision: int, hmp_params :Optional[Sequence[Any]] = None) -> None: + def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: super().__init__() self.precision = precision if hmp_params is not None: @@ -42,5 +41,6 @@ def __init__(self, precision: int, hmp_params :Optional[Sequence[Any]] = None) - hmp_bf16 = hmp_params["bf16_ops"] hmp_fp32 = hmp_params["fp32_ops"] hmp_verbose = hmp_params["verbose"] - hmp.convert(opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, - fp32_file_path=hmp_fp32, isVerbose=hmp_verbose) + hmp.convert( + opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose + ) diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 760312df346c5..0f8059f5f04ad 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -10,7 +10,7 @@ from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 diff --git a/pytorch_lightning/strategies/__init__.py b/pytorch_lightning/strategies/__init__.py index 90f7b8ce585e7..2ed3e841ed849 100644 --- a/pytorch_lightning/strategies/__init__.py +++ b/pytorch_lightning/strategies/__init__.py @@ -8,8 +8,8 @@ from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 -from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.hpu import HPUStrategy # noqa: F401 +from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 from pytorch_lightning.strategies.sharded import DDPShardedStrategy # noqa: F401 from pytorch_lightning.strategies.sharded_spawn import DDPSpawnShardedStrategy # noqa: F401 diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 5ff9842edabaa..331d9c9f48cb8 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -31,10 +31,10 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from 
pytorch_lightning.overrides.torch_distributed import broadcast_object_list
 from pytorch_lightning.core.optimizer import LightningOptimizer
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.distributed import prepare_for_backward
+from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO
@@ -286,14 +286,14 @@ def pre_configure_ddp(self):
         if self.root_device.type == "hpu":
             self._static_graph = False
-        static_graph = self._ddp_kwargs.get("static_graph")
+        static_graph = self._ddp_kwargs.get("static_graph")
         if static_graph == True:
-            #when _set_static_graph() is called find_unused_parameters does not have any significance.
-            #Resetting the value of find_unused_parameters to False which is the default value to DDP
+            # When _set_static_graph() is called, find_unused_parameters has no effect.
+            # Reset find_unused_parameters to False, which is the default value for DDP.
             self._ddp_kwargs["find_unused_parameters"] = False
             self._static_graph = True
         if static_graph is not None:
-            #DDP does not accept static_graph as a parameter, hence removing it from the list
+            # DDP does not accept static_graph as a parameter, so remove it from the kwargs.
             del self._ddp_kwargs["static_graph"]

     def _register_ddp_hooks(self) -> None:
diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py
index 7191591e3fc50..3fb6265988477 100644
--- a/pytorch_lightning/strategies/hpu.py
+++ b/pytorch_lightning/strategies/hpu.py
@@ -12,18 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import torch import os from typing import Any, Dict, Optional +import torch + import pytorch_lightning as pl from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.strategies.single_device import SingleDeviceStrategy -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _PATH + class HPUStrategy(SingleDeviceStrategy): """Strategy for training on HPU devices.""" @@ -37,7 +39,6 @@ def __init__( checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__(device, checkpoint_io=checkpoint_io) - def setup(self, trainer: "pl.Trainer") -> None: self.model_to_device() super().setup(trainer) @@ -58,4 +59,3 @@ def on_hpu(self) -> bool: def pre_dispatch(self) -> None: if isinstance(self.device, int): self.device = torch.device(self.device) - diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index a9b4a911bbbb8..c41783cd04b37 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -22,22 +22,22 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.hpu import HPUAccelerator from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator -from pytorch_lightning.accelerators.hpu import HPUAccelerator from pytorch_lightning.plugins import ( ApexMixedPrecisionPlugin, CheckpointIO, DeepSpeedPrecisionPlugin, DoublePrecisionPlugin, FullyShardedNativeMixedPrecisionPlugin, + HPUPrecisionPlugin, IPUPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, ShardedNativeMixedPrecisionPlugin, TPUBf16PrecisionPlugin, TPUPrecisionPlugin, - HPUPrecisionPlugin, ) from pytorch_lightning.plugins.environments import ( BaguaEnvironment, @@ -72,10 +72,10 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, + _HPU_AVAILABLE, _IPU_AVAILABLE, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, - _HPU_AVAILABLE, ) from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn @@ -889,7 +889,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if self.distributed_backend is None: if self.has_horovodrun(): self._set_horovod_backend() - elif self.num_hpus > 1 and not _use_cpu: + elif self.num_hpus > 1 and not _use_cpu: self.distributed_backend = _StrategyType.DDP elif self.num_gpus == 0 and self.num_nodes > 1: self._strategy_type = _StrategyType.DDP @@ -939,7 +939,12 @@ def set_distributed_mode(self, strategy: Optional[str] = None): _gpu_strategy_types = (_StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, _StrategyType.DDP2) # DP and DDP2 cannot run without GPU - if self.num_gpus == 0 and self._strategy_type in _gpu_strategy_types and not _use_cpu and not (self.num_hpus > 1): + if ( + self.num_gpus == 0 + and self._strategy_type in _gpu_strategy_types + and not _use_cpu + and not (self.num_hpus > 1) + ): if (self.num_nodes and self.num_nodes > 1) or 
(self.num_processes and self.num_processes > 1): if self._strategy_type in (_StrategyType.DP, _StrategyType.DDP2): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 33ef1f237177d..f763cf23b3531 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -30,7 +30,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator, IPUAccelerator, HPUAccelerator +from pytorch_lightning.accelerators import Accelerator, HPUAccelerator, IPUAccelerator from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter from pytorch_lightning.core.datamodule import LightningDataModule @@ -76,10 +76,10 @@ from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import ( _AcceleratorType, + _HPU_AVAILABLE, _IPU_AVAILABLE, _StrategyType, _TPU_AVAILABLE, - _HPU_AVAILABLE, AMPType, device_parser, GradClipAlgorithmType, @@ -187,7 +187,7 @@ def __init__( plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]] = None, amp_backend: str = "native", amp_level: Optional[str] = None, - hmp_params:["level", "verbose", "bf16_ops", "fp32_ops"] = None, + hmp_params: ["level", "verbose", "bf16_ops", "fp32_ops"] = None, move_metrics_to_cpu: bool = False, multiple_trainloader_mode: str = "max_size_cycle", stochastic_weight_avg: bool = False, @@ -1767,7 +1767,8 @@ def _log_device_info(self) -> None: ) if ( - _HPU_AVAILABLE and self._device_type != _AcceleratorType.HPU + _HPU_AVAILABLE + and self._device_type != _AcceleratorType.HPU and not isinstance(self.accelerator, HPUAccelerator) ): rank_zero_warn( @@ -1775,7 +1776,6 @@ def _log_device_info(self) -> None: " `Trainer(hpus=8)` or script `--hpus=8`." 
) - def _on_exception(self) -> None: if not _fault_tolerant_training(): return diff --git a/pytorch_lightning/utilities/argparse.py b/pytorch_lightning/utilities/argparse.py index 6840b2d874987..0d826f56b70f8 100644 --- a/pytorch_lightning/utilities/argparse.py +++ b/pytorch_lightning/utilities/argparse.py @@ -247,7 +247,7 @@ def add_argparse_args( else: use_type = arg_types[0] - if arg == 'gpus' or arg == 'tpu_cores' or arg == 'hpus': + if arg == "gpus" or arg == "tpu_cores" or arg == "hpus": use_type = _gpus_allowed_type # hack for types in (int, float) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 8c726f8a1c569..f7fb9563db42f 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -22,7 +22,12 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TPU_AVAILABLE, _HPU_AVAILABLE +from pytorch_lightning.utilities.imports import ( + _HPU_AVAILABLE, + _TORCH_GREATER_EQUAL_1_8, + _TORCH_GREATER_EQUAL_1_9, + _TPU_AVAILABLE, +) from pytorch_lightning.utilities.rank_zero import rank_zero_debug as new_rank_zero_debug from pytorch_lightning.utilities.rank_zero import rank_zero_only # noqa: F401 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation @@ -337,10 +342,11 @@ def init_dist_connection( os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) - #TBD: move this to a hpu based ddp plugin - #local rank mapping for device open is needed for hpu devices - if torch_distributed_backend == 'hccl' and _HPU_AVAILABLE: + # TBD: move this to a hpu based ddp plugin + # local rank mapping for device open is needed for hpu devices + if torch_distributed_backend == "hccl" and _HPU_AVAILABLE: import habana_frameworks.torch.core.hccl + os.environ["ID"] = str(cluster_environment.local_rank()) log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index 1f276ce17d0a3..f58bdd1b17624 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -281,7 +281,7 @@ class _AcceleratorType(LightningEnum): GPU = "GPU" IPU = "IPU" TPU = "TPU" - HPU = 'HPU' + HPU = "HPU" class _FaultTolerantMode(LightningEnum): diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 9c9703dc6e3c7..07adfdc87dfbc 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -130,6 +130,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IPU_AVAILABLE = False from habana_frameworks.torch.utils.library_loader import is_habana_avaialble + _HPU_AVAILABLE = is_habana_avaialble() # experimental feature within PyTorch Lightning. 
diff --git a/tests/accelerators/ops_bf16_mnist.txt b/tests/accelerators/ops_bf16_mnist.txt index 21dfc7eb22855..53ec99c15b4ce 100644 --- a/tests/accelerators/ops_bf16_mnist.txt +++ b/tests/accelerators/ops_bf16_mnist.txt @@ -1,2 +1,2 @@ linear -relu \ No newline at end of file +relu diff --git a/tests/accelerators/ops_fp32_mnist.txt b/tests/accelerators/ops_fp32_mnist.txt index 11322c514abd9..4509b7e58ac29 100644 --- a/tests/accelerators/ops_fp32_mnist.txt +++ b/tests/accelerators/ops_fp32_mnist.txt @@ -1 +1 @@ -cross_entropy \ No newline at end of file +cross_entropy diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 6de79de66e89c..22bd217b89713 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -903,6 +903,7 @@ def test_unsupported_ipu_choice(monkeypatch): with pytest.raises(MisconfigurationException, match=r"accelerator='ipu', precision=64\)` is not supported"): Trainer(accelerator="ipu", precision=64) + def test_unsupported_hpu_choice(monkeypatch): import pytorch_lightning.plugins.training_type.hpu as hpu import pytorch_lightning.utilities.imports as imports @@ -914,11 +915,14 @@ def test_unsupported_hpu_choice(monkeypatch): with pytest.raises(MisconfigurationException, match=r"accelerator='hpu', precision=64\)` is not supported"): Trainer(accelerator="hpu", precision=64) + @mock.patch("torch.cuda.is_available", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._HPU_AVAILABLE", return_value=False) -def test_devices_auto_choice_cpu(is_ipu_available_mock, is_tpu_available_mock, is_gpu_available_mock, is_hpu_available_mock): +def test_devices_auto_choice_cpu( + is_ipu_available_mock, is_tpu_available_mock, is_gpu_available_mock, is_hpu_available_mock +): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.devices == 1 assert trainer.num_processes == 1 diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index f19812924ec3b..a1f2c12d99db6 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -18,7 +18,13 @@ import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator, HPUAccelerator +from pytorch_lightning.accelerators import ( + CPUAccelerator, + GPUAccelerator, + HPUAccelerator, + IPUAccelerator, + TPUAccelerator, +) from pytorch_lightning.utilities.seed import seed_everything from tests.accelerators.test_dp import CustomClassificationModelDP from tests.helpers.boring_model import BoringModel diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index bb10fb611500a..f951816177aa4 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -20,6 +20,7 @@ from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.accelerators import CPUAccelerator, HPUAccelerator +from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPlugin, HPUPrecisionPlugin from pytorch_lightning.trainer.states import RunningStage, TrainerFn @@ -30,12 +31,13 @@ from tests.helpers.datamodules import ClassifDataModule from 
tests.helpers.runif import RunIf from tests.helpers.simple_models import ClassificationModel -from pytorch_lightning.callbacks import HPUStatsMonitor if _HPU_AVAILABLE: import habana_frameworks.torch.core as htcore + os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" + class HPUModel(BoringModel): def training_step(self, batch, batch_idx): output = self(batch) @@ -132,6 +134,7 @@ def test_all_stages(tmpdir, hpus): trainer.test(model) trainer.predict(model) + @RunIf(hpu=True) @pytest.mark.parametrize("hpus", [1]) def test_inference_only(tmpdir, hpus): @@ -142,6 +145,7 @@ def test_inference_only(tmpdir, hpus): trainer.test(model) trainer.predict(model) + @RunIf(hpu=True) def test_optimization(tmpdir): seed_everything(42) @@ -215,6 +219,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: with pytest.raises(SystemExit): trainer.fit(model) + @RunIf(hpu=True) def test_stages_correct(tmpdir): """Ensure all stages correctly are traced correctly by asserting the output for each stage.""" diff --git a/tests/conftest.py b/tests/conftest.py index 791ea4bbe29de..5ad5693a3e795 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,7 +77,7 @@ def restore_env_variables(): "XRT_HOST_WORLD_SIZE", "XRT_SHARD_ORDINAL", "XRT_SHARD_LOCAL_ORDINAL", - "ID", # set by HPUStrategy, + "ID", # set by HPUStrategy, } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" From 257d6447afd2f068592add447152b3b9865b52c9 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 9 Feb 2022 15:37:54 +0200 Subject: [PATCH 010/167] Update tests Signed-off-by: Jerome --- tests/accelerators/test_hpu.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index f951816177aa4..842ee19e9c368 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -22,10 +22,11 @@ from pytorch_lightning.accelerators import CPUAccelerator, HPUAccelerator from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.plugins import HPUPlugin, HPUPrecisionPlugin +from pytorch_lightning.plugins import HPUPrecisionPlugin +from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader -from pytorch_lightning.utilities import _HPU_AVAILABLE, DeviceType +from pytorch_lightning.utilities import _HPU_AVAILABLE, _AcceleratorType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -120,7 +121,7 @@ def test_warning_if_hpus_not_used(tmpdir): @RunIf(hpu=True) def test_no_warning_plugin(tmpdir): with pytest.warns(None) as record: - Trainer(default_root_dir=tmpdir, strategy=HPUPlugin(device=torch.device("hpu"))) + Trainer(default_root_dir=tmpdir, max_epochs=1, strategy=HPUStrategy(device=torch.device("hpu"))) assert len(record) == 0 @@ -188,13 +189,13 @@ def test_optimization(tmpdir): def test_mixed_precision(tmpdir): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: - assert trainer.accelerator.model.precision == "bf16" + assert trainer.strategy.model.precision == "bf16" raise SystemExit model = HPUModel() trainer = 
Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, precision="bf16", callbacks=TestCallback()) - assert isinstance(trainer.accelerator.precision_plugin, HPUPrecisionPlugin) - assert trainer.accelerator.precision_plugin.precision == "bf16" + assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) + assert trainer.strategy.precision_plugin.precision == "bf16" with pytest.raises(SystemExit): trainer.fit(model) @@ -203,8 +204,8 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st def test_pure_half_precision(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - assert trainer.accelerator.model.precision == 16 - for param in trainer.accelerator.model.parameters(): + assert trainer.strategy.model.precision == 16 + for param in trainer.strategy.model.parameters(): assert param.dtype == torch.float16 raise SystemExit @@ -212,9 +213,9 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = model.half() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, precision=16, callbacks=TestCallback()) - assert isinstance(trainer.accelerator.training_type_plugin, HPUPlugin) - assert isinstance(trainer.accelerator.precision_plugin, HPUPrecisionPlugin) - assert trainer.accelerator.precision_plugin.precision == 16 + assert isinstance(trainer.strategy, HPUStrategy) + assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) + assert trainer.strategy.precision_plugin.precision == 16 with pytest.raises(SystemExit): trainer.fit(model) @@ -344,11 +345,11 @@ def test_set_devices_if_none_hpu(): @RunIf(hpu=True) -def test_device_type(tmpdir): - """HPU does not support (trainer.training_type_plugin, HPUPlugin) yet.""" +def test_device_type_when_training_plugin_hpu_passed(tmpdir): - trainer = Trainer(hpus=8) - assert trainer._device_type == DeviceType.HPU + trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), hpus=8) + assert isinstance(trainer.strategy, HPUStrategy) + assert trainer._device_type == _AcceleratorType.HPU assert isinstance(trainer.accelerator, HPUAccelerator) From f1867cd9ba9da1be258610c16def0da21df08e07 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Feb 2022 13:40:23 +0000 Subject: [PATCH 011/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_hpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 842ee19e9c368..42efb114af485 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -26,7 +26,7 @@ from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader -from pytorch_lightning.utilities import _HPU_AVAILABLE, _AcceleratorType +from pytorch_lightning.utilities import _AcceleratorType, _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule From c61d68bcd904438fb77709b50b39a0cc1c06c4ee Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Feb 2022 13:38:52 +0200 Subject: [PATCH 012/167] Add configurable params for tests Signed-off-by: Jerome --- 
tests/accelerators/test_hpu.py | 13 ++----------- tests/conftest.py | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 42efb114af485..72ca4b06a5fdb 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -13,6 +13,7 @@ # limitations under the License. import os from typing import Optional +from argparse import ArgumentParser import pytest import torch @@ -94,7 +95,6 @@ def validation_epoch_end(self, outputs) -> None: def test_epoch_end(self, outputs) -> None: self.log("test_acc", torch.stack(outputs).mean()) - @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") def test_fail_if_no_hpus(tmpdir): with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): @@ -126,7 +126,6 @@ def test_no_warning_plugin(tmpdir): @RunIf(hpu=True) -@pytest.mark.parametrize("hpus", [1]) def test_all_stages(tmpdir, hpus): model = HPUModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=hpus) @@ -137,7 +136,6 @@ def test_all_stages(tmpdir, hpus): @RunIf(hpu=True) -@pytest.mark.parametrize("hpus", [1]) def test_inference_only(tmpdir, hpus): model = HPUModel() @@ -266,14 +264,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da @RunIf(hpu=True) -def test_precision_plugin(tmpdir): - """Ensure precision plugin value is set correctly.""" - hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] - hmp_params = dict.fromkeys(hmp_keys) - hmp_params["level"] = "O1" - hmp_params["verbose"] = False - hmp_params["bf16_ops"] = "./ops_bf16_mnist.txt" - hmp_params["fp32_ops"] = "./ops_fp32_mnist.txt" +def test_precision_plugin(tmpdir, hmp_params): plugin = HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) assert plugin.precision == "bf16" diff --git a/tests/conftest.py b/tests/conftest.py index 5ad5693a3e795..c9d176c16943b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -188,3 +188,25 @@ def pytest_collection_modifyitems(items): # has `@RunIf(slow=True)` if marker.name == "skipif" and marker.kwargs.get("slow") ] + + +def pytest_addoption(parser): + parser.addoption("--hpus", action="store", type=int, default=1, help="Number of hpus 1-8") + parser.addoption("--hmp-bf16", action="store", type=str, default='./ops_bf16_mnist.txt', help="bf16 ops list file in hmp O1 mode") + parser.addoption("--hmp-fp32", action="store", type=str, default='./ops_fp32_mnist.txt', help="fp32 ops list file in hmp O1 mode") + +@pytest.fixture +def hpus(request): + hpus = request.config.getoption("--hpus") + return hpus + +@pytest.fixture +def hmp_params(request): + """Ensure precision plugin value is set correctly.""" + hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] + hmp_params = dict.fromkeys(hmp_keys) + hmp_params["level"] = "O1" + hmp_params["verbose"] = False + hmp_params["bf16_ops"] = request.config.getoption("--hmp-bf16") + hmp_params["fp32_ops"] = request.config.getoption("--hmp-fp32") + return hmp_params From f74a89897f2cf5f121730fc0a6757d31b9184ee4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Feb 2022 11:41:14 +0000 Subject: [PATCH 013/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_hpu.py | 3 ++- tests/conftest.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git 
a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 72ca4b06a5fdb..33c14f3648d5a 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from typing import Optional from argparse import ArgumentParser +from typing import Optional import pytest import torch @@ -95,6 +95,7 @@ def validation_epoch_end(self, outputs) -> None: def test_epoch_end(self, outputs) -> None: self.log("test_acc", torch.stack(outputs).mean()) + @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") def test_fail_if_no_hpus(tmpdir): with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): diff --git a/tests/conftest.py b/tests/conftest.py index c9d176c16943b..f36737e1acab7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -192,14 +192,20 @@ def pytest_collection_modifyitems(items): def pytest_addoption(parser): parser.addoption("--hpus", action="store", type=int, default=1, help="Number of hpus 1-8") - parser.addoption("--hmp-bf16", action="store", type=str, default='./ops_bf16_mnist.txt', help="bf16 ops list file in hmp O1 mode") - parser.addoption("--hmp-fp32", action="store", type=str, default='./ops_fp32_mnist.txt', help="fp32 ops list file in hmp O1 mode") + parser.addoption( + "--hmp-bf16", action="store", type=str, default="./ops_bf16_mnist.txt", help="bf16 ops list file in hmp O1 mode" + ) + parser.addoption( + "--hmp-fp32", action="store", type=str, default="./ops_fp32_mnist.txt", help="fp32 ops list file in hmp O1 mode" + ) + @pytest.fixture def hpus(request): hpus = request.config.getoption("--hpus") return hpus + @pytest.fixture def hmp_params(request): """Ensure precision plugin value is set correctly.""" From 963cd1ec25efee8376518e0bd291a46818011541 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 11 Feb 2022 10:14:03 +0200 Subject: [PATCH 014/167] Enable inference test Signed-off-by: Jerome --- tests/accelerators/test_hpu.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 33c14f3648d5a..f28ccaa5176ed 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -136,14 +136,6 @@ def test_all_stages(tmpdir, hpus): trainer.predict(model) -@RunIf(hpu=True) -def test_inference_only(tmpdir, hpus): - model = HPUModel() - - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=hpus) - trainer.validate(model) - trainer.test(model) - trainer.predict(model) @RunIf(hpu=True) @@ -284,11 +276,16 @@ def test_accelerator_hpu(): ): trainer = Trainer(accelerator="hpu") + trainer = Trainer(accelerator="auto", hpus=8) assert trainer._device_type == "hpu" assert isinstance(trainer.accelerator, HPUAccelerator) + with pytest.raises( + MisconfigurationException, match="You passed `accelerator='hpu'`, but you didn't pass `hpus` to `Trainer`" + ): + trainer = Trainer(accelerator="hpu") @RunIf(hpu=True) def test_accelerator_cpu_with_hpus_flag(): @@ -350,3 +347,14 @@ def test_devices_auto_choice_hpu(): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.devices == 8 assert trainer.hpus == 8 + + +@RunIf(hpu=True) +@pytest.mark.parametrize("hpus", [1]) +def test_inference_only(tmpdir, hpus): + model = HPUModel() + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=hpus) + trainer.validate(model) + 
trainer.test(model) + trainer.predict(model) From 53a5416d430b7824864d08080df52bd805dcca14 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Feb 2022 08:16:43 +0000 Subject: [PATCH 015/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_hpu.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index f28ccaa5176ed..211af9b3af197 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -136,8 +136,6 @@ def test_all_stages(tmpdir, hpus): trainer.predict(model) - - @RunIf(hpu=True) def test_optimization(tmpdir): seed_everything(42) @@ -276,7 +274,6 @@ def test_accelerator_hpu(): ): trainer = Trainer(accelerator="hpu") - trainer = Trainer(accelerator="auto", hpus=8) assert trainer._device_type == "hpu" @@ -287,6 +284,7 @@ def test_accelerator_hpu(): ): trainer = Trainer(accelerator="hpu") + @RunIf(hpu=True) def test_accelerator_cpu_with_hpus_flag(): From 2de04e8fd94b5a098fe226da032edaf9ffbced0e Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 15 Feb 2022 07:46:53 +0200 Subject: [PATCH 016/167] Resolve issue with hmp params type and load hpu Signed-off-by: Jerome --- pytorch_lightning/lite/lite.py | 1 + pytorch_lightning/trainer/trainer.py | 2 +- pytorch_lightning/utilities/imports.py | 7 +++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 268602d6b85b3..c813f365ac05a 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -99,6 +99,7 @@ def __init__( precision=precision, amp_type="native", amp_level=None, + hmp_params=None, plugins=plugins, ) self._strategy = self._accelerator_connector.strategy diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f763cf23b3531..7430aa8cb35f1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -187,7 +187,7 @@ def __init__( plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]] = None, amp_backend: str = "native", amp_level: Optional[str] = None, - hmp_params: ["level", "verbose", "bf16_ops", "fp32_ops"] = None, + hmp_params: Optional[str] = None, move_metrics_to_cpu: bool = False, multiple_trainloader_mode: str = "max_size_cycle", stochastic_weight_avg: bool = False, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 07adfdc87dfbc..d2a81aff042e5 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -16,6 +16,7 @@ import operator import platform import sys +import os from importlib.util import find_spec from typing import Callable @@ -129,9 +130,11 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: else: _IPU_AVAILABLE = False -from habana_frameworks.torch.utils.library_loader import is_habana_avaialble - +from habana_frameworks.torch.utils.library_loader import is_habana_avaialble, load_habana_module _HPU_AVAILABLE = is_habana_avaialble() +if _HPU_AVAILABLE: + load_habana_module() + os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" # experimental feature within PyTorch Lightning. 
def _fault_tolerant_training() -> bool: From 0197b9c1c8c1c2ddfad3b137be4be527c6add86c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Feb 2022 05:49:14 +0000 Subject: [PATCH 017/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/utilities/imports.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index d2a81aff042e5..bc4da50f7ee08 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -14,9 +14,9 @@ """General utilities.""" import importlib import operator +import os import platform import sys -import os from importlib.util import find_spec from typing import Callable @@ -131,6 +131,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IPU_AVAILABLE = False from habana_frameworks.torch.utils.library_loader import is_habana_avaialble, load_habana_module + _HPU_AVAILABLE = is_habana_avaialble() if _HPU_AVAILABLE: load_habana_module() From b4126383c1d51ed43cbc1c8efba288124e8e019d Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 17 Feb 2022 13:48:42 +0200 Subject: [PATCH 018/167] Move hmp_params to HPUPrecision plugin Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 13 ++++--- pytorch_lightning/lite/lite.py | 1 - pytorch_lightning/strategies/hpu.py | 8 +++- .../connectors/accelerator_connector.py | 4 +- pytorch_lightning/trainer/trainer.py | 4 -- tests/accelerators/test_hpu.py | 39 +++++++++++++++---- 6 files changed, 47 insertions(+), 22 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index d9d12fd93bc2d..f1eaccc328ffb 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -11,7 +11,8 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks import HPUStatsMonitor - +from pytorch_lightning.plugins import HPUPrecisionPlugin +from pytorch_lightning.strategies.hpu import HPUStrategy class MNISTModel(pl.LightningModule): def __init__(self): @@ -47,14 +48,16 @@ def configure_optimizers(self): hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") +parallel_devices = 1 +hpustrat=HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None)) + # Initialize a trainer trainer = pl.Trainer( - devices=1, + strategy= None if (parallel_devices==8) else hpustrat, + devices=parallel_devices, callbacks=[hpu_stats], max_epochs=1, - precision=32, - hmp_params=hmp_params, - default_root_dir="/tmp/", + default_root_dir=os.getcwd(), accelerator="hpu", ) diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index c813f365ac05a..268602d6b85b3 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -99,7 +99,6 @@ def __init__( precision=precision, amp_type="native", amp_level=None, - hmp_params=None, plugins=plugins, ) self._strategy = self._accelerator_connector.strategy diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 3fb6265988477..48c209db291f5 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -19,6 +19,8 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO +from 
pytorch_lightning.plugins.precision import PrecisionPlugin +from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.apply_func import move_data_to_device @@ -33,11 +35,13 @@ def __init__( self, device: int, checkpoint_io: Optional[HPUCheckpointIO] = None, + precision_plugin: Optional[PrecisionPlugin] = None, + hmp_params: Optional[str] = None, ): - device = torch.device("hpu") + device = device checkpoint_io = checkpoint_io or HPUCheckpointIO() - super().__init__(device, checkpoint_io=checkpoint_io) + super().__init__(device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) def setup(self, trainer: "pl.Trainer") -> None: self.model_to_device() diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c41783cd04b37..12618b27f9734 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -105,7 +105,6 @@ def __init__( precision, amp_type, amp_level, - hmp_params, plugins, ): # initialization @@ -138,7 +137,6 @@ def __init__( self.precision = precision self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None self.amp_level = amp_level - self.hmp_params = hmp_params self._precision_plugin: Optional[PrecisionPlugin] = None self._strategy: Optional[Strategy] = None @@ -690,7 +688,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: raise MisconfigurationException( f"`Trainer(accelerator='hpu', precision={self.precision!r})` is not supported." ) - return HPUPrecisionPlugin(self.precision, self.hmp_params) + return HPUPrecisionPlugin(self.precision) if self._strategy_type == _StrategyType.DEEPSPEED or isinstance(self._strategy, DeepSpeedStrategy): return DeepSpeedPrecisionPlugin(self.precision, self.amp_type, self.amp_level) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7430aa8cb35f1..8f22fb47cb195 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -187,7 +187,6 @@ def __init__( plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]] = None, amp_backend: str = "native", amp_level: Optional[str] = None, - hmp_params: Optional[str] = None, move_metrics_to_cpu: bool = False, multiple_trainloader_mode: str = "max_size_cycle", stochastic_weight_avg: bool = False, @@ -391,8 +390,6 @@ def __init__( hpus: How many HPUs to train on. - hmp_params: list of habana mixed precision parameters - track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. If using Automatic Mixed Precision (AMP), the gradients will be unscaled before logging them. 
@@ -459,7 +456,6 @@ def __init__( precision, amp_backend, amp_level, - hmp_params, plugins, ) self.logger_connector = LoggerConnector(self, log_gpu_memory) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 211af9b3af197..1cb51780e2239 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -25,6 +25,7 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy +from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import _AcceleratorType, _HPU_AVAILABLE @@ -175,14 +176,16 @@ def test_optimization(tmpdir): @RunIf(hpu=True) -def test_mixed_precision(tmpdir): +def test_mixed_precision(tmpdir, hmp_params): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert trainer.strategy.model.precision == "bf16" raise SystemExit model = HPUModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, precision="bf16", callbacks=TestCallback()) + trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params)), + default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback()) + assert isinstance(trainer.strategy, HPUStrategy) assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == "bf16" with pytest.raises(SystemExit): @@ -190,7 +193,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st @RunIf(hpu=True) -def test_pure_half_precision(tmpdir): +def test_pure_half_precision(tmpdir, hmp_params): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert trainer.strategy.model.precision == 16 @@ -200,7 +203,8 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = HPUModel() model = model.half() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=1, precision=16, callbacks=TestCallback()) + trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None)), + default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback()) assert isinstance(trainer.strategy, HPUStrategy) assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) @@ -295,12 +299,21 @@ def test_accelerator_cpu_with_hpus_flag(): @RunIf(hpu=True) -def test_accelerator_hpu_with_devices(): - """HPU does not support isinstance(trainer.training_type_plugin, HPUPlugin) yet.""" +def test_accelerator_hpu_with_single_device(): + + trainer = Trainer(accelerator="hpu", devices=1) + + assert trainer.hpus == 1 + assert isinstance(trainer.strategy, HPUStrategy) + assert isinstance(trainer.accelerator, HPUAccelerator) + + +def test_accelerator_hpu_with_multiple_devices(): trainer = Trainer(accelerator="hpu", devices=8) assert trainer.hpus == 8 + assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.accelerator, HPUAccelerator) @@ -331,10 +344,22 @@ def test_set_devices_if_none_hpu(): assert trainer.devices == 8 +@RunIf(hpu=True) +def 
test_strategy_choice_hpu_plugin(tmpdir): + trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) + assert isinstance(trainer.strategy, HPUStrategy) + + +@RunIf(hpu=True) +def test_strategy_choice_hpu_ddp_plugin(tmpdir): + trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=8) + assert isinstance(trainer.strategy, HPUStrategy) + + @RunIf(hpu=True) def test_device_type_when_training_plugin_hpu_passed(tmpdir): - trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), hpus=8) + trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) assert isinstance(trainer.strategy, HPUStrategy) assert trainer._device_type == _AcceleratorType.HPU assert isinstance(trainer.accelerator, HPUAccelerator) From e549434436ea83272ddcf1f0ad6b2e5997f32991 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Feb 2022 11:54:15 +0000 Subject: [PATCH 019/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hpu_examples/simple_mnist/mnist.py | 5 ++-- tests/accelerators/test_hpu.py | 26 +++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index f1eaccc328ffb..81228b372cbf5 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -14,6 +14,7 @@ from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy + class MNISTModel(pl.LightningModule): def __init__(self): super().__init__() @@ -49,11 +50,11 @@ def configure_optimizers(self): hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") parallel_devices = 1 -hpustrat=HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None)) +hpustrat = HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None)) # Initialize a trainer trainer = pl.Trainer( - strategy= None if (parallel_devices==8) else hpustrat, + strategy=None if (parallel_devices == 8) else hpustrat, devices=parallel_devices, callbacks=[hpu_stats], max_epochs=1, diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 1cb51780e2239..a711cc542771e 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -24,8 +24,8 @@ from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import _AcceleratorType, _HPU_AVAILABLE @@ -183,8 +183,16 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st raise SystemExit model = HPUModel() - trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params)), - default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback()) + trainer = 
Trainer( + strategy=HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) + ), + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="hpu", + devices=1, + callbacks=TestCallback(), + ) assert isinstance(trainer.strategy, HPUStrategy) assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == "bf16" @@ -203,8 +211,16 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = HPUModel() model = model.half() - trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None)), - default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback()) + trainer = Trainer( + strategy=HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) + ), + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="hpu", + devices=1, + callbacks=TestCallback(), + ) assert isinstance(trainer.strategy, HPUStrategy) assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) From 1cc0a37834695dc29e9b09c3bfd0f3a6475f5b8d Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 18 Feb 2022 11:47:41 +0200 Subject: [PATCH 020/167] Update habana distributed with ddp subclass Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 6 +- .../overrides/torch_distributed.py | 1 + pytorch_lightning/strategies/__init__.py | 1 + pytorch_lightning/strategies/ddp.py | 9 +-- pytorch_lightning/strategies/hpu.py | 2 +- pytorch_lightning/strategies/hpu_parallel.py | 73 +++++++++++++++++++ .../connectors/accelerator_connector.py | 13 +++- pytorch_lightning/utilities/distributed.py | 7 -- pytorch_lightning/utilities/enums.py | 2 + pytorch_lightning/utilities/imports.py | 3 - tests/accelerators/test_hpu.py | 16 ++-- tests/conftest.py | 2 +- 12 files changed, 101 insertions(+), 34 deletions(-) create mode 100644 pytorch_lightning/strategies/hpu_parallel.py diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 81228b372cbf5..938ea3acabbba 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -13,6 +13,7 @@ from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy +from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy class MNISTModel(pl.LightningModule): @@ -50,11 +51,12 @@ def configure_optimizers(self): hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") parallel_devices = 1 -hpustrat = HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None)) +hpustrat_1 = HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)) +hpustrat_8 = HPUParallelStrategy(parallel_devices=[torch.device("hpu")]*parallel_devices, precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)) # Initialize a trainer trainer = pl.Trainer( - strategy=None if (parallel_devices == 8) else hpustrat, + strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, devices=parallel_devices, callbacks=[hpu_stats], max_epochs=1, diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py 
index 2ea88bcdf9d75..c915235ec0e73 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -19,6 +19,7 @@ from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember # The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py` +# the distributed backend and tensor type updates for habana backend is done here before broadcast # https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L256 def _rank_not_in_group(group: ProcessGroup): diff --git a/pytorch_lightning/strategies/__init__.py b/pytorch_lightning/strategies/__init__.py index 2ed3e841ed849..873e13766b242 100644 --- a/pytorch_lightning/strategies/__init__.py +++ b/pytorch_lightning/strategies/__init__.py @@ -9,6 +9,7 @@ from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 from pytorch_lightning.strategies.hpu import HPUStrategy # noqa: F401 +from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 from pytorch_lightning.strategies.sharded import DDPShardedStrategy # noqa: F401 diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 331d9c9f48cb8..5752d8129c85a 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -34,10 +34,8 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.trainer.states import TrainerFn @@ -96,7 +94,7 @@ def __init__( accelerator=accelerator, parallel_devices=parallel_devices, cluster_environment=cluster_environment, - checkpoint_io=checkpoint_io or HPUCheckpointIO(), + checkpoint_io=checkpoint_io, precision_plugin=precision_plugin, ) log.detail(f"{self.__class__.__name__}: initializing DDP plugin") @@ -383,10 +381,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - if self.root_device.type == "hpu": - broadcast_object_list(obj, src, group=_group.WORLD) - else: - torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 48c209db291f5..ab35a1ab7fdee 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -51,7 +51,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: super().setup_optimizers(trainer) if len(self.optimizers) > 1: - raise MisconfigurationException("HPUs currently only support one optimizer.") + raise MisconfigurationException("HPUs currently support only one optimizer.") def 
model_to_device(self) -> None:
         self.model.to(self.root_device)
diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py
new file mode 100644
index 0000000000000..9497bd8588aae
--- /dev/null
+++ b/pytorch_lightning/strategies/hpu_parallel.py
@@ -0,0 +1,73 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import List, Optional
+
+import __main__
+
+import torch
+import torch.distributed
+
+import pytorch_lightning as pl
+from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
+from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
+from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO
+from pytorch_lightning.plugins.precision import PrecisionPlugin
+from pytorch_lightning.strategies.ddp import DDPStrategy
+from pytorch_lightning.utilities.distributed import group as _group
+from pytorch_lightning.utilities.enums import _StrategyType
+
+
+class HPUParallelStrategy(DDPStrategy):
+    """Strategy for multi-process single-device training on one or multiple nodes.
+
+    The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of
+    devices (e.g. HPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes.
+ """ + + distributed_backend = _StrategyType.HPU_PARALLEL + + def __init__( + self, + accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, + parallel_devices: Optional[List[torch.device]] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[PrecisionPlugin] = None, + ) -> None: + super().__init__( + accelerator=accelerator, + parallel_devices=parallel_devices, + checkpoint_io=checkpoint_io or HPUCheckpointIO(), + precision_plugin=precision_plugin, + ) + + def setup_environment(self) -> None: + + import habana_frameworks.torch.core.hccl + + os.environ["ID"] = str(self.local_rank) + os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" + + super().setup_environment() + + def broadcast(self, obj: object, src: int = 0) -> object: + obj = [obj] + if self.global_rank != src: + obj = [None] + if self.root_device.type == "hpu": + broadcast_object_list(obj, src, group=_group.WORLD) + else: + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + + return obj[0] diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 12618b27f9734..246c0a309ed97 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -60,6 +60,7 @@ DeepSpeedStrategy, HorovodStrategy, HPUStrategy, + HPUParallelStrategy, IPUStrategy, SingleDeviceStrategy, SingleTPUStrategy, @@ -804,7 +805,11 @@ def select_strategy(self) -> Strategy: elif self.use_ipu: plugin = IPUStrategy(parallel_devices=self.parallel_devices) elif self.use_hpu: - plugin = HPUStrategy(device=torch.device("hpu")) + if len(self.parallel_devices) > 1 : + plugin = HPUParallelStrategy( + parallel_devices=self.parallel_devices) + else: + plugin = HPUStrategy(device=torch.device("hpu")) else: single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) plugin = SingleDeviceStrategy(device=single_gpu_ordinal if self.use_gpu else "cpu") @@ -888,7 +893,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if self.has_horovodrun(): self._set_horovod_backend() elif self.num_hpus > 1 and not _use_cpu: - self.distributed_backend = _StrategyType.DDP + self.distributed_backend = _StrategyType.HPU_PARALLEL elif self.num_gpus == 0 and self.num_nodes > 1: self._strategy_type = _StrategyType.DDP elif self.num_gpus == 0 and self.num_processes > 1: @@ -928,7 +933,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): self._device_type = _AcceleratorType.IPU elif self.has_hpu and not _use_cpu: self._device_type = _AcceleratorType.HPU - self._strategy_type = _StrategyType.DDP + self._strategy_type = _StrategyType.HPU_PARALLEL elif self.distributed_backend and self._strategy_type is None: self._strategy_type = _StrategyType(self.distributed_backend) @@ -967,7 +972,7 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if self._device_type == _AcceleratorType.GPU and self._strategy_type == _StrategyType.DDP2: self.num_processes = self.num_nodes - if self._device_type == _AcceleratorType.HPU and self._strategy_type == _StrategyType.DDP: + if self._device_type == _AcceleratorType.HPU and self._strategy_type == _StrategyType.HPU_PARALLEL: self.num_processes = self.num_hpus # Horovod is an extra case... 
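
Taken together, the accelerator-connector changes above mean the strategy is derived from the requested device count: a single HPU maps to the single-device `HPUStrategy`, while more than one device maps to the DDP-based `HPUParallelStrategy`, which switches the distributed backend to `hccl` in `setup_environment`. A minimal sketch of expressing the same choice explicitly on the user side, assuming a Habana-enabled environment and an assumed device count of 8 (neither value is fixed by this patch):

    import torch
    import pytorch_lightning as pl
    from pytorch_lightning.strategies.hpu import HPUStrategy
    from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy

    num_hpus = 8  # assumed; the connector derives the same decision from `devices`

    # Mirror select_strategy(): one device uses the single-device strategy, more use the parallel one.
    if num_hpus > 1:
        strategy = HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * num_hpus)
    else:
        strategy = HPUStrategy(device=torch.device("hpu"))

    trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=strategy)

Passing the strategy explicitly is equivalent to letting the connector pick it; the tests later in this series exercise both paths.
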
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index f7fb9563db42f..7bead0a834ed8 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -342,13 +342,6 @@ def init_dist_connection( os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) - # TBD: move this to a hpu based ddp plugin - # local rank mapping for device open is needed for hpu devices - if torch_distributed_backend == "hccl" and _HPU_AVAILABLE: - import habana_frameworks.torch.core.hccl - - os.environ["ID"] = str(cluster_environment.local_rank()) - log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index f58bdd1b17624..27e134e198a55 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -123,6 +123,7 @@ class DistributedType(LightningEnum, metaclass=_OnAccessEnumMeta): DDP_SHARDED = "ddp_sharded" DDP_SHARDED_SPAWN = "ddp_sharded_spawn" DDP_FULLY_SHARDED = "ddp_fully_sharded" + HPU_PARALLEL = "hpu_parallel" @staticmethod def interactive_compatible_types() -> list[DistributedType]: @@ -248,6 +249,7 @@ class _StrategyType(LightningEnum): DDP_SHARDED_SPAWN = "ddp_sharded_spawn" DDP_FULLY_SHARDED = "ddp_fully_sharded" BAGUA = "bagua" + HPU_PARALLEL = "hpu_parallel" @staticmethod def interactive_compatible_types() -> list[_StrategyType]: diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index bc4da50f7ee08..47727e00cc7bc 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -133,9 +133,6 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: from habana_frameworks.torch.utils.library_loader import is_habana_avaialble, load_habana_module _HPU_AVAILABLE = is_habana_avaialble() -if _HPU_AVAILABLE: - load_habana_module() - os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" # experimental feature within PyTorch Lightning. 
def _fault_tolerant_training() -> bool: diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index a711cc542771e..29141cfcc01c3 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -21,13 +21,10 @@ from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.accelerators import CPUAccelerator, HPUAccelerator -from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.strategies.hpu import HPUStrategy -from pytorch_lightning.trainer.states import RunningStage, TrainerFn -from pytorch_lightning.trainer.supporters import CombinedLoader from pytorch_lightning.utilities import _AcceleratorType, _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel @@ -130,7 +127,7 @@ def test_no_warning_plugin(tmpdir): @RunIf(hpu=True) def test_all_stages(tmpdir, hpus): model = HPUModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, hpus=hpus) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices="auto") trainer.fit(model) trainer.validate(model) trainer.test(model) @@ -329,7 +326,7 @@ def test_accelerator_hpu_with_multiple_devices(): trainer = Trainer(accelerator="hpu", devices=8) assert trainer.hpus == 8 - assert isinstance(trainer.strategy, DDPStrategy) + assert isinstance(trainer.strategy, HPUParallelStrategy) assert isinstance(trainer.accelerator, HPUAccelerator) @@ -340,6 +337,7 @@ def test_accelerator_auto_with_devices_hpu(): assert trainer._device_type == "hpu" assert trainer.hpus == 8 + assert isinstance(trainer.strategy, HPUParallelStrategy) @RunIf(hpu=True) @@ -367,9 +365,9 @@ def test_strategy_choice_hpu_plugin(tmpdir): @RunIf(hpu=True) -def test_strategy_choice_hpu_ddp_plugin(tmpdir): - trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=8) - assert isinstance(trainer.strategy, HPUStrategy) +def test_strategy_choice_hpu_parallel_plugin(tmpdir): + trainer = Trainer(strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")]*8), accelerator="hpu", devices=8) + assert isinstance(trainer.strategy, HPUParallelStrategy) @RunIf(hpu=True) diff --git a/tests/conftest.py b/tests/conftest.py index f36737e1acab7..48fb8e64cdd67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,7 +77,7 @@ def restore_env_variables(): "XRT_HOST_WORLD_SIZE", "XRT_SHARD_ORDINAL", "XRT_SHARD_LOCAL_ORDINAL", - "ID", # set by HPUStrategy, + "ID", # used by HPU for acquiring the right gaudi device based on rank, } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" From aeda6818f4e77432e074a354547ab171078b1d96 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 18 Feb 2022 15:59:36 +0200 Subject: [PATCH 021/167] Add hpu backend, datatype checks Signed-off-by: Jerome --- pytorch_lightning/overrides/torch_distributed.py | 6 +++++- pytorch_lightning/strategies/ddp.py | 1 - pytorch_lightning/utilities/distributed.py | 12 +++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 
c915235ec0e73..36168863c603c 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -139,7 +139,11 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): elif is_hpu_backend: current_device = torch.device("hpu") # Workaround: HPU doesn't not support long tensors for collectives - object_sizes_tensor = object_sizes_tensor.int() + if (object_sizes_tensor.type() == "torch.LongTensor") or \ + (object_sizes_tensor.type() == "torch.hpu.LongTensor"): + object_sizes_tensor = object_sizes_tensor.int() + else: + print("unhandled hpu object_sizes_tensor type :: ", object_sizes_tensor.type()) object_sizes_tensor = object_sizes_tensor.to(current_device) # Broadcast object sizes diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 5752d8129c85a..0c4d1e81ad504 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -382,7 +382,6 @@ def broadcast(self, obj: object, src: int = 0) -> object: if self.global_rank != src: obj = [None] torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) - return obj[0] def pre_backward(self, closure_loss: torch.Tensor) -> None: diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 7bead0a834ed8..aaf33429a038e 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -20,6 +20,7 @@ import torch from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel +from torch.distributed import Backend, get_backend import pytorch_lightning as pl from pytorch_lightning.utilities.imports import ( @@ -130,9 +131,15 @@ def sync_ddp( else: op = reduce_op - # WA for HPU. HPU doesn't support Long types + # WA for HPU. 
HPU doesn't support Long types, forcefully set it to float if _HPU_AVAILABLE: - result = result.float() + group_backend = get_backend(group) + dist_backend = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND") + is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) + if is_hpu_backend: + if (result.type() == "torch.LongTensor") or \ + (result.type() == "torch.hpu.LongTensor"): + result = result.float() # sync all processes before reduction torch.distributed.barrier(group=group) @@ -341,7 +348,6 @@ def init_dist_connection( world_size = world_size if world_size is not None else cluster_environment.world_size() os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) - log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) From 123112db87c2fc2706edff61106dddb403fc3470 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Feb 2022 13:52:57 +0000 Subject: [PATCH 022/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hpu_examples/simple_mnist/mnist.py | 9 +++++++-- pytorch_lightning/accelerators/hpu.py | 3 ++- .../overrides/torch_distributed.py | 3 +-- pytorch_lightning/strategies/hpu.py | 6 +++--- pytorch_lightning/strategies/hpu_parallel.py | 3 +-- .../connectors/accelerator_connector.py | 2 +- pytorch_lightning/trainer/trainer.py | 7 +++---- pytorch_lightning/utilities/distributed.py | 5 ++--- tests/accelerators/test_common.py | 8 +++++++- tests/accelerators/test_hpu.py | 19 ++++++++++++------- 10 files changed, 39 insertions(+), 26 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 938ea3acabbba..a6bbfc64f86e4 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -51,8 +51,13 @@ def configure_optimizers(self): hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") parallel_devices = 1 -hpustrat_1 = HPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)) -hpustrat_8 = HPUParallelStrategy(parallel_devices=[torch.device("hpu")]*parallel_devices, precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)) +hpustrat_1 = HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) +) +hpustrat_8 = HPUParallelStrategy( + parallel_devices=[torch.device("hpu")] * parallel_devices, + precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params), +) # Initialize a trainer trainer = pl.Trainer( diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index bb8b6837e33f6..8f83423225fa1 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -19,6 +19,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import _HPU_AVAILABLE + class HPUAccelerator(Accelerator): """Accelerator for HPU devices.""" @@ -34,4 +35,4 @@ def auto_device_count() -> int: @staticmethod def is_available() -> bool: - return _HPU_AVAILABLE \ No newline at end of file + return _HPU_AVAILABLE diff --git 
a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 36168863c603c..865ed00213f32 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -139,8 +139,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): elif is_hpu_backend: current_device = torch.device("hpu") # Workaround: HPU doesn't not support long tensors for collectives - if (object_sizes_tensor.type() == "torch.LongTensor") or \ - (object_sizes_tensor.type() == "torch.hpu.LongTensor"): + if (object_sizes_tensor.type() == "torch.LongTensor") or (object_sizes_tensor.type() == "torch.hpu.LongTensor"): object_sizes_tensor = object_sizes_tensor.int() else: print("unhandled hpu object_sizes_tensor type :: ", object_sizes_tensor.type()) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 6cde547c6b186..651f56ea9a135 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -30,7 +30,7 @@ class HPUStrategy(SingleDeviceStrategy): """Strategy for training on HPU devices.""" - + strategy_name = "hpu_single" def __init__( @@ -45,8 +45,8 @@ def __init__( device = device checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__( - accelerator=accelerator, - device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin) + accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin + ) @property def is_distributed(self) -> bool: diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 412a205d07570..9cc3d6a18c3b1 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -15,7 +15,6 @@ from typing import Any, Dict, List, Optional, Union import __main__ - import torch import torch.distributed @@ -59,7 +58,7 @@ def setup_environment(self) -> None: os.environ["ID"] = str(self.local_rank) os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" - + super().setup_environment() def broadcast(self, obj: object, src: int = 0) -> object: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index b96ae757f819a..2a347fdcd216c 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -58,8 +58,8 @@ DDPStrategy, DeepSpeedStrategy, HorovodStrategy, - HPUStrategy, HPUParallelStrategy, + HPUStrategy, IPUStrategy, ParallelStrategy, SingleDeviceStrategy, diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index afe4aa00ac933..9627f2e87d067 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -30,7 +30,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl -from pytorch_lightning.accelerators import Accelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator, HPUAccelerator +from pytorch_lightning.accelerators import Accelerator, GPUAccelerator, HPUAccelerator, IPUAccelerator, TPUAccelerator from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter from pytorch_lightning.core.datamodule import LightningDataModule @@ -75,9 +75,9 @@ from pytorch_lightning.tuner.lr_finder import _LRFinder from 
pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import ( + _HPU_AVAILABLE, _IPU_AVAILABLE, _TPU_AVAILABLE, - _HPU_AVAILABLE, AMPType, device_parser, GradClipAlgorithmType, @@ -1808,8 +1808,7 @@ def _log_device_info(self) -> None: if _HPU_AVAILABLE and not isinstance(self.accelerator, HPUAccelerator): rank_zero_warn( - "HPU available but not used. Set the `devices` flag in your trainer" - " `Trainer(devices=8)`." + "HPU available but not used. Set the `devices` flag in your trainer" " `Trainer(devices=8)`." ) """ diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 57873fe3a54dc..385effbeee2a1 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -18,9 +18,9 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch +from torch.distributed import Backend, get_backend from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel -from torch.distributed import Backend, get_backend import pytorch_lightning as pl from pytorch_lightning.utilities.imports import ( @@ -137,8 +137,7 @@ def sync_ddp( dist_backend = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND") is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) if is_hpu_backend: - if (result.type() == "torch.LongTensor") or \ - (result.type() == "torch.hpu.LongTensor"): + if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): result = result.float() # sync all processes before reduction diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index e46563b65baab..302e1fe3a617b 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -13,7 +13,13 @@ # limitations under the License. 
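
The `sync_ddp` and `_broadcast_object_list` hunks above implement the same datatype workaround: when the process group runs on the Habana `hccl` backend, 64-bit integer tensors are downcast before the collective (to float for reduced results, to int32 for the broadcast size tensor), since long tensors are not supported there. A condensed sketch of the check, assuming the process group is already initialized and that `PL_TORCH_DISTRIBUTED_BACKEND` was exported as "hccl" by the HPU parallel strategy:

    import os
    import torch
    from torch.distributed import Backend, get_backend

    def downcast_if_hpu_backend(result: torch.Tensor, group=None) -> torch.Tensor:
        # HPUParallelStrategy.setup_environment() exports PL_TORCH_DISTRIBUTED_BACKEND="hccl";
        # the long-tensor restriction only applies when that backend is in use.
        dist_backend = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND")
        if dist_backend is None:
            return result
        is_hpu_backend = get_backend(group) == Backend(str(dist_backend))
        if is_hpu_backend and result.type() in ("torch.LongTensor", "torch.hpu.LongTensor"):
            # hccl cannot reduce 64-bit integer tensors, so the value is reduced as float instead
            result = result.float()
        return result
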
from unittest import mock -from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator, HPUAccelerator +from pytorch_lightning.accelerators import ( + CPUAccelerator, + GPUAccelerator, + HPUAccelerator, + IPUAccelerator, + TPUAccelerator, +) @mock.patch("torch.cuda.device_count", return_value=2) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index f4beca2da4fcb..b554933a7a3aa 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -23,8 +23,8 @@ from pytorch_lightning.accelerators import CPUAccelerator, HPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.strategies.hpu import HPUStrategy +from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel @@ -97,6 +97,7 @@ def test_epoch_end(self, outputs) -> None: def test_availability(): assert HPUAccelerator.is_available() + @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") def test_fail_if_no_hpus(tmpdir): with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): @@ -177,8 +178,8 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st model = HPUModel() trainer = Trainer( - strategy=HPUStrategy(device=torch.device("hpu"), - precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) + strategy=HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) ), default_root_dir=tmpdir, fast_dev_run=True, @@ -205,8 +206,8 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = HPUModel() model = model.half() trainer = Trainer( - strategy=HPUStrategy(device=torch.device("hpu"), - precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) + strategy=HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) ), default_root_dir=tmpdir, fast_dev_run=True, @@ -260,7 +261,9 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da assert torch.all(outputs == 4).item() model = StageModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback()) + trainer = Trainer( + default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback() + ) trainer.fit(model) trainer.test(model) trainer.validate(model) @@ -327,7 +330,9 @@ def test_strategy_choice_hpu_plugin(tmpdir): @RunIf(hpu=True) def test_strategy_choice_hpu_parallel_plugin(tmpdir): - trainer = Trainer(strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")]*8), accelerator="hpu", devices=8) + trainer = Trainer( + strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * 8), accelerator="hpu", devices=8 + ) assert isinstance(trainer.strategy, HPUParallelStrategy) From ede68ebafc9f1bbfb33d1a2ec7f012ed38beb65a Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 23 Feb 2022 16:03:20 +0200 Subject: [PATCH 023/167] Remove unused param for 'on_train_batch_end' in hpu test Signed-off-by: Jerome --- tests/accelerators/test_hpu.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index b554933a7a3aa..1bc0c8d4de0e8 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -248,7 +248,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=None): return (output - output) + torch.tensor(4) class TestCallback(Callback): - def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None: assert outputs["loss"].item() == 1 def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: From 3a029c1412e57285614da5610f6d861fa27ad16f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Mar 2022 03:43:30 +0000 Subject: [PATCH 024/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/utilities/distributed.py | 2 +- tests/accelerators/test_common.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 61ca914c576f2..21f5d2c8a4775 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -18,9 +18,9 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -from torch.nn.parallel.distributed import DistributedDataParallel from torch.distributed import Backend, get_backend from torch.nn import Module +from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl from pytorch_lightning.utilities.imports import ( diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index 37e419254657b..12aa1ea866015 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -14,7 +14,14 @@ from unittest import mock from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator, HPUAccelerator +from pytorch_lightning.accelerators import ( + Accelerator, + CPUAccelerator, + GPUAccelerator, + HPUAccelerator, + IPUAccelerator, + TPUAccelerator, +) from pytorch_lightning.strategies import DDPStrategy From 0a959f03dfedda78676067d67fced1511ee54310 Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 3 Mar 2022 07:35:32 +0200 Subject: [PATCH 025/167] Addres review comments - remove unused 'hpus' param from trainer - make params configurable in example test - update basic test with strategy Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 155 ++++++++++++------ pytorch_lightning/plugins/__init__.py | 2 - .../plugins/training_type/__init__.py | 1 - .../plugins/training_type/single_hpu.py | 24 --- pytorch_lightning/trainer/trainer.py | 6 +- tests/accelerators/test_hpu.py | 11 +- 6 files changed, 118 insertions(+), 81 deletions(-) delete mode 100644 pytorch_lightning/plugins/training_type/single_hpu.py diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index a6bbfc64f86e4..61bdb99f30b66 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -1,73 +1,132 @@ -import os -import sys +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -import habana_frameworks.torch.core as htcore +import os import torch -from torch import nn from torch.nn import functional as F -from torch.utils.data import DataLoader, random_split -from torchvision import transforms -from torchvision.datasets import MNIST import pytorch_lightning as pl + +from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule + +import habana_frameworks.torch.core as htcore from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy +import argparse + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description='PyTorch Classification Training') + parser.add_argument('-b', '--batch-size', default=32, type=int) + parser.add_argument('--epochs', default=1, type=int, metavar='N', + help='number of total epochs to run') + parser.add_argument('--hpus', default=1, type=int, metavar='N', + help='number of habana accelerator for training (default: 1)') + parser.add_argument('--hmp', dest='is_hmp', action='store_true', help='enable habana mixed precision mode') + parser.add_argument('--hmp-bf16', default='', help='path to bf16 ops list in hmp O1 mode') + parser.add_argument('--hmp-fp32', default='', help='path to fp32 ops list in hmp O1 mode') + parser.add_argument('--hmp-opt-level', default='O1', help='choose optimization level for hmp') + parser.add_argument('--hmp-verbose', action='store_true', help='enable verbose mode for hmp') + + args = parser.parse_args() + + return args + +class LitClassifier(pl.LightningModule): -class MNISTModel(pl.LightningModule): def __init__(self): - super().__init__() + super(LitClassifier, self).__init__() + self.l1 = torch.nn.Linear(28 * 28, 10) def forward(self, x): return torch.relu(self.l1(x.view(x.size(0), -1))) - def training_step(self, batch, batch_nb): + def training_step(self, batch, batch_idx): x, y = batch loss = F.cross_entropy(self(x), y) return loss + def validation_step(self, batch, batch_idx): + x, y = batch + probs = self(x) + acc = self.accuracy(probs, y) + return acc + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + acc = self.accuracy(logits, y) + return acc + + def accuracy(self, logits, y): + acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) + return acc + + def validation_epoch_end(self, outputs) -> None: + self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True) + + def test_epoch_end(self, outputs) -> None: + self.log("test_acc", torch.stack(outputs).mean()) + def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.02) +if __name__ == "__main__": + + args = parse_args() + + # Init our model + model = LitClassifier() + + # Init DataLoader from MNIST Dataset + dm = MNISTDataModule(batch_size=args.batch_size) + + # TBD: import these keys from hmp + hmp_keys = ["level", 
"verbose", "bf16_ops", "fp32_ops"] + hmp_params = dict.fromkeys(hmp_keys) + hmp_params["level"] = args.hmp_opt_level + hmp_params["verbose"] = args.hmp_verbose + hmp_params["bf16_ops"] = args.hmp_bf16 #"./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" + hmp_params["fp32_ops"] = args.hmp_fp32 #"./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" + + hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") + + parallel_devices = args.hpus + hpustrat_1 = HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) + ) + hpustrat_8 = HPUParallelStrategy( + parallel_devices=[torch.device("hpu")] * parallel_devices, + precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params), + ) + + # Initialize a trainer + trainer = pl.Trainer( + strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, + devices=parallel_devices, + callbacks=[hpu_stats], + max_epochs=args.epochs, + default_root_dir=os.getcwd(), + accelerator="hpu", + ) + -# Init our model -mnist_model = MNISTModel() - -# Init DataLoader from MNIST Dataset -train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) -train_loader = DataLoader(train_ds, batch_size=32) - -# TBD: import these keys from hmp -hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] -hmp_params = dict.fromkeys(hmp_keys) -hmp_params["level"] = "O1" -hmp_params["verbose"] = False -hmp_params["bf16_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" -hmp_params["fp32_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" - -hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") - -parallel_devices = 1 -hpustrat_1 = HPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) -) -hpustrat_8 = HPUParallelStrategy( - parallel_devices=[torch.device("hpu")] * parallel_devices, - precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params), -) - -# Initialize a trainer -trainer = pl.Trainer( - strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, - devices=parallel_devices, - callbacks=[hpu_stats], - max_epochs=1, - default_root_dir=os.getcwd(), - accelerator="hpu", -) - -# Train the model ⚡ -trainer.fit(mnist_model, train_loader) + # Train the model ⚡ + trainer.fit(model, datamodule=dm) + trainer.test(model, datamodule=dm) + trainer.validate(model, datamodule=dm) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 6a24e785f9530..1d384412f252a 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -29,7 +29,6 @@ from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin -from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin @@ -54,7 +53,6 @@ "HorovodPlugin", "IPUPlugin", "IPUPrecisionPlugin", - "HPUPlugin", "HPUPrecisionPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", diff --git a/pytorch_lightning/plugins/training_type/__init__.py 
b/pytorch_lightning/plugins/training_type/__init__.py index 0f8059f5f04ad..f7bee339ef95a 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -10,7 +10,6 @@ from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.single_hpu import HPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 diff --git a/pytorch_lightning/plugins/training_type/single_hpu.py b/pytorch_lightning/plugins/training_type/single_hpu.py deleted file mode 100644 index 1dae809141a26..0000000000000 --- a/pytorch_lightning/plugins/training_type/single_hpu.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import HPUStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class HPUPlugin(HPUStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.hpu.HPUPlugin` is deprecated in v1.6 and will be removed in." - " v1.8. Use `pl.strategies.hpu.HPUStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e1d28fd43752f..f6f3d962b7604 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -142,7 +142,6 @@ def __init__( devices: Optional[Union[List[int], str, int]] = None, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, - hpus: Optional[int] = None, tpu_cores: Optional[Union[List[int], str, int]] = None, ipus: Optional[int] = None, log_gpu_memory: Optional[str] = None, # TODO: Remove in 1.7 @@ -423,9 +422,6 @@ def __init__( ipus: How many IPUs to train on. Default: ``None``. - hpus: How many HPUs to train on. - Default: ``None``. - track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. If using Automatic Mixed Precision (AMP), the gradients will be unscaled before logging them. Default: ``-1``. 
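
With the dedicated `hpus` argument removed from the `Trainer` signature, HPU runs are requested through the generic `accelerator`/`devices` flags, which matches how `_log_device_info` in the next hunk now reads the device count from `devices`. A minimal sketch of the resulting call, assuming a Habana-enabled environment:

    import pytorch_lightning as pl
    from pytorch_lightning.accelerators import HPUAccelerator

    # `devices` replaces the removed `hpus=` argument; "auto" lets the accelerator report
    # its own device count (auto_device_count(), a fixed 8 per node at this point in the series).
    trainer = pl.Trainer(accelerator="hpu", devices="auto", max_epochs=1)

    # or request the full node explicitly
    trainer = pl.Trainer(accelerator="hpu", devices=HPUAccelerator.auto_device_count())
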
@@ -1794,7 +1790,7 @@ def _log_device_info(self) -> None: num_ipus = self.ipus if self.ipus is not None else 0 rank_zero_info(f"IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs") - num_hpus = self.hpus if self.hpus is not None else 0 + num_hpus = self.devices if self.devices is not None else 0 rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") if torch.cuda.is_available() and not isinstance(self.accelerator, GPUAccelerator): diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 1bc0c8d4de0e8..189dfd77e5dee 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -124,7 +124,16 @@ def test_no_warning_plugin(tmpdir): @RunIf(hpu=True) def test_all_stages(tmpdir): model = HPUModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1) + parallel_devices = 1 + hpustrat_1 = HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) + ) + hpustrat_8 = HPUParallelStrategy( + parallel_devices=[torch.device("hpu")] * parallel_devices, + precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None), + ) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=parallel_devices, + strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1) trainer.fit(model) trainer.validate(model) trainer.test(model) From 14342990eea4346427bc47bb33f113322fd614fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Mar 2022 05:38:43 +0000 Subject: [PATCH 026/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hpu_examples/simple_mnist/mnist.py | 43 ++++++++++--------- tests/accelerators/test_hpu.py | 9 +++- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 61bdb99f30b66..7759a2a18a336 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -12,44 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
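
The reworked example below drives Habana mixed precision (hmp) entirely from command-line flags. Condensed into one place, the wiring from those flags to the precision plugin and the strategy looks roughly like the following sketch; the ops-list paths are placeholders and a single device is an assumed default:

    import torch
    import pytorch_lightning as pl
    from pytorch_lightning.plugins import HPUPrecisionPlugin
    from pytorch_lightning.strategies.hpu import HPUStrategy

    # values normally supplied via --hmp-opt-level, --hmp-verbose, --hmp-bf16, --hmp-fp32
    hmp_params = {
        "level": "O1",
        "verbose": False,
        "bf16_ops": "ops_bf16_mnist.txt",  # placeholder path to the bf16 ops list
        "fp32_ops": "ops_fp32_mnist.txt",  # placeholder path to the fp32 ops list
    }

    precision = HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)
    trainer = pl.Trainer(
        accelerator="hpu",
        devices=1,  # corresponds to --hpus
        strategy=HPUStrategy(device=torch.device("hpu"), precision_plugin=precision),
    )
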
+import argparse import os + +import habana_frameworks.torch.core as htcore import torch from torch.nn import functional as F import pytorch_lightning as pl - from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule - -import habana_frameworks.torch.core as htcore from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy -import argparse + def parse_args(): import argparse - parser = argparse.ArgumentParser(description='PyTorch Classification Training') - - parser.add_argument('-b', '--batch-size', default=32, type=int) - parser.add_argument('--epochs', default=1, type=int, metavar='N', - help='number of total epochs to run') - parser.add_argument('--hpus', default=1, type=int, metavar='N', - help='number of habana accelerator for training (default: 1)') - parser.add_argument('--hmp', dest='is_hmp', action='store_true', help='enable habana mixed precision mode') - parser.add_argument('--hmp-bf16', default='', help='path to bf16 ops list in hmp O1 mode') - parser.add_argument('--hmp-fp32', default='', help='path to fp32 ops list in hmp O1 mode') - parser.add_argument('--hmp-opt-level', default='O1', help='choose optimization level for hmp') - parser.add_argument('--hmp-verbose', action='store_true', help='enable verbose mode for hmp') + + parser = argparse.ArgumentParser(description="PyTorch Classification Training") + + parser.add_argument("-b", "--batch-size", default=32, type=int) + parser.add_argument("--epochs", default=1, type=int, metavar="N", help="number of total epochs to run") + parser.add_argument( + "--hpus", default=1, type=int, metavar="N", help="number of habana accelerator for training (default: 1)" + ) + parser.add_argument("--hmp", dest="is_hmp", action="store_true", help="enable habana mixed precision mode") + parser.add_argument("--hmp-bf16", default="", help="path to bf16 ops list in hmp O1 mode") + parser.add_argument("--hmp-fp32", default="", help="path to fp32 ops list in hmp O1 mode") + parser.add_argument("--hmp-opt-level", default="O1", help="choose optimization level for hmp") + parser.add_argument("--hmp-verbose", action="store_true", help="enable verbose mode for hmp") args = parser.parse_args() return args -class LitClassifier(pl.LightningModule): +class LitClassifier(pl.LightningModule): def __init__(self): - super(LitClassifier, self).__init__() + super().__init__() self.l1 = torch.nn.Linear(28 * 28, 10) @@ -86,6 +87,7 @@ def test_epoch_end(self, outputs) -> None: def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.02) + if __name__ == "__main__": args = parse_args() @@ -101,8 +103,8 @@ def configure_optimizers(self): hmp_params = dict.fromkeys(hmp_keys) hmp_params["level"] = args.hmp_opt_level hmp_params["verbose"] = args.hmp_verbose - hmp_params["bf16_ops"] = args.hmp_bf16 #"./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" - hmp_params["fp32_ops"] = args.hmp_fp32 #"./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" + hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" + hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") @@ -125,7 +127,6 @@ def configure_optimizers(self): accelerator="hpu", ) - # Train the model ⚡ trainer.fit(model, 
datamodule=dm) trainer.test(model, datamodule=dm) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 189dfd77e5dee..06e1cd6c2d8e5 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -132,8 +132,13 @@ def test_all_stages(tmpdir): parallel_devices=[torch.device("hpu")] * parallel_devices, precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None), ) - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=parallel_devices, - strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1) + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="hpu", + devices=parallel_devices, + strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, + ) trainer.fit(model) trainer.validate(model) trainer.test(model) From 400ea7774af3a318c8059fe9dae10f36a3683fa1 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 4 Mar 2022 07:12:12 +0200 Subject: [PATCH 027/167] Address review comments - update logs - Update tests with hpu check Signed-off-by: Jerome --- pytorch_lightning/accelerators/hpu.py | 2 +- pytorch_lightning/trainer/trainer.py | 3 ++- pytorch_lightning/utilities/distributed.py | 3 +++ tests/accelerators/test_hpu.py | 6 +++--- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index c1cc43c1e4290..38382aeab7fc5 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -45,7 +45,7 @@ def get_parallel_devices(devices: int) -> List[int]: @staticmethod def auto_device_count() -> int: """Get the devices when set to auto.""" - # TBD: make this configurable + # TODO: Update this when api is exposed by the Habana team return 8 @staticmethod diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f6f3d962b7604..b262b79ff0162 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1814,7 +1814,8 @@ def _log_device_info(self) -> None: if _HPU_AVAILABLE and not isinstance(self.accelerator, HPUAccelerator): rank_zero_warn( - "HPU available but not used. Set the `devices` flag in your trainer" " `Trainer(devices=8)`." + "HPU available but not used. Set `accelerator` and `devices` using" + f" `Trainer(accelerator='hpu', devices={HPUAccelerator.auto_device_count()})`." 
) """ diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 21f5d2c8a4775..902bd6969c113 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -22,6 +22,8 @@ from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel +from pytorch_lightning.utilities import rank_zero_warn + import pytorch_lightning as pl from pytorch_lightning.utilities.imports import ( _HPU_AVAILABLE, @@ -138,6 +140,7 @@ def sync_ddp( is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) if is_hpu_backend: if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): + rank_zero_warn("Long tensor unsupported, downcasting to float") result = result.float() # sync all processes before reduction diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 06e1cd6c2d8e5..063cd6f8ef649 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -94,6 +94,7 @@ def test_epoch_end(self, outputs) -> None: self.log("test_acc", torch.stack(outputs).mean()) +@RunIf(hpu=True) def test_availability(): assert HPUAccelerator.is_available() @@ -109,7 +110,6 @@ def test_fail_if_no_hpus(tmpdir): @RunIf(hpu=True) def test_accelerator_selected(tmpdir): - assert HPUAccelerator.is_available() trainer = Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1) assert isinstance(trainer.accelerator, HPUAccelerator) @@ -122,9 +122,9 @@ def test_no_warning_plugin(tmpdir): @RunIf(hpu=True) -def test_all_stages(tmpdir): +def test_all_stages(tmpdir, hpus): model = HPUModel() - parallel_devices = 1 + parallel_devices = hpus hpustrat_1 = HPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) ) From 4146bab88837dc398e834c2dcd130e76d863894e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Mar 2022 05:15:13 +0000 Subject: [PATCH 028/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/utilities/distributed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 902bd6969c113..382caa7420633 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -22,9 +22,8 @@ from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel -from pytorch_lightning.utilities import rank_zero_warn - import pytorch_lightning as pl +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.imports import ( _HPU_AVAILABLE, _TORCH_GREATER_EQUAL_1_8, From f5cb696f585fa7e2aa096a117f8cd3570beaf7ae Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 4 Mar 2022 08:26:30 +0200 Subject: [PATCH 029/167] remove deprecated logging Signed-off-by: Jerome --- pytorch_lightning/utilities/distributed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 382caa7420633..ec9a9955752ca 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -23,7 +23,6 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning.utilities import 
rank_zero_warn from pytorch_lightning.utilities.imports import ( _HPU_AVAILABLE, _TORCH_GREATER_EQUAL_1_8, @@ -139,7 +138,7 @@ def sync_ddp( is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) if is_hpu_backend: if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): - rank_zero_warn("Long tensor unsupported, downcasting to float") + new_rank_zero_info("Long tensor unsupported, casting to float") result = result.float() # sync all processes before reduction From 448ed771b9c1b2f5e966bf5b50ce6f5ce76adaff Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 9 Mar 2022 12:28:38 +0400 Subject: [PATCH 030/167] Fix imports for failing CI --- .../plugins/precision/hpu_precision.py | 13 ++++++++----- pytorch_lightning/utilities/imports.py | 10 +++++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index 91413aac1352d..ec814f9582437 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -21,22 +21,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List, Optional, Sequence, Tuple - -import torch.nn as nn -from habana_frameworks.torch.hpex import hmp -from torch.optim import Optimizer +from typing import Any, Optional, Sequence from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _HPU_AVAILABLE class HPUPrecisionPlugin(PrecisionPlugin): """Plugin that enables bfloats/floats on HPUs.""" def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: + if not _HPU_AVAILABLE: + raise MisconfigurationException("HPU precision plugin requires HPU support") super().__init__() self.precision = precision if hmp_params is not None: + + from habana_frameworks.torch.hpex import hmp + hmp_opt_level = hmp_params["level"] hmp_bf16 = hmp_params["bf16_ops"] hmp_fp32 = hmp_params["fp32_ops"] diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index b8601c8958f82..f9fded925dff6 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -14,7 +14,6 @@ """General utilities.""" import importlib import operator -import os import platform import sys from importlib.util import find_spec @@ -112,6 +111,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _NEPTUNE_GREATER_EQUAL_0_9 = _NEPTUNE_AVAILABLE and _compare_version("neptune", operator.ge, "0.9.0") _OMEGACONF_AVAILABLE = _package_available("omegaconf") _POPTORCH_AVAILABLE = _package_available("poptorch") +_HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") _RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) _TORCHTEXT_AVAILABLE = _package_available("torchtext") @@ -134,9 +134,13 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: else: _IPU_AVAILABLE = False -from habana_frameworks.torch.utils.library_loader import is_habana_avaialble, load_habana_module +if 
_HABANA_FRAMEWORK_AVAILABLE: + from habana_frameworks.torch.utils.library_loader import is_habana_available + + _HPU_AVAILABLE = is_habana_available() +else: + _HPU_AVAILABLE = False -_HPU_AVAILABLE = is_habana_avaialble() # experimental feature within PyTorch Lightning. def _fault_tolerant_training() -> bool: From 10b190fb44dad68614f93ebf60fb420945d68c22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 7 Mar 2022 09:04:21 +0100 Subject: [PATCH 031/167] fix str to_device section in converting.rst (#12243) --- docs/source/starter/converting.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/starter/converting.rst b/docs/source/starter/converting.rst index b15a62138fb06..d57688b8afd44 100644 --- a/docs/source/starter/converting.rst +++ b/docs/source/starter/converting.rst @@ -185,24 +185,24 @@ Your :doc:`LightningModule <../common/lightning_module>` can automatically run o If you have any explicit calls to ``.cuda()`` or ``.to(device)``, you can remove them since Lightning makes sure that the data coming from :class:`~torch.utils.data.DataLoader` and all the :class:`~torch.nn.Module` instances initialized inside ``LightningModule.__init__`` are moved to the respective devices automatically. +If you still need to access the current device, you can use ``self.device`` anywhere in your ``LightningModule`` except in the ``__init__`` and ``setup`` methods. .. testcode:: class LitModel(LightningModule): - def __init__(self): - super().__init__() - self.register_buffer("running_mean", torch.zeros(num_features)) + def training_step(self, batch, batch_idx): + z = torch.randn(4, 5, device=self.device) + ... -If you still need to access the current device, you can use ``self.device`` anywhere in ``LightningModule`` except ``__init__`` and ``setup`` methods. -You are initializing a :class:`~torch.Tensor` within ``LightningModule.__init__`` method and want it to be moved to the device automatically you must +Hint: If you are initializing a :class:`~torch.Tensor` within the ``LightningModule.__init__`` method and want it to be moved to the device automatically you should call :meth:`~torch.nn.Module.register_buffer` to register it as a parameter. .. testcode:: class LitModel(LightningModule): - def training_step(self, batch, batch_idx): - z = torch.randn(4, 5, device=self.device) - ... + def __init__(self): + super().__init__() + self.register_buffer("running_mean", torch.zeros(num_features)) -------- From c17c62b30f4311434cc29d5e101862fcb9485af5 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 7 Mar 2022 14:15:07 +0530 Subject: [PATCH 032/167] Disable tuner with distributed strategies (#12179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 4 +++ pytorch_lightning/tuner/tuning.py | 8 ++++++ .../properties}/test_auto_gpu_select.py | 0 tests/tuner/test_tuning.py | 27 +++++++++++++++++++ 4 files changed, 39 insertions(+) rename tests/{tuner => trainer/properties}/test_auto_gpu_select.py (100%) create mode 100644 tests/tuner/test_tuning.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bfe08edb1f16..524540530b247 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -333,8 +333,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed `is_global_zero` check in `training_epoch_loop` before `logger.save`. 
If you have a custom logger that implements `save` the Trainer will now call `save` on all ranks by default. To change this behavior add `@rank_zero_only` to your `save` implementation ([#12134](https://github.com/PyTorchLightning/pytorch-lightning/pull/12134)) +- Disabled tuner with distributed strategies ([#12179](https://github.com/PyTorchLightning/pytorch-lightning/pull/12179)) + + - Marked `trainer.logger_connector` as protected ([#12195](https://github.com/PyTorchLightning/pytorch-lightning/pull/12195)) + ### Deprecated - Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/PyTorchLightning/pytorch-lightning/pull/11141)) diff --git a/pytorch_lightning/tuner/tuning.py b/pytorch_lightning/tuner/tuning.py index f64183a92bc1c..b1a38bd27688c 100644 --- a/pytorch_lightning/tuner/tuning.py +++ b/pytorch_lightning/tuner/tuning.py @@ -17,6 +17,7 @@ from pytorch_lightning.trainer.states import TrainerStatus from pytorch_lightning.tuner.batch_size_scaling import scale_batch_size from pytorch_lightning.tuner.lr_finder import _LRFinder, lr_find +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS @@ -43,6 +44,13 @@ def _tune( self.trainer.strategy.connect(model) + is_tuning = self.trainer.auto_scale_batch_size or self.trainer.auto_lr_find + if self.trainer._accelerator_connector.is_distributed and is_tuning: + raise MisconfigurationException( + "`trainer.tune()` is currently not supported with" + f" `Trainer(strategy={self.trainer.strategy.strategy_name!r})`." + ) + # Run auto batch size scaling if self.trainer.auto_scale_batch_size: if isinstance(self.trainer.auto_scale_batch_size, str): diff --git a/tests/tuner/test_auto_gpu_select.py b/tests/trainer/properties/test_auto_gpu_select.py similarity index 100% rename from tests/tuner/test_auto_gpu_select.py rename to tests/trainer/properties/test_auto_gpu_select.py diff --git a/tests/tuner/test_tuning.py b/tests/tuner/test_tuning.py new file mode 100644 index 0000000000000..aadacb6440fc2 --- /dev/null +++ b/tests/tuner/test_tuning.py @@ -0,0 +1,27 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
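
The guard added to `Tuner._tune` above makes the limitation explicit: batch-size scaling and the learning-rate finder currently assume a single process, so combining `trainer.tune()` with a multi-device strategy now fails fast instead of quietly misbehaving. From the user's side the rejected configuration looks like the new unit test that follows; a minimal sketch:

    import pytest
    from pytorch_lightning import Trainer
    from pytorch_lightning.utilities.exceptions import MisconfigurationException
    from tests.helpers.boring_model import BoringModel

    # any tuning flag plus a distributed strategy is rejected before tuning starts
    trainer = Trainer(auto_scale_batch_size=True, accelerator="cpu", devices=2, strategy="ddp")
    with pytest.raises(MisconfigurationException, match="not supported"):
        trainer.tune(BoringModel())

The tuning flags remain usable in single-device configurations, where the scaled batch size or suggested learning rate can then be reused for a subsequent distributed fit.
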
+import pytest + +from pytorch_lightning import Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers.boring_model import BoringModel + + +def test_tuner_with_distributed_strategies(): + """Test that an error is raised when tuner is used with multi-device strategy.""" + trainer = Trainer(auto_scale_batch_size=True, devices=2, strategy="ddp", accelerator="cpu") + model = BoringModel() + + with pytest.raises(MisconfigurationException, match=r"not supported with `Trainer\(strategy='ddp'\)`"): + trainer.tune(model) From 28bc4f0df0f967eed7063ab5156e0dc5e6e628af Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 7 Mar 2022 12:46:22 +0400 Subject: [PATCH 033/167] Add callout items to the Docs landing page (#12196) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholi --- docs/source/index.rst | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index e7bc8262c9e0c..b3e080fbcc2fd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,9 +3,34 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -PyTorch Lightning -================= +Welcome to PyTorch Lightning +============================ +.. raw:: html + +
+
+ +.. Add callout items below this line + +.. customcalloutitem:: + :description: Learn how to leverage the PyTorch Lightning features for your Machine Learning projects with ease in this quickstart guide. + :header: Introduction + :button_link: starter/introduction.html + :button_text: Get started with PyTorch Lightning + +.. customcalloutitem:: + :description: Guide to restructure your PyTorch code to PyTorch Lightning and help you focus more on research rather than the tricky engineering aspects. + :header: PyTorch to PyTorch Lightning + :button_link: starter/converting.html + :button_text: Organize PyTorch to PyTorch Lightning + +.. raw:: html + +
+
+ +.. End of callout item section .. tutoriallist:: From 97e1d2897b417ffc8520debf36f70ccb6fba26ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 7 Mar 2022 20:21:37 +0100 Subject: [PATCH 034/167] Integrate global step with progress tracking (#11805) --- CHANGELOG.md | 9 ++++ docs/source/common/lightning_module.rst | 13 ++--- docs/source/common/trainer.rst | 25 ++++++---- .../callbacks/device_stats_monitor.py | 4 +- .../callbacks/gpu_stats_monitor.py | 4 +- pytorch_lightning/callbacks/lr_monitor.py | 4 +- .../callbacks/model_checkpoint.py | 24 +-------- .../loops/dataloader/evaluation_loop.py | 4 +- .../loops/epoch/training_epoch_loop.py | 26 +++++----- pytorch_lightning/loops/fit_loop.py | 27 +++------- .../loops/optimization/optimizer_loop.py | 9 +++- .../connectors/checkpoint_connector.py | 16 +++--- .../logger_connector/logger_connector.py | 12 ++--- pytorch_lightning/trainer/trainer.py | 6 ++- pytorch_lightning/tuner/batch_size_scaling.py | 2 - pytorch_lightning/tuner/lr_finder.py | 2 - tests/callbacks/test_lr_monitor.py | 7 +-- tests/callbacks/test_rich_progress_bar.py | 14 +++--- tests/callbacks/test_tqdm_progress_bar.py | 14 +++--- .../test_checkpoint_callback_frequency.py | 4 +- tests/checkpointing/test_model_checkpoint.py | 49 +++++-------------- .../checkpointing/test_trainer_checkpoint.py | 16 ------ tests/loggers/test_comet.py | 2 +- tests/loggers/test_mlflow.py | 4 +- tests/loggers/test_wandb.py | 4 +- tests/loops/test_loops.py | 7 ++- tests/loops/test_training_loop.py | 2 +- tests/models/test_amp.py | 4 +- tests/models/test_restore.py | 13 +++-- tests/plugins/test_checkpoint_io_plugin.py | 4 +- tests/trainer/optimization/test_optimizers.py | 2 +- tests/trainer/test_trainer.py | 3 +- 32 files changed, 144 insertions(+), 192 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 524540530b247..fd8d25ccafe6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -303,6 +303,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The `trainer.current_epoch` value is now increased by 1 during and after `on_train_end` ([#8578](https://github.com/PyTorchLightning/pytorch-lightning/pull/8578)) +- The `trainer.global_step` value now accounts for multiple optimizers and TBPTT splits ([#11805](https://github.com/PyTorchLightning/pytorch-lightning/pull/11805)) + + +- The `trainer.global_step` value is now increased right after the `optimizer.step()` call which will impact users who access it during an intra-training validation hook ([#11805](https://github.com/PyTorchLightning/pytorch-lightning/pull/11805)) + + +- The filename of checkpoints created with `ModelCheckpoint(filename='{step}')` is different compared to previous versions. A checkpoint saved after 1 step will be named `step=1.ckpt` instead of `step=0.ckpt` ([#11805](https://github.com/PyTorchLightning/pytorch-lightning/pull/11805)) + + - Inherit from `ABC` for `Accelerator`: Users need to implement `auto_device_count` ([#11521](https://github.com/PyTorchLightning/pytorch-lightning/pull/11521)) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index dc10f235ceb39..2156c1567ac33 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -916,7 +916,7 @@ These are properties available in a LightningModule. current_epoch ~~~~~~~~~~~~~ -The current epoch +The number of epochs run. .. 
code-block:: python @@ -946,12 +946,13 @@ usually do not need to use this property, but it is useful to know how to access def training_step(self, batch, batch_idx): if self.global_rank == 0: # do something only once across all the nodes - self.log("global_step", self.trainer.global_step) + ... global_step ~~~~~~~~~~~ -The current step (does not reset each epoch) +The number of optimizer steps taken (does not reset each epoch). +This includes multiple optimizers and TBPTT steps (if enabled). .. code-block:: python @@ -1003,16 +1004,16 @@ The list of loggers currently being used by the Trainer. local_rank ~~~~~~~~~~~ -The ``global_rank`` is the index of the current process across all the devices for the current node. +The ``local_rank`` is the index of the current process across all the devices for the current node. You usually do not need to use this property, but it is useful to know how to access it if needed. For example, if using 10 machines (or nodes), the GPU at index 0 on each machine has local_rank = 0. .. code-block:: python def training_step(self, batch, batch_idx): - if self.global_rank == 0: + if self.local_rank == 0: # do something only once across each node - self.log("global_step", self.trainer.global_step) + ... precision ~~~~~~~~~ diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index b7c65929cb7b7..1f39355afd242 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -934,7 +934,7 @@ max_steps | -Stop training after this number of steps +Stop training after this number of :ref:`global steps `. Training will stop if max_steps or max_epochs have reached (earliest). .. testcode:: @@ -959,7 +959,7 @@ min_steps | -Force training for at least these number of steps. +Force training for at least this number of :ref:`global steps `. Trainer will train model for at least min_steps or min_epochs (latest). .. testcode:: @@ -1732,16 +1732,23 @@ The metrics available to callbacks. These are automatically set when you log via current_epoch ************* -The current epoch +The number of epochs run. .. code-block:: python - def training_step(self, batch, batch_idx): - current_epoch = self.trainer.current_epoch - if current_epoch > 100: - # do something - pass + if trainer.current_epoch >= 10: + ... + +global_step +*********** + +The number of optimizer steps taken (does not reset each epoch). +This includes multiple optimizers and TBPTT steps (if enabled). +.. code-block:: python + + if trainer.global_step >= 100: + ... logger ******* @@ -1822,4 +1829,4 @@ The metrics sent to the progress bar. estimated_stepping_batches ************************** -Check out :paramref:`~pytorch_lightning.trainer.trainer.Trainer.estimated_stepping_batches`. +Check out :meth:`~pytorch_lightning.trainer.trainer.Trainer.estimated_stepping_batches`. 
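The ``global_step`` semantics documented above can be sanity-checked with a minimal two-optimizer sketch. This is an illustrative example rather than part of the patch: the ``TwoOptimizerModel`` name, its layers, and the random dataset are made up, and it assumes the automatic-optimization ``training_step(batch, batch_idx, optimizer_idx)`` signature used by this release. With two optimizers, each training batch produces two optimizer steps, so ``trainer.global_step`` advances by two per batch.

.. code-block:: python

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from pytorch_lightning import LightningModule, Trainer


    class TwoOptimizerModel(LightningModule):
        # hypothetical model used only to illustrate the step counting
        def __init__(self):
            super().__init__()
            self.layer_a = torch.nn.Linear(4, 1)
            self.layer_b = torch.nn.Linear(4, 1)

        def training_step(self, batch, batch_idx, optimizer_idx):
            (x,) = batch
            layer = self.layer_a if optimizer_idx == 0 else self.layer_b
            return layer(x).sum()

        def configure_optimizers(self):
            # two optimizers -> two optimizer steps per training batch
            return [
                torch.optim.SGD(self.layer_a.parameters(), lr=0.1),
                torch.optim.SGD(self.layer_b.parameters(), lr=0.1),
            ]


    if __name__ == "__main__":
        # 8 samples with batch_size=4 -> 2 training batches
        loader = DataLoader(TensorDataset(torch.randn(8, 4)), batch_size=4)
        trainer = Trainer(max_epochs=1, logger=False, enable_checkpointing=False)
        trainer.fit(TwoOptimizerModel(), loader)
        # 2 batches x 2 optimizers = 4 optimizer steps under the revised counting
        print(trainer.global_step)

Before this change, the same run would have reported two steps (one increment per batch) rather than four.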
diff --git a/pytorch_lightning/callbacks/device_stats_monitor.py b/pytorch_lightning/callbacks/device_stats_monitor.py index 93d440d016086..0929358cf0f74 100644 --- a/pytorch_lightning/callbacks/device_stats_monitor.py +++ b/pytorch_lightning/callbacks/device_stats_monitor.py @@ -66,7 +66,7 @@ def on_train_batch_start( for logger in trainer.loggers: separator = logger.group_separator prefixed_device_stats = _prefix_metric_keys(device_stats, "on_train_batch_start", separator) - logger.log_metrics(prefixed_device_stats, step=trainer.global_step) + logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def on_train_batch_end( self, @@ -88,7 +88,7 @@ def on_train_batch_end( for logger in trainer.loggers: separator = logger.group_separator prefixed_device_stats = _prefix_metric_keys(device_stats, "on_train_batch_end", separator) - logger.log_metrics(prefixed_device_stats, step=trainer.global_step) + logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def _prefix_metric_keys(metrics_dict: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index 2e9e817bf9be4..8fb92006708f7 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -162,7 +162,7 @@ def on_train_batch_start( logs["batch_time/inter_step (ms)"] = (time.time() - self._snap_inter_step_time) * 1000 for logger in trainer.loggers: - logger.log_metrics(logs, step=trainer.global_step) + logger.log_metrics(logs, step=trainer.fit_loop.epoch_loop._batches_that_stepped) @rank_zero_only def on_train_batch_end( @@ -187,7 +187,7 @@ def on_train_batch_end( logs["batch_time/intra_step (ms)"] = (time.time() - self._snap_intra_step_time) * 1000 for logger in trainer.loggers: - logger.log_metrics(logs, step=trainer.global_step) + logger.log_metrics(logs, step=trainer.fit_loop.epoch_loop._batches_that_stepped) @staticmethod def _get_gpu_ids(device_ids: List[int]) -> List[str]: diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 4f226f7fdec51..b149858575118 100644 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -158,7 +158,7 @@ def on_train_batch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) if latest_stat: for logger in trainer.loggers: - logger.log_metrics(latest_stat, step=trainer.global_step) + logger.log_metrics(latest_stat, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def on_train_epoch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: if self.logging_interval != "step": @@ -167,7 +167,7 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) if latest_stat: for logger in trainer.loggers: - logger.log_metrics(latest_stat, step=trainer.global_step) + logger.log_metrics(latest_stat, step=trainer.fit_loop.epoch_loop._batches_that_stepped) def _extract_stats(self, trainer: "pl.Trainer", interval: str) -> Dict[str, float]: latest_stat = {} diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 704d0a7a52253..d9b5f13e6fa8a 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -227,7 +227,7 @@ def __init__( self.save_weights_only = save_weights_only self.auto_insert_metric_name = 
auto_insert_metric_name self._save_on_train_epoch_end = save_on_train_epoch_end - self._last_global_step_saved = -1 + self._last_global_step_saved = 0 # no need to save when no steps were taken self._last_time_checked: Optional[float] = None self.current_score = None self.best_k_models = {} @@ -278,8 +278,7 @@ def on_train_batch_end( """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`""" if self._should_skip_saving_checkpoint(trainer): return - step = trainer.global_step - skip_batch = self._every_n_train_steps < 1 or ((step + 1) % self._every_n_train_steps != 0) + skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0) train_time_interval = self._train_time_interval skip_time = True @@ -300,8 +299,6 @@ def on_train_batch_end( def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: """Save a checkpoint at the end of the training epoch.""" - # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates - trainer.fit_loop.global_step -= 1 if ( not self._should_skip_saving_checkpoint(trainer) and self._save_on_train_epoch_end @@ -309,7 +306,6 @@ def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModu and (trainer.current_epoch + 1) % self._every_n_epochs == 0 ): self.save_checkpoint(trainer) - trainer.fit_loop.global_step += 1 def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: """Save a checkpoint at the end of the validation stage.""" @@ -322,22 +318,6 @@ def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModul return self.save_checkpoint(trainer) - def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - """Save a checkpoint when training stops. - - This will only save a checkpoint if `save_last` is also enabled as the monitor metrics logged during - training/validation steps or end of epochs are not guaranteed to be available at this stage. 
- """ - if self._should_skip_saving_checkpoint(trainer) or not self.save_last: - return - if self.verbose: - rank_zero_info("Saving latest checkpoint...") - # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates - monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step - 1) - trainer.fit_loop.global_step -= 1 - self._save_last_checkpoint(trainer, monitor_candidates) - trainer.fit_loop.global_step += 1 - def on_save_checkpoint( self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 833a43e4cf019..859ded3a98e72 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -66,8 +66,6 @@ def num_dataloaders(self) -> int: # case where user does: # return dl1, dl2 dataloaders = self.dataloaders - if dataloaders is None: - return 0 length = len(dataloaders) if length > 0 and isinstance(dataloaders[0], (list, tuple)): length = len(dataloaders[0]) @@ -78,7 +76,7 @@ def dataloaders(self) -> Sequence[DataLoader]: """Returns the validation or test dataloaders.""" dataloaders = self.trainer.test_dataloaders if self.trainer.testing else self.trainer.val_dataloaders if dataloaders is None: - raise RuntimeError("Dataloaders should be available.") + return [] return dataloaders @property diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index e28a864b46f16..67400ce0472de 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -60,7 +60,6 @@ def __init__(self, min_steps: Optional[int] = None, max_steps: int = -1) -> None self.min_steps = min_steps self.max_steps = max_steps - self.global_step: int = 0 self.batch_progress = BatchProgress() self.scheduler_progress = SchedulerProgress() @@ -72,6 +71,7 @@ def __init__(self, min_steps: Optional[int] = None, max_steps: int = -1) -> None self._warning_cache = WarningCache() # caches the loaded dataloader state until dataloader objects are available self._dataloader_state_dict: Dict[str, Any] = {} + self._batches_that_stepped: int = 0 @property def total_batch_idx(self) -> int: @@ -87,6 +87,13 @@ def batch_idx(self) -> int: # but before the next `ready` increase return self.batch_progress.current.ready - 1 + @property + def global_step(self) -> int: + lightning_module = self.trainer.lightning_module + if lightning_module is None or lightning_module.automatic_optimization: + return self.batch_loop.optimizer_loop.optim_progress.optimizer_steps + return self.batch_loop.manual_loop.optim_step_progress.total.completed + @property def _is_training_done(self) -> bool: max_steps_reached = _is_max_limit_reached(self.global_step, self.max_steps) @@ -247,17 +254,14 @@ def on_advance_end(self) -> None: self._run_validation() self.trainer.training = True - # ----------------------------------------- - # SAVE LOGGERS (ie: Tensorboard, etc...) 
- # ----------------------------------------- - self._save_loggers_on_train_batch_end() - # update plateau LR scheduler after metrics are logged self.update_lr_schedulers("step", update_plateau_schedulers=True) if not self._should_accumulate(): - # progress global step according to grads progress - self.global_step += 1 + # this is increased once per batch disregarding multiple optimizers or tbptt on purpose for loggers + self._batches_that_stepped += 1 + # this will save based on the `batches_that_stepped` value + self._save_loggers_on_train_batch_end() # if training finished, defer exit to the parent. this assumes there will be enough time in between # which might not be the case depending on what's in the `*_epoch_end` hooks @@ -503,9 +507,9 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: def _save_loggers_on_train_batch_end(self) -> None: """Flushes loggers to disk.""" - # when loggers should save to disk - should_flush_logs = self.trainer._logger_connector.should_flush_logs - if should_flush_logs: + # this assumes that `batches_that_stepped` was increased before + should_flush = self._batches_that_stepped % self.trainer.flush_logs_every_n_steps == 0 + if should_flush or self.trainer.should_stop: for logger in self.trainer.loggers: logger.save() diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 361e104fa878b..7087bcbad0442 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -68,19 +68,6 @@ def __init__( self._outputs: _EPOCH_OUTPUTS_TYPE = [] self._data_fetcher: Optional[AbstractDataFetcher] = None - @property - def global_step(self) -> int: - """Returns the global step.""" - lightning_module = self.trainer.lightning_module - if lightning_module is None or lightning_module.automatic_optimization: - return self.epoch_loop.global_step - return self.epoch_loop.batch_loop.manual_loop.optim_step_progress.total.completed - - @global_step.setter - def global_step(self, value: int) -> None: - """Sets the global step (forwards to epoch_loop)""" - self.epoch_loop.global_step = value - @property def total_batch_idx(self) -> int: """Returns the current batch index (across epochs)""" @@ -177,7 +164,7 @@ def _results(self) -> _ResultCollection: def done(self) -> bool: """Evaluates when to leave the loop.""" # TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop - stop_steps = _is_max_limit_reached(self.global_step, self.max_steps) + stop_steps = _is_max_limit_reached(self.epoch_loop.global_step, self.max_steps) # `processed` is increased before `on_train_epoch_end`, the hook where checkpoints are typically saved. 
# we use it here because the checkpoint data won't have `completed` increased yet stop_epochs = _is_max_limit_reached(self.epoch_progress.current.processed, self.max_epochs) @@ -186,7 +173,7 @@ def done(self) -> bool: if self.trainer.should_stop: # early stopping met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True - met_min_steps = self.global_step >= self.min_steps if self.min_steps else True + met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True if met_min_epochs and met_min_steps: should_stop = True else: @@ -319,14 +306,12 @@ def on_advance_end(self) -> None: self.epoch_progress.increment_completed() - # the global step is manually decreased here due to backwards compatibility with existing loggers - # as they expect that the same step is used when logging epoch end metrics even when the batch loop has - # finished. this means the attribute does not exactly track the number of optimizer steps applied. - # TODO(@carmocca): deprecate and rename so users don't get confused - self.global_step -= 1 + # we manually decrease here because loggers expect that the same step is used when logging epoch-end metrics + # even when the batch loop has finished + self.epoch_loop._batches_that_stepped -= 1 # log epoch metrics self.trainer._logger_connector.update_train_epoch_metrics() - self.global_step += 1 + self.epoch_loop._batches_that_stepped += 1 # if fault tolerant is enabled and process has been notified, exit. self.trainer._exit_gracefully_on_signal() diff --git a/pytorch_lightning/loops/optimization/optimizer_loop.py b/pytorch_lightning/loops/optimization/optimizer_loop.py index f8d692d688035..bab025466789a 100644 --- a/pytorch_lightning/loops/optimization/optimizer_loop.py +++ b/pytorch_lightning/loops/optimization/optimizer_loop.py @@ -359,7 +359,11 @@ def _optimizer_step( else: optimizer = self.trainer.strategy._lightning_optimizers[opt_idx] - self.optim_progress.optimizer.step.increment_ready() + # if `strategy.handles_gradient_accumulation`, this method will be called to route into the strategy, but we + # need to check again if `should_accumulate` before increasing the counters + should_accumulate = self.trainer.fit_loop._should_accumulate() + if not should_accumulate: + self.optim_progress.optimizer.step.increment_ready() # model hook self.trainer._call_lightning_module_hook( @@ -374,7 +378,8 @@ def _optimizer_step( using_lbfgs=is_lbfgs, ) - self.optim_progress.optimizer.step.increment_completed() + if not should_accumulate: + self.optim_progress.optimizer.step.increment_completed() def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: """Calls the ``on_before_zero_grad`` hook. diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 9a15db2e8e561..be2c2d3dfa8eb 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -232,16 +232,20 @@ def restore_loops(self) -> None: if not self._loaded_checkpoint: return - self.trainer.fit_loop.global_step = self._loaded_checkpoint["global_step"] - # set the `current_epoch` value for old checkpoints without the progress tracking state. + fit_loop = self.trainer.fit_loop + # set the `global_step` value for checkpoints before v1.6 without the progress tracking state. 
# it will be overwritten by the loop's state if it was also saved - self.trainer.fit_loop.epoch_progress.current.completed = self._loaded_checkpoint["epoch"] + optimizer_loop = fit_loop.epoch_loop.batch_loop.optimizer_loop + optimizer_loop.optim_progress.optimizer.step.total.completed = self._loaded_checkpoint["global_step"] + # set the `current_epoch` value for checkpoints before v1.6 without the progress tracking state. + # it will be overwritten by the loop's state if it was also saved + fit_loop.epoch_progress.current.completed = self._loaded_checkpoint["epoch"] assert self.trainer.state.fn is not None state_dict = self._loaded_checkpoint.get("loops") if state_dict is not None: if self.trainer.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING): - self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"]) + fit_loop.load_state_dict(state_dict["fit_loop"]) elif self.trainer.state.fn == TrainerFn.VALIDATING: self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"]) elif self.trainer.state.fn == TrainerFn.TESTING: @@ -330,9 +334,9 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: model = self.trainer.lightning_module checkpoint = { - # the epoch is saved for compatibility but it's not relevant for restoration + # the epoch and global step are saved for compatibility but they are not relevant for restoration "epoch": self.trainer.current_epoch, - "global_step": self.trainer.global_step + model.automatic_optimization, + "global_step": self.trainer.global_step, "pytorch-lightning_version": pl.__version__, "state_dict": self._get_lightning_module_state_dict(), "loops": self._get_loops_state_dict(), diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 428713ff3347e..0e3a69bfc9d98 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -77,15 +77,11 @@ def on_trainer_init( ) break - @property - def should_flush_logs(self) -> bool: - should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0 - return should_flush or self.trainer.should_stop - @property def should_update_logs(self) -> bool: - should_log_every_n_steps = (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0 - return should_log_every_n_steps or self.trainer.should_stop + # `+ 1` because it can be checked before a step is executed, for example, in `on_train_batch_start` + should_log = (self.trainer.fit_loop.epoch_loop._batches_that_stepped + 1) % self.trainer.log_every_n_steps == 0 + return should_log or self.trainer.should_stop def configure_logger(self, logger: Union[bool, LightningLoggerBase, Iterable[LightningLoggerBase]]) -> None: if isinstance(logger, bool): @@ -123,7 +119,7 @@ def log_metrics(self, metrics: _OUT_DICT, step: Optional[int] = None) -> None: if step is None: # added metrics for convenience scalar_metrics.setdefault("epoch", self.trainer.current_epoch) - step = self.trainer.global_step + step = self.trainer.fit_loop.epoch_loop._batches_that_stepped # log actual metrics for logger in self.trainer.loggers: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b3b81fe4554fd..5dbd9d3cadc3a 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -2486,7 +2486,11 @@ def sanity_checking(self, val: bool) -> None: @property def 
global_step(self) -> int: - return self.fit_loop.global_step + """The number of optimizer steps taken (does not reset each epoch). + + This includes multiple optimizers and TBPTT steps (if enabled). + """ + return self.fit_loop.epoch_loop.global_step @property def current_epoch(self) -> int: diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py index 3d5916e3f8bd9..6f4ac72bd7e8b 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -60,9 +60,7 @@ def scale_batch_size( # Save initial model, that is loaded after batch size is found ckpt_path = os.path.join(trainer.default_root_dir, f".scale_batch_size_{uuid.uuid4()}.ckpt") - trainer.fit_loop.global_step -= 1 trainer.save_checkpoint(ckpt_path) - trainer.fit_loop.global_step += 1 params = __scale_batch_dump_params(trainer) # Set to values that are required by the algorithm diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py index d929bbe2f87c7..36b09c130056c 100644 --- a/pytorch_lightning/tuner/lr_finder.py +++ b/pytorch_lightning/tuner/lr_finder.py @@ -204,9 +204,7 @@ def lr_find( # Save initial model, that is loaded after learning rate is found ckpt_path = os.path.join(trainer.default_root_dir, f".lr_find_{uuid.uuid4()}.ckpt") - trainer.fit_loop.global_step -= 1 trainer.save_checkpoint(ckpt_path) - trainer.fit_loop.global_step += 1 params = __lr_finder_dump_params(trainer) # Set to values that are required by the algorithm diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index 82a4a5b99894a..391e74bb10221 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -217,7 +217,6 @@ def configure_optimizers(self): optimizer2 = optim.Adam(self.parameters(), lr=1e-2) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) - return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] model = CustomBoringModel() @@ -241,7 +240,8 @@ def configure_optimizers(self): assert list(lr_monitor.lrs) == ["lr-Adam", "lr-Adam-1"], "Names of learning rates not set correctly" if logging_interval == "step": - expected_number_logged = trainer.global_step // log_every_n_steps + # divide by 2 because we have 2 optimizers + expected_number_logged = trainer.global_step // 2 // log_every_n_steps if logging_interval == "epoch": expected_number_logged = trainer.max_epochs @@ -284,7 +284,8 @@ def configure_optimizers(self): assert list(lr_monitor.lrs) == ["lr-Adam", "lr-Adam-1"], "Names of learning rates not set correctly" if logging_interval == "step": - expected_number_logged = trainer.global_step // log_every_n_steps + # divide by 2 because we have 2 optimizers + expected_number_logged = trainer.global_step // 2 // log_every_n_steps if logging_interval == "epoch": expected_number_logged = trainer.max_epochs diff --git a/tests/callbacks/test_rich_progress_bar.py b/tests/callbacks/test_rich_progress_bar.py index cfe32dc495f8d..29ef3aa98f89b 100644 --- a/tests/callbacks/test_rich_progress_bar.py +++ b/tests/callbacks/test_rich_progress_bar.py @@ -368,15 +368,15 @@ def test_step(self, batch, batch_idx): trainer.fit(model) assert pbar.calls["fit"] == [ ("sanity_check", 0, 0, {"b": 0}), - ("train", 0, 0, {}), ("train", 0, 1, {}), - ("validate", 0, 1, {"b": 1}), # validation end + ("train", 0, 2, {}), + ("validate", 0, 2, {"b": 2}), # validation end # epoch end over, 
`on_epoch=True` metrics are computed - ("train", 0, 2, {"a": 1, "b": 1}), # training epoch end - ("train", 1, 2, {"a": 1, "b": 1}), - ("train", 1, 3, {"a": 1, "b": 1}), - ("validate", 1, 3, {"a": 1, "b": 3}), # validation end - ("train", 1, 4, {"a": 3, "b": 3}), # training epoch end + ("train", 0, 2, {"a": 1, "b": 2}), # training epoch end + ("train", 1, 3, {"a": 1, "b": 2}), + ("train", 1, 4, {"a": 1, "b": 2}), + ("validate", 1, 4, {"a": 1, "b": 4}), # validation end + ("train", 1, 4, {"a": 3, "b": 4}), # training epoch end ] trainer.validate(model, verbose=False) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index 7897a1be798bb..3cfe54c992247 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -608,15 +608,15 @@ def test_step(self, batch, batch_idx): trainer.fit(model) assert pbar.calls["fit"] == [ ("sanity_check", 0, 0, {"b": 0}), - ("train", 0, 0, {}), ("train", 0, 1, {}), - ("validate", 0, 1, {"b": 1}), # validation end + ("train", 0, 2, {}), + ("validate", 0, 2, {"b": 2}), # validation end # epoch end over, `on_epoch=True` metrics are computed - ("train", 0, 2, {"a": 1, "b": 1}), # training epoch end - ("train", 1, 2, {"a": 1, "b": 1}), - ("train", 1, 3, {"a": 1, "b": 1}), - ("validate", 1, 3, {"a": 1, "b": 3}), # validation end - ("train", 1, 4, {"a": 3, "b": 3}), # training epoch end + ("train", 0, 2, {"a": 1, "b": 2}), # training epoch end + ("train", 1, 3, {"a": 1, "b": 2}), + ("train", 1, 4, {"a": 1, "b": 2}), + ("validate", 1, 4, {"a": 1, "b": 4}), # validation end + ("train", 1, 4, {"a": 3, "b": 4}), # training epoch end ] trainer.validate(model, verbose=False) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 90665a6db476e..eeec11c6ecd14 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -81,8 +81,8 @@ def training_step(self, batch, batch_idx): trainer.fit(model) if save_last: - # last epochs are saved every step (so double the save calls) and once `on_train_end` - expected = expected * 2 + 1 + # last epochs are saved every step (so double the save calls) + expected = expected * 2 assert save_mock.call_count == expected diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index b9e63d28f4234..3dadf0b733a74 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import math import os import pickle @@ -469,7 +468,7 @@ def test_model_checkpoint_file_extension(tmpdir): trainer = Trainer(default_root_dir=tmpdir, callbacks=[model_checkpoint], max_steps=1, logger=False) trainer.fit(model) - expected = ["epoch=0-step=0.tpkc", "last.tpkc"] + expected = ["epoch=0-step=1.tpkc", "last.tpkc"] assert set(expected) == set(os.listdir(tmpdir)) @@ -490,12 +489,12 @@ def test_model_checkpoint_save_last(tmpdir): ) trainer.fit(model) last_filename = model_checkpoint._format_checkpoint_name( - ModelCheckpoint.CHECKPOINT_NAME_LAST, {"epoch": trainer.current_epoch} + ModelCheckpoint.CHECKPOINT_NAME_LAST, {"epoch": trainer.current_epoch - 1} ) last_filename = last_filename + ".ckpt" assert str(tmpdir / last_filename) == model_checkpoint.last_model_path assert set(os.listdir(tmpdir)) == set( - [f"epoch={i}-step={j}.ckpt" for i, j in zip(range(epochs), [9, 19, 29])] + [last_filename] + [f"epoch={i}-step={j}.ckpt" for i, j in zip(range(epochs), [10, 20, 30])] + [last_filename] ) ModelCheckpoint.CHECKPOINT_NAME_LAST = "last" @@ -583,14 +582,14 @@ def test_model_checkpoint_save_last_none_monitor(tmpdir, caplog): # these should not be set if monitor is None assert checkpoint_callback.monitor is None - assert checkpoint_callback.best_model_path == tmpdir / "epoch=1-step=19.ckpt" + assert checkpoint_callback.best_model_path == tmpdir / "epoch=1-step=20.ckpt" assert checkpoint_callback.last_model_path == tmpdir / "last.ckpt" assert checkpoint_callback.best_model_score is None assert checkpoint_callback.best_k_models == {} assert checkpoint_callback.kth_best_model_path == "" # check that the correct ckpts were created - expected = [f"epoch={i}-step={j}.ckpt" for i, j in zip(range(epochs), [9, 19])] + expected = [f"epoch={i}-step={j}.ckpt" for i, j in zip(range(epochs), [10, 20])] expected.append("last.ckpt") assert set(os.listdir(tmpdir)) == set(expected) @@ -642,7 +641,7 @@ def test_ckpt_every_n_train_steps(tmpdir): trainer.fit(model) expected = [ - f"step={i}.ckpt" for i in range(every_n_train_steps - 1, max_epochs * epoch_length, every_n_train_steps) + f"step={i}.ckpt" for i in range(every_n_train_steps, max_epochs * epoch_length + 1, every_n_train_steps) ] assert set(os.listdir(tmpdir)) == set(expected) @@ -766,34 +765,14 @@ def test_default_checkpoint_behavior(tmpdir): save_weights_only = trainer.checkpoint_callback.save_weights_only save_mock.assert_has_calls( [ - call(save_dir / "epoch=0-step=4.ckpt", save_weights_only), - call(save_dir / "epoch=1-step=9.ckpt", save_weights_only), - call(save_dir / "epoch=2-step=14.ckpt", save_weights_only), + call(save_dir / "epoch=0-step=5.ckpt", save_weights_only), + call(save_dir / "epoch=1-step=10.ckpt", save_weights_only), + call(save_dir / "epoch=2-step=15.ckpt", save_weights_only), ] ) ckpts = os.listdir(save_dir) assert len(ckpts) == 1 - assert ckpts[0] == "epoch=2-step=14.ckpt" - - -@pytest.mark.parametrize("max_epochs", [1, 2]) -@pytest.mark.parametrize("should_validate", [True, False]) -@pytest.mark.parametrize("save_last", [True, False]) -@pytest.mark.parametrize("verbose", [True, False]) -def test_model_checkpoint_save_last_warning( - tmpdir, caplog, max_epochs: int, should_validate: bool, save_last: bool, verbose: bool -): - """Tests 'Saving latest checkpoint...' 
log.""" - model = LogInTwoMethods() - if not should_validate: - model.validation_step = None - ckpt = ModelCheckpoint(monitor="early_stop_on", dirpath=tmpdir, save_top_k=0, save_last=save_last, verbose=verbose) - trainer = Trainer( - default_root_dir=tmpdir, callbacks=[ckpt], max_epochs=max_epochs, limit_train_batches=1, limit_val_batches=1 - ) - with caplog.at_level(logging.INFO): - trainer.fit(model) - assert caplog.messages.count("Saving latest checkpoint...") == (verbose and save_last) + assert ckpts[0] == "epoch=2-step=15.ckpt" def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): @@ -821,9 +800,7 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): ckpt_last_epoch = torch.load(path_last_epoch) ckpt_last = torch.load(path_last) - # `-1` because this checkpoint is saved `on_train_epoch_end` which is considered part of the epoch so the - # `current_epoch` count has not been increased yet - assert ckpt_last_epoch["epoch"] == ckpt_last["epoch"] - 1 + assert ckpt_last_epoch["epoch"] == ckpt_last["epoch"] assert ckpt_last_epoch["global_step"] == ckpt_last["global_step"] ckpt_id = ( @@ -1041,7 +1018,7 @@ def test_val_check_interval_checkpoint_files(tmpdir): ) trainer.fit(model) files = {p.basename for p in tmpdir.listdir()} - assert files == {f"epoch=0-step={s}.ckpt" for s in [1, 3, 5, 7, 9]} + assert files == {f"epoch=0-step={s}.ckpt" for s in [2, 4, 6, 8, 10]} def test_current_score(tmpdir): @@ -1303,4 +1280,4 @@ def test_last_global_step_saved(): trainer = MagicMock() trainer.callback_metrics = {"foo": 123} model_checkpoint.save_checkpoint(trainer) - assert model_checkpoint._last_global_step_saved == -1 + assert model_checkpoint._last_global_step_saved == 0 diff --git a/tests/checkpointing/test_trainer_checkpoint.py b/tests/checkpointing/test_trainer_checkpoint.py index 24268e3cfca84..5d129179c7c5d 100644 --- a/tests/checkpointing/test_trainer_checkpoint.py +++ b/tests/checkpointing/test_trainer_checkpoint.py @@ -71,19 +71,3 @@ def validation_step(self, batch, batch_idx): assert best_model_path.endswith(f"epoch=0{idx}.ckpt") else: assert f"epoch={idx + 1}" in best_model_path - - -def test_accumulated_gradient_batches_with_ckpt_path(tmpdir): - """This test validates that accumulated gradient is properly recomputed and reset on the trainer.""" - - ckpt = ModelCheckpoint(dirpath=tmpdir, save_last=True) - model = BoringModel() - trainer_kwargs = dict( - max_epochs=1, accumulate_grad_batches={0: 2}, callbacks=ckpt, limit_train_batches=1, limit_val_batches=0 - ) - trainer = Trainer(**trainer_kwargs) - trainer.fit(model) - - trainer_kwargs["max_epochs"] = 2 - trainer = Trainer(**trainer_kwargs) - trainer.fit(model, ckpt_path=ckpt.last_model_path) diff --git a/tests/loggers/test_comet.py b/tests/loggers/test_comet.py index 37758e904256a..e09b954a61a6a 100644 --- a/tests/loggers/test_comet.py +++ b/tests/loggers/test_comet.py @@ -156,7 +156,7 @@ def test_comet_logger_dirs_creation(comet, comet_experiment, tmpdir, monkeypatch trainer.fit(model) assert trainer.checkpoint_callback.dirpath == (tmpdir / "test" / "1" / "checkpoints") - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=2.ckpt"} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=3.ckpt"} assert trainer.log_dir == logger.save_dir diff --git a/tests/loggers/test_mlflow.py b/tests/loggers/test_mlflow.py index 46c85f13e29e4..5ce5ceb75a0b1 100644 --- a/tests/loggers/test_mlflow.py +++ b/tests/loggers/test_mlflow.py @@ -136,7 +136,7 @@ def 
test_mlflow_log_dir(client, mlflow, tmpdir): assert trainer.log_dir == logger.save_dir trainer.fit(model) assert trainer.checkpoint_callback.dirpath == (tmpdir / "exp-id" / "run-id" / "checkpoints") - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=0.ckpt"} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=1.ckpt"} assert trainer.log_dir == logger.save_dir @@ -177,7 +177,7 @@ def training_epoch_end(self, *args, **kwargs): assert "epoch" in os.listdir(tmpdir / exp_id / run_id / "metrics") assert set(os.listdir(tmpdir / exp_id / run_id / "params")) == model.hparams.keys() assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / "checkpoints") - assert os.listdir(trainer.checkpoint_callback.dirpath) == [f"epoch=0-step={limit_batches - 1}.ckpt"] + assert os.listdir(trainer.checkpoint_callback.dirpath) == [f"epoch=0-step={limit_batches}.ckpt"] @mock.patch("pytorch_lightning.loggers.mlflow.mlflow") diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 280303a3f7318..adb91aab6da32 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -156,7 +156,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): trainer.fit(model) assert trainer.checkpoint_callback.dirpath == str(tmpdir / "project" / version / "checkpoints") - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=2.ckpt"} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=3.ckpt"} assert trainer.log_dir == logger.save_dir @@ -212,7 +212,7 @@ def test_wandb_log_model(wandb, monkeypatch, tmpdir): type="model", metadata={ "score": None, - "original_filename": "epoch=1-step=5-v3.ckpt", + "original_filename": "epoch=1-step=6-v3.ckpt", "ModelCheckpoint": { "monitor": None, "mode": "min", diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index cfc347293484c..9f3c63da4d1e8 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -743,7 +743,7 @@ def test_fit_loop_reset(tmpdir): trainer.fit(model) # reset state loaded from a checkpoint from mid-epoch - mid_epoch_ckpt = torch.load(str(tmpdir / "epoch=0-step=1.ckpt")) + mid_epoch_ckpt = torch.load(str(tmpdir / "epoch=0-step=2.ckpt")) fit_loop = trainer.fit_loop epoch_loop = fit_loop.epoch_loop optimizer_loop = epoch_loop.batch_loop.optimizer_loop @@ -776,7 +776,7 @@ def test_fit_loop_reset(tmpdir): assert optimizer_loop.optim_progress.optimizer_position == 1 # reset state loaded from a checkpoint from the end of an epoch - end_of_epoch_ckpt = torch.load(str(tmpdir / "epoch=0-step=3.ckpt")) + end_of_epoch_ckpt = torch.load(str(tmpdir / "epoch=0-step=4.ckpt")) fit_loop = trainer.fit_loop epoch_loop = fit_loop.epoch_loop fit_loop.restarting = False @@ -943,8 +943,7 @@ def val_dataloader(self): ) trainer.fit(model, ckpt_path=ckpt_path) - # TODO: -1 because there's a bug where global step is off by one on reload - assert trainer.global_step - 1 == expected_global_step + assert trainer.global_step == expected_global_step state_dict_after_restart = trainer.fit_loop.state_dict() diff --git a/tests/loops/test_training_loop.py b/tests/loops/test_training_loop.py index bcec1bb8bc13f..3de02d5f8bb1c 100644 --- a/tests/loops/test_training_loop.py +++ b/tests/loops/test_training_loop.py @@ -133,7 +133,7 @@ def validation_step(self, *args): # even though we stopped mid epoch, the fit loop finished normally and the current epoch was increased assert trainer.current_epoch == 1 assert 
trainer.global_step == 5 - assert model.validation_called_at == (0, 4) + assert model.validation_called_at == (0, 5) def test_warning_valid_train_step_end(tmpdir): diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 17135b98c16f5..917bb4d224194 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -199,7 +199,9 @@ def configure_optimizers(self): assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" - assert bwd_mock.call_count == 10 + # `max_steps` is fulfilled in the third batch first optimizer, but we don't check the loop + # `done` condition until all optimizers have run, so the number of backwards is higher than `max_steps` + assert bwd_mock.call_count == 6 assert isinstance(trainer.lr_scheduler_configs[0].scheduler.optimizer, optim.Adam) assert isinstance(trainer.lr_scheduler_configs[1].scheduler.optimizer, optim.SGD) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index c04e36bbc09bd..e5259c4047ad2 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -199,7 +199,9 @@ def on_train_start(self): if self.trainer.state.fn == TrainerFn.TUNING: self._test_on_val_test_predict_tune_start() else: - assert self.trainer.current_epoch == state_dict["epoch"] + # `-1` because this checkpoint is saved `on_train_epoch_end` which is considered part of the epoch so + # the `current_epoch` count has not been increased yet + assert self.trainer.current_epoch - 1 == state_dict["epoch"] assert self.trainer.global_step == state_dict["global_step"] assert self._check_model_state_dict() assert self._check_optimizers() @@ -241,8 +243,7 @@ def test_correct_step_and_epoch(tmpdir): ckpt = torch.load(ckpt_path) assert ckpt["epoch"] == first_max_epochs - # TODO(@carmocca): should not need `+1` - assert ckpt["global_step"] == first_max_epochs * train_batches + 1 + assert ckpt["global_step"] == first_max_epochs * train_batches max_epochs = first_max_epochs + 2 trainer = Trainer( @@ -255,13 +256,11 @@ def test_correct_step_and_epoch(tmpdir): class TestModel(BoringModel): def on_train_start(self) -> None: assert self.trainer.current_epoch == first_max_epochs - # TODO(@carmocca): should not need `+1` - assert self.trainer.global_step == first_max_epochs * train_batches + 1 + assert self.trainer.global_step == first_max_epochs * train_batches trainer.fit(TestModel(), ckpt_path=ckpt_path) assert trainer.current_epoch == max_epochs - # TODO(@carmocca): should not need `+1` - assert trainer.global_step == max_epochs * train_batches + 1 + assert trainer.global_step == max_epochs * train_batches def test_fit_twice(tmpdir): diff --git a/tests/plugins/test_checkpoint_io_plugin.py b/tests/plugins/test_checkpoint_io_plugin.py index 7a1352804ba3d..56aadad353b2e 100644 --- a/tests/plugins/test_checkpoint_io_plugin.py +++ b/tests/plugins/test_checkpoint_io_plugin.py @@ -52,7 +52,7 @@ def test_checkpoint_plugin_called(tmpdir): ) trainer.fit(model) - assert checkpoint_plugin.save_checkpoint.call_count == 5 + assert checkpoint_plugin.save_checkpoint.call_count == 4 assert checkpoint_plugin.remove_checkpoint.call_count == 1 trainer.test(model, ckpt_path=ck.last_model_path) @@ -71,7 +71,7 @@ def test_checkpoint_plugin_called(tmpdir): ) trainer.fit(model) - assert checkpoint_plugin.save_checkpoint.call_count == 5 + assert checkpoint_plugin.save_checkpoint.call_count == 4 assert checkpoint_plugin.remove_checkpoint.call_count == 1 trainer.test(model, 
ckpt_path=ck.last_model_path) diff --git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py index 99071ce3d8f8a..38c0a83cabb65 100644 --- a/tests/trainer/optimization/test_optimizers.py +++ b/tests/trainer/optimization/test_optimizers.py @@ -628,7 +628,7 @@ def configure_optimizers(self): def on_save_checkpoint(self, checkpoint): lr_scheduler_config = checkpoint["lr_schedulers"][0] # 2 batches ran. since the lr_scheduler_config interval is `step`, the step count should be 2 - assert self.trainer.global_step + 1 == batches # the global step hasn't been increased yet + assert self.trainer.global_step == batches compare_to = max_epochs if epoch_interval else batches assert lr_scheduler_config["_step_count"] - 1 == compare_to # step count starts at 1 assert lr_scheduler_config["_last_lr"] == [lr * gamma ** compare_to] diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index d5f0aeea1e0e4..6f4d7300220e5 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -332,7 +332,8 @@ def mock_save_function(filepath, *args): # emulate callback's calls during the training for i, loss in enumerate(losses, 1): - trainer.fit_loop.global_step = i + # sets `trainer.global_step` + trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = i trainer.callback_metrics.update({"checkpoint_on": torch.tensor(loss)}) checkpoint_callback.on_validation_end(trainer, trainer.lightning_module) trainer.fit_loop.epoch_progress.current.completed = i # sets `trainer.current_epoch` From 5aecf65911e1cfe52e2416b632f2c70084d6de5c Mon Sep 17 00:00:00 2001 From: jjenniferdai <89552168+jjenniferdai@users.noreply.github.com> Date: Mon, 7 Mar 2022 18:21:46 -0800 Subject: [PATCH 035/167] Deprecate `LightningDataModule.on_save/load_checkpoint` (#11893) --- CHANGELOG.md | 3 +++ docs/source/extensions/datamodules.rst | 8 +++--- .../trainer/configuration_validator.py | 15 +++++++++++ tests/core/test_datamodules.py | 6 ++++- tests/deprecated_api/test_remove_1-8.py | 27 +++++++++++++++++++ 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd8d25ccafe6f..bbc65633bd6f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -482,6 +482,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `PrecisionPlugin.on_{save,load}_checkpoint` in favor of `PrecisionPlugin.{state_dict,load_state_dict}` ([#11978](https://github.com/PyTorchLightning/pytorch-lightning/pull/11978)) +- Deprecated `LightningDataModule.on_save/load_checkpoint` in favor of `state_dict/load_state_dict` ([#11893](https://github.com/PyTorchLightning/pytorch-lightning/pull/11893)) + + ### Removed - Removed deprecated parameter `method` in `pytorch_lightning.utilities.model_helpers.is_overridden` ([#10507](https://github.com/PyTorchLightning/pytorch-lightning/pull/10507)) diff --git a/docs/source/extensions/datamodules.rst b/docs/source/extensions/datamodules.rst index 93eb9ce319f86..1bbcbcb83a43c 100644 --- a/docs/source/extensions/datamodules.rst +++ b/docs/source/extensions/datamodules.rst @@ -323,16 +323,16 @@ on_after_batch_transfer .. automethod:: pytorch_lightning.core.datamodule.LightningDataModule.on_after_batch_transfer :noindex: -on_load_checkpoint +load_state_dict ~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.datamodule.LightningDataModule.on_load_checkpoint +.. 
automethod:: pytorch_lightning.core.datamodule.LightningDataModule.load_state_dict :noindex: -on_save_checkpoint +state_dict ~~~~~~~~~~~~~~~~~~ -.. automethod:: pytorch_lightning.core.datamodule.LightningDataModule.on_save_checkpoint +.. automethod:: pytorch_lightning.core.datamodule.LightningDataModule.state_dict :noindex: on_train_dataloader diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index 739d49c6c2844..07ffba0860c10 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -62,6 +62,8 @@ def verify_loop_configurations(trainer: "pl.Trainer") -> None: _check_precision_plugin_checkpoint_hooks(trainer) # TODO: Delete on_pretrain_routine_start/end hooks in v1.8 _check_on_pretrain_routine(model) + # TODO: Delete CheckpointHooks off LightningDataModule in v1.8 + _check_datamodule_checkpoint_hooks(trainer) def __verify_train_val_loop_configuration(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: @@ -395,3 +397,16 @@ def _check_precision_plugin_checkpoint_hooks(trainer: "pl.Trainer") -> None: "`PrecisionPlugin.on_load_checkpoint` was deprecated in" " v1.6 and will be removed in v1.8. Use `load_state_dict` instead." ) + + +def _check_datamodule_checkpoint_hooks(trainer: "pl.Trainer") -> None: + if is_overridden(method_name="on_save_checkpoint", instance=trainer.datamodule): + rank_zero_deprecation( + "`LightningDataModule.on_save_checkpoint` was deprecated in" + " v1.6 and will be removed in v1.8. Use `state_dict` instead." + ) + if is_overridden(method_name="on_load_checkpoint", instance=trainer.datamodule): + rank_zero_deprecation( + "`LightningDataModule.on_load_checkpoint` was deprecated in" + " v1.6 and will be removed in v1.8. Use `load_state_dict` instead." + ) diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index b2a4f58762e28..f015d96fef179 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -222,7 +222,11 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: ) # fit model - trainer.fit(model, datamodule=dm) + with pytest.deprecated_call( + match="`LightningDataModule.on_save_checkpoint` was deprecated in" + " v1.6 and will be removed in v1.8. Use `state_dict` instead." 
+ ): + trainer.fit(model, datamodule=dm) assert trainer.state.finished, f"Training failed with {trainer.state}" checkpoint_path = list(trainer.checkpoint_callback.best_k_models.keys())[0] checkpoint = torch.load(checkpoint_path) diff --git a/tests/deprecated_api/test_remove_1-8.py b/tests/deprecated_api/test_remove_1-8.py index bf24f81b8edc4..0fa174e172309 100644 --- a/tests/deprecated_api/test_remove_1-8.py +++ b/tests/deprecated_api/test_remove_1-8.py @@ -36,6 +36,7 @@ from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.profiler import AbstractProfiler, AdvancedProfiler, SimpleProfiler +from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.enums import DeviceType, DistributedType @@ -740,3 +741,29 @@ def on_load_checkpoint(self, checkpoint): def test_v1_8_0_abstract_profiler(): assert "`AbstractProfiler` was deprecated in v1.6" in AbstractProfiler.__doc__ + + +def test_v1_8_0_datamodule_checkpointhooks(): + class CustomBoringDataModuleSave(BoringDataModule): + def on_save_checkpoint(self, checkpoint): + print("override on_save_checkpoint") + + class CustomBoringDataModuleLoad(BoringDataModule): + def on_load_checkpoint(self, checkpoint): + print("override on_load_checkpoint") + + trainer = Mock() + + trainer.datamodule = CustomBoringDataModuleSave() + with pytest.deprecated_call( + match="`LightningDataModule.on_save_checkpoint` was deprecated in" + " v1.6 and will be removed in v1.8. Use `state_dict` instead." + ): + _check_datamodule_checkpoint_hooks(trainer) + + trainer.datamodule = CustomBoringDataModuleLoad() + with pytest.deprecated_call( + match="`LightningDataModule.on_load_checkpoint` was deprecated in" + " v1.6 and will be removed in v1.8. Use `load_state_dict` instead." 
+ ): + _check_datamodule_checkpoint_hooks(trainer) From 09495997e63d1d5df42834a93618720c73e2d2f1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 8 Mar 2022 16:20:43 +0100 Subject: [PATCH 036/167] add Azure HPU agent (#12258) --- .azure-pipelines/hpu-tests.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .azure-pipelines/hpu-tests.yml diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml new file mode 100644 index 0000000000000..aaec3caabd0f6 --- /dev/null +++ b/.azure-pipelines/hpu-tests.yml @@ -0,0 +1,32 @@ +# Pipeline to run the HPU tests in DL1 Instance + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" +pr: + - "master" + - "release/*" + +jobs: + - job: hpu + + # how long to run the job before automatically cancelling + timeoutInMinutes: "5" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + + pool: intel-hpus + + workspace: + clean: all + + steps: + - bash: | + hwinfo --short + displayName: 'Instance HW info' From 4bd5034f6630dc5e5f02650b9dcaf390c5d7d6bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 8 Mar 2022 18:26:10 +0100 Subject: [PATCH 037/167] Add `LightningCLI(auto_registry)` (#12108) --- CHANGELOG.md | 3 ++ docs/source/common/lightning_cli.rst | 23 ++++++++ pytorch_lightning/utilities/cli.py | 51 ++++++++++++------ pytorch_lightning/utilities/meta.py | 2 +- tests/utilities/test_cli.py | 80 +++++++++++++++++++--------- 5 files changed, 115 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbc65633bd6f2..daf117eff1db6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `LightningCLI.configure_optimizers` to override the `configure_optimizers` return value ([#10860](https://github.com/PyTorchLightning/pytorch-lightning/pull/10860)) +- Added `LightningCLI(auto_registry)` flag to register all subclasses of the registerable components automatically ([#12108](https://github.com/PyTorchLightning/pytorch-lightning/pull/12108)) + + - Added a warning that shows when `max_epochs` in the `Trainer` is not set ([#10700](https://github.com/PyTorchLightning/pytorch-lightning/pull/10700)) diff --git a/docs/source/common/lightning_cli.rst b/docs/source/common/lightning_cli.rst index ec77785d8abba..7267606b05c2c 100644 --- a/docs/source/common/lightning_cli.rst +++ b/docs/source/common/lightning_cli.rst @@ -345,6 +345,29 @@ This can be useful to implement custom logic without having to subclass the CLI, and argument parsing capabilities. +Subclass registration +^^^^^^^^^^^^^^^^^^^^^ + +To use shorthand notation, the options need to be registered beforehand. This can be easily done with: + +.. code-block:: + + LightningCLI(auto_registry=True) # False by default + +which will register all subclasses of :class:`torch.optim.Optimizer`, :class:`torch.optim.lr_scheduler._LRScheduler`, +:class:`~pytorch_lightning.core.lightning.LightningModule`, +:class:`~pytorch_lightning.core.datamodule.LightningDataModule`, :class:`~pytorch_lightning.callbacks.Callback`, and +:class:`~pytorch_lightning.loggers.LightningLoggerBase` across all imported modules. This includes those in your own +code. 
+ +Alternatively, if this is left unset, only the subclasses defined in PyTorch's :class:`torch.optim.Optimizer`, +:class:`torch.optim.lr_scheduler._LRScheduler` and Lightning's :class:`~pytorch_lightning.callbacks.Callback` and +:class:`~pytorch_lightning.loggers.LightningLoggerBase` subclassess will be registered. + +In subsequent sections, we will go over adding specific classes to specific registries as well as how to use +shorthand notation. + + Trainer Callbacks and arguments with class type ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index 976f1d58b83b5..2e8d629a88cb8 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -30,6 +30,7 @@ from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _JSONARGPARSE_AVAILABLE +from pytorch_lightning.utilities.meta import get_all_subclasses from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_warn from pytorch_lightning.utilities.types import LRSchedulerType, LRSchedulerTypeTuple, LRSchedulerTypeUnion @@ -58,9 +59,8 @@ def __call__(self, cls: Type, key: Optional[str] = None, override: bool = False) elif not isinstance(key, str): raise TypeError(f"`key` must be a str, found {key}") - if key in self and not override: - raise MisconfigurationException(f"'{key}' is already present in the registry. HINT: Use `override=True`.") - self[key] = cls + if key not in self or override: + self[key] = cls return cls def register_classes(self, module: ModuleType, base_cls: Type, override: bool = False) -> None: @@ -91,10 +91,11 @@ def __str__(self) -> str: OPTIMIZER_REGISTRY = _Registry() -OPTIMIZER_REGISTRY.register_classes(torch.optim, Optimizer) - LR_SCHEDULER_REGISTRY = _Registry() -LR_SCHEDULER_REGISTRY.register_classes(torch.optim.lr_scheduler, torch.optim.lr_scheduler._LRScheduler) +CALLBACK_REGISTRY = _Registry() +MODEL_REGISTRY = _Registry() +DATAMODULE_REGISTRY = _Registry() +LOGGER_REGISTRY = _Registry() class ReduceLROnPlateau(torch.optim.lr_scheduler.ReduceLROnPlateau): @@ -103,17 +104,29 @@ def __init__(self, optimizer: Optimizer, monitor: str, *args: Any, **kwargs: Any self.monitor = monitor -LR_SCHEDULER_REGISTRY(cls=ReduceLROnPlateau) - -CALLBACK_REGISTRY = _Registry() -CALLBACK_REGISTRY.register_classes(pl.callbacks, pl.callbacks.Callback) - -MODEL_REGISTRY = _Registry() - -DATAMODULE_REGISTRY = _Registry() - -LOGGER_REGISTRY = _Registry() -LOGGER_REGISTRY.register_classes(pl.loggers, pl.loggers.LightningLoggerBase) +def _populate_registries(subclasses: bool) -> None: + if subclasses: + # this will register any subclasses from all loaded modules including userland + for cls in get_all_subclasses(torch.optim.Optimizer): + OPTIMIZER_REGISTRY(cls) + for cls in get_all_subclasses(torch.optim.lr_scheduler._LRScheduler): + LR_SCHEDULER_REGISTRY(cls) + for cls in get_all_subclasses(pl.Callback): + CALLBACK_REGISTRY(cls) + for cls in get_all_subclasses(pl.LightningModule): + MODEL_REGISTRY(cls) + for cls in get_all_subclasses(pl.LightningDataModule): + DATAMODULE_REGISTRY(cls) + for cls in get_all_subclasses(pl.loggers.LightningLoggerBase): + LOGGER_REGISTRY(cls) + else: + # manually register torch's subclasses and our subclasses + OPTIMIZER_REGISTRY.register_classes(torch.optim, Optimizer) + 
LR_SCHEDULER_REGISTRY.register_classes(torch.optim.lr_scheduler, torch.optim.lr_scheduler._LRScheduler) + CALLBACK_REGISTRY.register_classes(pl.callbacks, pl.Callback) + LOGGER_REGISTRY.register_classes(pl.loggers, pl.loggers.LightningLoggerBase) + # `ReduceLROnPlateau` does not subclass `_LRScheduler` + LR_SCHEDULER_REGISTRY(cls=ReduceLROnPlateau) class LightningArgumentParser(ArgumentParser): @@ -465,6 +478,7 @@ def __init__( subclass_mode_model: bool = False, subclass_mode_data: bool = False, run: bool = True, + auto_registry: bool = False, ) -> None: """Receives as input pytorch-lightning classes (or callables which return pytorch-lightning classes), which are called / instantiated using a parsed configuration file and / or command line args. @@ -508,6 +522,7 @@ def __init__( of the given class. run: Whether subcommands should be added to run a :class:`~pytorch_lightning.trainer.trainer.Trainer` method. If set to ``False``, the trainer and model classes will be instantiated only. + auto_registry: Whether to automatically fill up the registries with all defined subclasses. """ self.save_config_callback = save_config_callback self.save_config_filename = save_config_filename @@ -527,6 +542,8 @@ def __init__( self._datamodule_class = datamodule_class or LightningDataModule self.subclass_mode_data = (datamodule_class is None) or subclass_mode_data + _populate_registries(auto_registry) + main_kwargs, subparser_kwargs = self._setup_parser_kwargs( parser_kwargs or {}, # type: ignore # github.com/python/mypy/issues/6463 {"description": description, "env_prefix": env_prefix, "default_env": env_parse}, diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index 0b9b21193b553..d14f111e8759a 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -147,7 +147,7 @@ def init_meta(*_, **__): # https://stackoverflow.com/a/63851681/9201239 -def get_all_subclasses(cls: Type[nn.Module]) -> Set[nn.Module]: +def get_all_subclasses(cls: Type) -> Set[Type]: subclass_list = [] def recurse(cl): diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 6d705671893f4..5da16737fc2d7 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -38,6 +38,7 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.cli import ( + _populate_registries, CALLBACK_REGISTRY, DATAMODULE_REGISTRY, instantiate_class, @@ -880,27 +881,38 @@ def test_lightning_cli_run(): assert isinstance(cli.model, LightningModule) -@OPTIMIZER_REGISTRY -class CustomAdam(torch.optim.Adam): - pass +@pytest.fixture(autouse=True) +def clear_registries(): + # since the registries are global, it's good to clear them after each test to avoid unwanted interactions + yield + OPTIMIZER_REGISTRY.clear() + LR_SCHEDULER_REGISTRY.clear() + CALLBACK_REGISTRY.clear() + MODEL_REGISTRY.clear() + DATAMODULE_REGISTRY.clear() + LOGGER_REGISTRY.clear() -@LR_SCHEDULER_REGISTRY -class CustomCosineAnnealingLR(torch.optim.lr_scheduler.CosineAnnealingLR): - pass - +def test_registries(): + # the registries are global so this is only necessary when this test is run standalone + _populate_registries(False) -@CALLBACK_REGISTRY -class CustomCallback(Callback): - pass + @OPTIMIZER_REGISTRY + class CustomAdam(torch.optim.Adam): + pass + @LR_SCHEDULER_REGISTRY + class CustomCosineAnnealingLR(torch.optim.lr_scheduler.CosineAnnealingLR): + pass -@LOGGER_REGISTRY -class 
CustomLogger(LightningLoggerBase): - pass + @CALLBACK_REGISTRY + class CustomCallback(Callback): + pass + @LOGGER_REGISTRY + class CustomLogger(LightningLoggerBase): + pass -def test_registries(): assert "SGD" in OPTIMIZER_REGISTRY.names assert "RMSprop" in OPTIMIZER_REGISTRY.names assert "CustomAdam" in OPTIMIZER_REGISTRY.names @@ -913,9 +925,13 @@ def test_registries(): assert "EarlyStopping" in CALLBACK_REGISTRY.names assert "CustomCallback" in CALLBACK_REGISTRY.names - with pytest.raises(MisconfigurationException, match="is already present in the registry"): - OPTIMIZER_REGISTRY.register_classes(torch.optim, torch.optim.Optimizer) - OPTIMIZER_REGISTRY.register_classes(torch.optim, torch.optim.Optimizer, override=True) + class Foo: + ... + + OPTIMIZER_REGISTRY(Foo, key="SGD") # not overridden by default + assert OPTIMIZER_REGISTRY["SGD"] is torch.optim.SGD + OPTIMIZER_REGISTRY(Foo, key="SGD", override=True) + assert OPTIMIZER_REGISTRY["SGD"] is Foo # test `_Registry.__call__` returns the class assert isinstance(CustomCallback(), CustomCallback) @@ -924,7 +940,13 @@ def test_registries(): assert "CustomLogger" in LOGGER_REGISTRY -@MODEL_REGISTRY +def test_registries_register_automatically(): + assert "SaveConfigCallback" not in CALLBACK_REGISTRY + with mock.patch("sys.argv", ["any.py"]): + LightningCLI(BoringModel, run=False, auto_registry=True) + assert "SaveConfigCallback" in CALLBACK_REGISTRY + + class TestModel(BoringModel): def __init__(self, foo, bar=5): super().__init__() @@ -932,10 +954,10 @@ def __init__(self, foo, bar=5): self.bar = bar -MODEL_REGISTRY(cls=BoringModel) - - def test_lightning_cli_model_choices(): + MODEL_REGISTRY(cls=TestModel) + MODEL_REGISTRY(cls=BoringModel) + with mock.patch("sys.argv", ["any.py", "fit", "--model=BoringModel"]), mock.patch( "pytorch_lightning.Trainer._fit_impl" ) as run: @@ -950,7 +972,6 @@ def test_lightning_cli_model_choices(): assert cli.model.bar == 5 -@DATAMODULE_REGISTRY class MyDataModule(BoringDataModule): def __init__(self, foo, bar=5): super().__init__() @@ -958,10 +979,11 @@ def __init__(self, foo, bar=5): self.bar = bar -DATAMODULE_REGISTRY(cls=BoringDataModule) - - def test_lightning_cli_datamodule_choices(): + MODEL_REGISTRY(cls=BoringModel) + DATAMODULE_REGISTRY(cls=MyDataModule) + DATAMODULE_REGISTRY(cls=BoringDataModule) + # with set model with mock.patch("sys.argv", ["any.py", "fit", "--data=BoringDataModule"]), mock.patch( "pytorch_lightning.Trainer._fit_impl" @@ -998,7 +1020,7 @@ def test_lightning_cli_datamodule_choices(): assert not hasattr(cli.parser.groups["data"], "group_class") with mock.patch("sys.argv", ["any.py"]), mock.patch.dict(DATAMODULE_REGISTRY, clear=True): - cli = LightningCLI(BoringModel, run=False) + cli = LightningCLI(BoringModel, run=False, auto_registry=False) # no registered classes so not added automatically assert "data" not in cli.parser.groups assert len(DATAMODULE_REGISTRY) # check state was not modified @@ -1011,6 +1033,8 @@ def test_lightning_cli_datamodule_choices(): @pytest.mark.parametrize("use_class_path_callbacks", [False, True]) def test_registries_resolution(use_class_path_callbacks): + MODEL_REGISTRY(cls=BoringModel) + """This test validates registries are used when simplified command line are being used.""" cli_args = [ "--optimizer", @@ -1067,6 +1091,7 @@ def test_argv_transformation_single_callback(): } ] expected = base + ["--trainer.callbacks", str(callbacks)] + _populate_registries(False) argv = LightningArgumentParser._convert_argv_issue_85(CALLBACK_REGISTRY.classes, 
"trainer.callbacks", input) assert argv == expected @@ -1090,6 +1115,7 @@ def test_argv_transformation_multiple_callbacks(): }, ] expected = base + ["--trainer.callbacks", str(callbacks)] + _populate_registries(False) argv = LightningArgumentParser._convert_argv_issue_85(CALLBACK_REGISTRY.classes, "trainer.callbacks", input) assert argv == expected @@ -1117,6 +1143,7 @@ def test_argv_transformation_multiple_callbacks_with_config(): ] expected = base + ["--trainer.callbacks", str(callbacks)] nested_key = "trainer.callbacks" + _populate_registries(False) argv = LightningArgumentParser._convert_argv_issue_85(CALLBACK_REGISTRY.classes, nested_key, input) assert argv == expected @@ -1153,6 +1180,7 @@ def test_argv_transformation_multiple_callbacks_with_config(): def test_argv_transformations_with_optimizers_and_lr_schedulers(args, expected, nested_key, registry): base = ["any.py", "--trainer.max_epochs=1"] argv = base + args + _populate_registries(False) new_argv = LightningArgumentParser._convert_argv_issue_84(registry.classes, nested_key, argv) assert new_argv == base + [f"--{nested_key}", str(expected)] From bd76456a6ccc41c9e043624dbc5004a18d9a8866 Mon Sep 17 00:00:00 2001 From: Kushashwa Ravi Shrimali Date: Tue, 8 Mar 2022 23:32:32 +0530 Subject: [PATCH 038/167] Drop PyTorch 1.7 testing from the CI (#12191) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli Co-authored-by: Aki Nitta --- .github/workflows/ci_dockers.yml | 6 ++---- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 3 --- .github/workflows/events-nightly.yml | 4 ++-- .github/workflows/release-docker.yml | 2 +- CHANGELOG.md | 3 +++ README.md | 16 ++++++++-------- environment.yml | 6 +++--- requirements.txt | 2 +- requirements/examples.txt | 2 +- requirements/extra.txt | 2 +- 11 files changed, 23 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 13f3148054264..e517d5d7c15ee 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -97,7 +97,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 @@ -124,9 +124,7 @@ jobs: matrix: # the config used in 'dockers/ipu-ci-runner/Dockerfile' python_version: ["3.9"] # latest - # TODO: upgrade - PopTorch 2.2 uses torch 1.9, see: - # https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html#version-compatibility - pytorch_version: ["1.7"] + pytorch_version: ["1.9"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 69ac71f10121d..96932cf8cbb6c 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full - pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # nightly: add when there's a release candidate + pytorch-version: ["1.8", "1.9", "1.10"] # nightly: add when there's a release candidate timeout-minutes: 30 steps: diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 87d3665d62983..115f1fd2ad5c8 100644 --- a/.github/workflows/ci_test-full.yml +++ 
b/.github/workflows/ci_test-full.yml @@ -24,9 +24,6 @@ jobs: python-version: ["3.7", "3.9"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] - exclude: - # Skip if torch<1.8 and py3.9 on Linux: https://github.com/pytorch/pytorch/issues/50014 - - {os: ubuntu-20.04, python-version: "3.9", requires: "oldest"} #include: # nightly: add when there's a release candidate #- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index ee60736454a97..7cb795bfccf67 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -125,7 +125,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.8", "1.9", "1.10"] steps: - name: Checkout @@ -165,7 +165,7 @@ jobs: matrix: # the config used in 'dockers/ipu-ci-runner/Dockerfile' include: - - {python_version: "3.9", pytorch_version: "1.7"} + - {python_version: "3.9", pytorch_version: "1.9"} steps: - name: Checkout diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 39e25980862e5..6c9d22f455545 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python_version: ["3.7", "3.8", "3.9"] - pytorch_version: ["1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index daf117eff1db6..06bc97f644378 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,6 +149,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Drop PyTorch 1.7 support ([#12191](https://github.com/PyTorchLightning/pytorch-lightning/pull/12191)) + + - Make `benchmark` flag optional and set its value based on the deterministic flag ([#11944](https://github.com/PyTorchLightning/pytorch-lightning/pull/11944)) diff --git a/README.md b/README.md index e0504ced8ba42..d79463d0cfb02 100644 --- a/README.md +++ b/README.md @@ -78,14 +78,14 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
-| System / PyTorch ver. | 1.7 (min. req.) | 1.8 (LTS) | 1.9 | 1.10 (latest) | -| :------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | [![Build Status]()](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | - | [![CircleCI](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master.svg?style=svg)](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master) | - | - | -| Linux py3.8 (with Conda | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | 
[![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) | +| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master.svg?style=svg)](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master) | - | - | +| Linux py3.8 (with Conda | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ diff --git a/environment.yml b/environment.yml index cc5273f46570d..28feb0b083012 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: - python>=3.7 - pip>20.1 - numpy>=1.17.2 - - pytorch>=1.7.* + - pytorch>=1.8.* - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,10 +41,10 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 - - torchtext>=0.8.* + - torchtext>=0.9.* # Examples - - torchvision>=0.8.* + - torchvision>=0.9.* - pip: - test-tube>=0.7.5 diff --git a/requirements.txt b/requirements.txt index 5cad90d9b9278..6aa080fc7e8fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.17.2 -torch>=1.7.* +torch>=1.8.* tqdm>=4.41.0 PyYAML>=5.4 fsspec[http]>=2021.05.0, !=2021.06.0 diff --git a/requirements/examples.txt b/requirements/examples.txt index 0e00654a8e223..501d413de90ce 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.8.* +torchvision>=0.9.* gym[classic_control]>=0.17.0 ipython[all] diff --git a/requirements/extra.txt b/requirements/extra.txt index e70e3f6254216..bafa62b9bf57e 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -2,7 +2,7 @@ matplotlib>3.1 horovod>=0.21.2,<0.24 # no need to install with [pytorch] as pytorch is already installed -torchtext>=0.8.* +torchtext>=0.9.* omegaconf>=2.0.5 hydra-core>=1.0.5 jsonargparse[signatures]>=4.3.0 From 80b8d018af60121663261c02e56a316db75093d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 8 Mar 2022 19:10:18 +0100 Subject: [PATCH 039/167] Have the outputs match the loops format (#12182) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 8 +- pytorch_lightning/core/lightning.py | 7 +- .../loops/epoch/training_epoch_loop.py | 74 ++-- pytorch_lightning/loops/fit_loop.py | 2 +- pytorch_lightning/loops/utilities.py | 9 +- tests/deprecated_api/test_remove_1-8.py | 50 +++ tests/loops/epoch/test_training_epoch_loop.py | 317 +++++++++++------- .../loops/optimization/test_optimizer_loop.py | 5 +- tests/loops/test_utilities.py | 26 +- .../optimization/test_multiple_optimizers.py | 5 +- 10 files changed, 350 insertions(+), 153 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06bc97f644378..6de78b47817d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Broadcast the `_terminate_gracefully` to all processes and add support for DDP ([#10638](https://github.com/PyTorchLightning/pytorch-lightning/pull/10638)) -- Added support for re-instantiation of custom (subclasses of) `DataLoaders` returned in the `*_dataloader()` methods, i.e., automatic replacement of samplers now works with custom types of `DataLoader` ([#10680](https://github.com/PyTorchLightning/pytorch-lightning/pull/10639)) +- Added support for re-instantiation of custom (subclasses of) `DataLoaders` returned in the `*_dataloader()` methods, i.e., automatic replacement of samplers now works with custom types of `DataLoader` ([#10680](https://github.com/PyTorchLightning/pytorch-lightning/pull/10680)) - Added a function to validate if fault tolerant training is supported. 
([#10465](https://github.com/PyTorchLightning/pytorch-lightning/pull/10465)) @@ -410,6 +410,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `TrainerOptimizersMixin` and moved functionality to `core/optimizer.py`([#11155](https://github.com/PyTorchLightning/pytorch-lightning/pull/11155)) +- Deprecated the `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#12182](https://github.com/PyTorchLightning/pytorch-lightning/pull/12182)) + + +- Deprecated the `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#12182](https://github.com/PyTorchLightning/pytorch-lightning/pull/12182)) + + - Deprecated `TrainerCallbackHookMixin` ([#11148](https://github.com/PyTorchLightning/pytorch-lightning/pull/11148)) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ba8e23d18dcf7..ca3e6d653c799 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -713,10 +713,9 @@ def training_epoch_end(self, outputs: EPOCH_OUTPUT) -> None: training_epoch_end(train_outs) Args: - outputs: List of outputs you defined in :meth:`training_step`. - If there are multiple optimizers, it is a list containing a list of outputs for each optimizer. - If using ``truncated_bptt_steps > 1``, each element is a list of outputs corresponding to the outputs - of each processed split batch. + outputs: List of outputs you defined in :meth:`training_step`. If there are multiple optimizers or when + using ``truncated_bptt_steps > 0``, the lists have the dimensions + (n_batches, tbptt_steps, n_optimizers). Dimensions of length 1 are squeezed. Return: None diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 67400ce0472de..91e01b4c0ab6b 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -18,10 +18,11 @@ import numpy as np import torch +import pytorch_lightning as pl from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.loops.batch.training_batch_loop import _OUTPUTS_TYPE as _BATCH_OUTPUTS_TYPE -from pytorch_lightning.loops.utilities import _get_active_optimizers, _is_max_limit_reached +from pytorch_lightning.loops.utilities import _get_active_optimizers, _is_max_limit_reached, _v1_8_output_format from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import BatchProgress, SchedulerProgress from pytorch_lightning.trainer.supporters import CombinedLoader @@ -216,7 +217,7 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None: # type: ignore[ov batch_end_outputs = self._prepare_outputs_training_batch_end( batch_output, - automatic=self.trainer.lightning_module.trainer.lightning_module.automatic_optimization, + lightning_module=self.trainer.lightning_module, num_optimizers=len(self.trainer.optimizers), ) @@ -337,26 +338,38 @@ def _should_accumulate(self) -> bool: @staticmethod def _prepare_outputs_training_batch_end( batch_output: _BATCH_OUTPUTS_TYPE, - automatic: bool, + lightning_module: "pl.LightningModule", num_optimizers: int, ) -> Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]: - """Processes the outputs from the batch loop into the format passed to the ``training_batch_end`` hook. 
- - ``(tbptt_steps, n_opt) -> (n_opt, tbptt_steps)``. The optimizer dimension might have been squeezed. - """ + """Processes the outputs from the batch loop into the format passed to the ``on_train_batch_end`` hook.""" if not batch_output: return [] # convert optimizer dicts to list - if automatic: + if lightning_module.automatic_optimization: batch_output = apply_to_collection( batch_output, dtype=dict, function=_convert_optim_dict, num_optimizers=num_optimizers ) - array = np.array(batch_output, dtype=object) - if array.ndim == 1: - array = np.expand_dims(array, 1) - array = array.transpose((1, 0)) + array = np.array(batch_output, dtype=object) + # TODO: remove in v1.8 + if ( + num_optimizers > 1 + and lightning_module.truncated_bptt_steps > 0 + and not _v1_8_output_format(lightning_module.on_train_batch_end) + ): + rank_zero_deprecation( + "You are training with multiple optimizers AND truncated backpropagation through time enabled." + " The current format of the `on_train_batch_end(outputs, ...)` is a 2d list with sizes" + " (n_optimizers, tbptt_steps), however, this has been deprecated and will change in version v1.8 to" + " (tbptt_steps, n_optimizers). You can update your code by adding the following parameter to your" + " hook signature: `on_train_batch_end(outputs, ..., new_format=True)`." + ) + # (tbptt_steps, n_opt) -> (n_opt, tbptt_steps) + if array.ndim == 1: + array = np.expand_dims(array, 1) + array = array.transpose((1, 0)) + # squeeze all single-element dimensions array = array.squeeze() array = array.tolist() array = _recursive_unpad(array) @@ -365,35 +378,42 @@ def _prepare_outputs_training_batch_end( @staticmethod def _prepare_outputs_training_epoch_end( batch_outputs: _OUTPUTS_TYPE, - automatic: bool, + lightning_module: "pl.LightningModule", num_optimizers: int, ) -> Union[List[List[List[Dict[str, Any]]]], List[List[Dict[str, Any]]], List[Dict[str, Any]]]: - """Processes the outputs from the batch loop into the format passed to the ``training_epoch_end`` hook. - - ``(n_batches, tbptt_steps, n_opt) -> (n_opt, n_batches, tbptt_steps)``. - All single-element dimensions might have been squeezed. - - This processing is necessary because the format of the inputs to the ``training_epoch_end`` hook does not - match the loop structure and because empty dimensions are squeezed. This could break with loop customization. - """ + """Processes the outputs from the batch loop into the format passed to the ``training_epoch_end`` hook.""" # `batch_outputs` (plural) is the same as `epoch_end_output` (singular) if not batch_outputs: return [] # convert optimizer dicts to list - if automatic: + if lightning_module.automatic_optimization: batch_outputs = apply_to_collection( batch_outputs, dtype=dict, function=_convert_optim_dict, num_optimizers=num_optimizers ) array = _recursive_pad(batch_outputs) - if array.ndim == 2: - array = np.expand_dims(array, 2) - array = array.transpose((2, 0, 1)) + # TODO: remove in v1.8 + if ( + num_optimizers > 1 + and lightning_module.truncated_bptt_steps > 0 + and not _v1_8_output_format(lightning_module.on_train_epoch_end) + ): + rank_zero_deprecation( + "You are training with multiple optimizers AND truncated backpropagation through time enabled." + " The current format of the `training_epoch_end(outputs)` is a 3d list with sizes" + " (n_optimizers, n_batches, tbptt_steps), however, this has been deprecated and will change in version" + " v1.8 to (n_batches, tbptt_steps, n_optimizers). 
You can update your code by adding the following" + " parameter to your hook signature: `training_epoch_end(outputs, new_format=True)`." + ) + # (n_batches, tbptt_steps, n_opt) -> (n_opt, n_batches, tbptt_steps) + if array.ndim == 2: + array = np.expand_dims(array, 2) + array = array.transpose((2, 0, 1)) + # squeeze all single-element dimensions array = array.squeeze() array = array.tolist() array = _recursive_unpad(array) - # in case we squeezed from 1-element array to a 0-dim array array = array if isinstance(array, list) else [array] # remove residual empty lists @@ -519,7 +539,7 @@ def _reload_dataloader_state_dict(self, data_fetcher: AbstractDataFetcher) -> No self._dataloader_state_dict = None -def _convert_optim_dict(outs: Dict[int, Dict[str, Any]], num_optimizers: int) -> List[Dict[str, Any]]: +def _convert_optim_dict(outs: Dict[int, Dict[str, Any]], num_optimizers: int) -> List[Optional[Dict[str, Any]]]: """Converts an optimizer dict to a list in which the key of the dict determines the position of the element. Example:: diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 7087bcbad0442..5111969ca79db 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -276,7 +276,7 @@ def on_advance_end(self) -> None: if is_overridden("training_epoch_end", model) and self._outputs: epoch_end_outputs = self.epoch_loop._prepare_outputs_training_epoch_end( self._outputs, - automatic=model.automatic_optimization, + lightning_module=model, num_optimizers=len(self.trainer.optimizers), ) # run lightning module hook training_epoch_end diff --git a/pytorch_lightning/loops/utilities.py b/pytorch_lightning/loops/utilities.py index 13ae87fad50d1..d84c195d758f9 100644 --- a/pytorch_lightning/loops/utilities.py +++ b/pytorch_lightning/loops/utilities.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import inspect from collections import OrderedDict from contextlib import contextmanager from datetime import timedelta from functools import lru_cache -from typing import Any, Dict, Generator, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Generator, List, Optional, Sequence, Tuple, Union import numpy as np import torch @@ -221,3 +222,9 @@ def _reset_progress(loop: Loop) -> None: v.reset() elif isinstance(v, Loop): _reset_progress(v) + + +# TODO: remove in v1.8 +def _v1_8_output_format(fx: Callable) -> bool: + parameters = inspect.signature(fx).parameters + return "new_format" in parameters and parameters["new_format"].default is True diff --git a/tests/deprecated_api/test_remove_1-8.py b/tests/deprecated_api/test_remove_1-8.py index 0fa174e172309..4bfc8b6d361f8 100644 --- a/tests/deprecated_api/test_remove_1-8.py +++ b/tests/deprecated_api/test_remove_1-8.py @@ -42,6 +42,7 @@ from pytorch_lightning.utilities.enums import DeviceType, DistributedType from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn +from tests.deprecated_api import no_deprecated_call from tests.helpers.boring_model import BoringDataModule, BoringModel from tests.helpers.runif import RunIf from tests.helpers.torchtext_utils import get_dummy_torchtext_data_iterator @@ -652,6 +653,55 @@ def test_v1_8_0_weights_save_path(tmpdir): _ = trainer.weights_save_path +def test_deprecated_epoch_outputs_format(tmpdir): + class DeprecationModel(BoringModel): + def __init__(self): + super().__init__() + self.truncated_bptt_steps = 1 + + def training_step(self, batch, batch_idx, optimizer_idx, hiddens): + output = super().training_step(batch, batch_idx) + output["hiddens"] = hiddens + return output + + def tbptt_split_batch(self, batch, split_size): + return [batch, batch] + + def training_epoch_end(self, outputs): + ... + + def on_train_batch_end(self, outputs, batch, batch_idx) -> None: + ... + + def configure_optimizers(self): + return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())] + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + model = DeprecationModel() + batch_match = r"on_train_batch_end.*will change in version v1.8 to \(tbptt_steps, n_optimizers\)" + with pytest.deprecated_call(match=batch_match): + trainer.fit(model) + + class DeprecationModel2(DeprecationModel): + def on_train_batch_end(self, *args, new_format=True): + ... + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + model = DeprecationModel() + epoch_match = r"training_epoch_end.*will change in version v1.8 to \(n_batches, tbptt_steps, n_optimizers\)" + with pytest.deprecated_call(match=epoch_match): + trainer.fit(model) + + class NoDeprecationModel(DeprecationModel2): + def training_epoch_end(self, outputs, new_format=True): + ... 
+ + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + model = NoDeprecationModel() + with no_deprecated_call(match="will change in version v1.8.*new_format=True"): + trainer.fit(model) + + @pytest.mark.flaky(reruns=3) @pytest.mark.parametrize(["action", "expected"], [("a", [3, 1]), ("b", [2]), ("c", [1])]) def test_simple_profiler_iterable_durations(tmpdir, action: str, expected: list): diff --git a/tests/loops/epoch/test_training_epoch_loop.py b/tests/loops/epoch/test_training_epoch_loop.py index 6159809bce092..ed3a853644ace 100644 --- a/tests/loops/epoch/test_training_epoch_loop.py +++ b/tests/loops/epoch/test_training_epoch_loop.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest.mock import patch +from unittest import mock +from unittest.mock import Mock, patch import pytest from pytorch_lightning.loops import TrainingEpochLoop from pytorch_lightning.trainer.trainer import Trainer +from tests.deprecated_api import no_deprecated_call from tests.helpers.boring_model import BoringModel _out00 = {"loss": 0.0} @@ -29,122 +31,213 @@ _out13 = {"loss": 1.3} -@pytest.mark.parametrize( - "num_optimizers,batch_outputs,expected", - [ - (1, [], []), - (1, [[]], []), - # 1 batch - (1, [[{0: _out00}]], [_out00]), - # 2 batches - (1, [[{0: _out00}], [{0: _out01}]], [_out00, _out01]), - # 1 batch, 2 optimizers - (2, [[{0: _out00, 1: _out01}]], [_out00, _out01]), - # 2 batches, 2 optimizers - ( - 2, - [[{0: _out00, 1: _out01}], [{0: _out10, 1: _out11}]], - [[_out00, _out10], [_out01, _out11]], - ), - # 4 batches, 2 optimizers, different frequency - ( - 2, - [[{0: _out00}], [{1: _out10}], [{1: _out11}], [{0: _out01}]], - [[_out00, _out01], [_out10, _out11]], - ), - # 1 batch, tbptt with 2 splits (uneven) - (1, [[{0: _out00}, {0: _out01}], [{0: _out03}]], [[_out00, _out01], [_out03]]), - # 3 batches, tbptt with 2 splits, 2 optimizers alternating - ( - 2, - [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], - [[[_out00, _out01], [], [_out02, _out03]], [[], [_out10, _out11], []]], - ), - ], -) -def test_prepare_outputs_training_epoch_end_automatic(num_optimizers, batch_outputs, expected): - """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook - currently expects in the case of automatic optimization.""" - prepared = TrainingEpochLoop._prepare_outputs_training_epoch_end( - batch_outputs, - automatic=True, - num_optimizers=num_optimizers, +class TestPrepareOutputs: + def prepare_outputs(self, fn, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization): + lightning_module = Mock() + lightning_module.automatic_optimization = automatic_optimization + lightning_module.truncated_bptt_steps = tbptt_splits + match = "will change in version v1.8.*new_format=True" + will_warn = tbptt_splits and num_optimizers > 1 and not new_format + ctx_manager = pytest.deprecated_call if will_warn else no_deprecated_call + with ctx_manager(match=match): + with mock.patch( + "pytorch_lightning.loops.epoch.training_epoch_loop._v1_8_output_format", return_value=new_format + ): + return fn( + batch_outputs, + lightning_module=lightning_module, + num_optimizers=num_optimizers, # does not matter for manual optimization + ) + + def prepare_outputs_training_epoch_end( + self, tbptt_splits, new_format, batch_outputs, num_optimizers, 
automatic_optimization=True + ): + return self.prepare_outputs( + TrainingEpochLoop._prepare_outputs_training_epoch_end, + tbptt_splits, + new_format, + batch_outputs, + num_optimizers, + automatic_optimization=automatic_optimization, + ) + + def prepare_outputs_training_batch_end( + self, tbptt_splits, new_format, batch_outputs, num_optimizers, automatic_optimization=True + ): + return self.prepare_outputs( + TrainingEpochLoop._prepare_outputs_training_batch_end, + tbptt_splits, + new_format, + batch_outputs, + num_optimizers, + automatic_optimization=automatic_optimization, + ) + + @pytest.mark.parametrize( + "num_optimizers,tbptt_splits,batch_outputs,expected", + [ + (1, 0, [], []), + (1, 0, [[]], []), + # 1 batch + (1, 0, [[{0: _out00}]], [_out00]), + # 2 batches + (1, 0, [[{0: _out00}], [{0: _out01}]], [_out00, _out01]), + # 1 batch, 2 optimizers + (2, 0, [[{0: _out00, 1: _out01}]], [_out00, _out01]), + # 2 batches, 2 optimizers + (2, 0, [[{0: _out00, 1: _out01}], [{0: _out10, 1: _out11}]], [[_out00, _out01], [_out10, _out11]]), + # 4 batches, 2 optimizers, different frequency + ( + 2, + 0, + [[{0: _out00}], [{1: _out10}], [{1: _out11}], [{0: _out01}]], + [[_out00], [_out10], [_out11], [_out01]], + ), + # 1 batch, tbptt with 2 splits (uneven) + (1, 2, [[{0: _out00}, {0: _out01}], [{0: _out03}]], [[_out00, _out01], [_out03]]), + ], + ) + @pytest.mark.parametrize("new_format", (False, True)) + def test_prepare_outputs_training_epoch_end_automatic( + self, num_optimizers, tbptt_splits, batch_outputs, expected, new_format + ): + """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook + currently expects in the case of automatic optimization.""" + assert ( + self.prepare_outputs_training_epoch_end(tbptt_splits, new_format, batch_outputs, num_optimizers) == expected + ) + + @pytest.mark.parametrize( + "num_optimizers,tbptt_splits,batch_outputs,expected", + [ + # 3 batches, tbptt with 2 splits, 2 optimizers alternating + ( + 2, + 2, + [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], + [[[_out00, _out01], [], [_out02, _out03]], [[], [_out10, _out11], []]], + ) + ], + ) + def test_prepare_outputs_training_epoch_end_automatic_old_format( + self, num_optimizers, tbptt_splits, batch_outputs, expected + ): + assert self.prepare_outputs_training_epoch_end(tbptt_splits, False, batch_outputs, num_optimizers) == expected + + @pytest.mark.parametrize( + "num_optimizers,tbptt_splits,batch_outputs,expected", + [ + # 3 batches, tbptt with 2 splits, 2 optimizers alternating + ( + 2, + 2, + [[{0: _out00}, {0: _out01}], [{1: _out10}, {1: _out11}], [{0: _out02}, {0: _out03}]], + [[[_out00], [_out01]], [[_out10], [_out11]], [[_out02], [_out03]]], + ) + ], + ) + def test_prepare_outputs_training_epoch_end_automatic_new_format( + self, num_optimizers, tbptt_splits, batch_outputs, expected + ): + """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook + currently expects in the case of automatic optimization.""" + assert self.prepare_outputs_training_epoch_end(tbptt_splits, True, batch_outputs, num_optimizers) == expected + + @pytest.mark.parametrize( + "batch_outputs,expected", + [ + ([], []), + ([[]], []), + # 1 batch + ([[_out00]], [_out00]), + # 2 batches + ([[_out00], [_out01]], [_out00, _out01]), + # skipped outputs + ([[_out00], [], [], [_out03]], [_out00, _out03]), + # tbptt with 2 splits, uneven, skipped output + ([[_out00, _out01], [_out02, _out03], [], [_out10]], 
[[_out00, _out01], [_out02, _out03], [_out10]]), + ], + ) + @pytest.mark.parametrize("new_format", (False, True)) + def test_prepare_outputs_training_epoch_end_manual(self, batch_outputs, expected, new_format): + """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook + currently expects in the case of manual optimization.""" + assert ( + self.prepare_outputs_training_epoch_end(0, new_format, batch_outputs, -1, automatic_optimization=False) + == expected + ) + + @pytest.mark.parametrize( + "num_optimizers,tbptt_splits,batch_end_outputs,expected", + [ + (1, 0, [], []), + (1, 0, [[]], []), + # 1 optimizer + (1, 0, [{0: _out00}], _out00), + # 2 optimizers + (2, 0, [{0: _out00, 1: _out01}], [_out00, _out01]), + # tbptt with 2 splits + (1, 2, [{0: _out00}, {0: _out01}], [_out00, _out01]), + ], ) - assert prepared == expected - - -@pytest.mark.parametrize( - "batch_outputs,expected", - [ - ([], []), - ([[]], []), - # 1 batch - ([[_out00]], [_out00]), - # 2 batches - ([[_out00], [_out01]], [_out00, _out01]), - # skipped outputs - ([[_out00], [], [], [_out03]], [_out00, _out03]), - # tbptt with 2 splits, uneven, skipped output - ([[_out00, _out01], [_out02, _out03], [], [_out10]], [[_out00, _out01], [_out02, _out03], [_out10]]), - ], -) -def test_prepare_outputs_training_epoch_end_manual(batch_outputs, expected): - """Test that the loop converts the nested lists of outputs to the format that the `training_epoch_end` hook - currently expects in the case of manual optimization.""" - prepared = TrainingEpochLoop._prepare_outputs_training_epoch_end( - batch_outputs, - automatic=False, - num_optimizers=-1, # does not matter for manual optimization + @pytest.mark.parametrize("new_format", (False, True)) + def test_prepare_outputs_training_batch_end_automatic( + self, num_optimizers, tbptt_splits, batch_end_outputs, expected, new_format + ): + """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook + currently expects in the case of automatic optimization.""" + + assert ( + self.prepare_outputs_training_batch_end(tbptt_splits, new_format, batch_end_outputs, num_optimizers) + == expected + ) + + @pytest.mark.parametrize( + "num_optimizers,tbptt_splits,batch_end_outputs,expected", + # 2 optimizers, tbptt with 2 splits + [(2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out10], [_out01, _out11]])], ) - assert prepared == expected - - -@pytest.mark.parametrize( - "num_optimizers,batch_end_outputs,expected", - [ - (1, [], []), - (1, [[]], []), - # 1 optimizer - (1, [{0: _out00}], _out00), - # 2 optimizers - (2, [{0: _out00, 1: _out01}], [_out00, _out01]), - # tbptt with 2 splits - (1, [{0: _out00}, {0: _out01}], [_out00, _out01]), + def test_prepare_outputs_training_batch_end_automatic_old_format( + self, num_optimizers, tbptt_splits, batch_end_outputs, expected + ): + """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook + currently expects in the case of automatic optimization.""" + assert ( + self.prepare_outputs_training_batch_end(tbptt_splits, False, batch_end_outputs, num_optimizers) == expected + ) + + @pytest.mark.parametrize( + "num_optimizers,tbptt_splits,batch_end_outputs,expected", # 2 optimizers, tbptt with 2 splits - (2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out10], [_out01, _out11]]), - ], -) -def test_prepare_outputs_training_batch_end_automatic(num_optimizers, batch_end_outputs, expected): - 
"""Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook - currently expects in the case of automatic optimization.""" - prepared = TrainingEpochLoop._prepare_outputs_training_batch_end( - batch_end_outputs, - automatic=True, - num_optimizers=num_optimizers, + [(2, 2, [{0: _out00, 1: _out01}, {0: _out10, 1: _out11}], [[_out00, _out01], [_out10, _out11]])], ) - assert prepared == expected - - -@pytest.mark.parametrize( - "batch_end_outputs,expected", - [ - ([], []), - ([[]], []), - # skipped outputs - ([_out00, None, _out02], [_out00, _out02]), - # tbptt with 3 splits, skipped output - ([_out00, _out01, None, _out03], [_out00, _out01, _out03]), - ], -) -def test_prepare_outputs_training_batch_end_manual(batch_end_outputs, expected): - """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook - currently expects in the case of manual optimization.""" - prepared = TrainingEpochLoop._prepare_outputs_training_batch_end( - batch_end_outputs, - automatic=False, - num_optimizers=-1, # does not matter for manual optimization + def test_prepare_outputs_training_batch_end_automatic_new_format( + self, num_optimizers, tbptt_splits, batch_end_outputs, expected + ): + """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook + currently expects in the case of automatic optimization.""" + assert ( + self.prepare_outputs_training_batch_end(tbptt_splits, True, batch_end_outputs, num_optimizers) == expected + ) + + @pytest.mark.parametrize( + "batch_end_outputs,expected", + [ + ([], []), + ([[]], []), + # skipped outputs + ([_out00, None, _out02], [_out00, _out02]), + # tbptt with 3 splits, skipped output + ([_out00, _out01, None, _out03], [_out00, _out01, _out03]), + ], ) - assert prepared == expected + def test_prepare_outputs_training_batch_end_manual(self, batch_end_outputs, expected): + """Test that the loop converts the nested lists of outputs to the format that the `on_train_batch_end` hook + currently expects in the case of manual optimization.""" + assert ( + self.prepare_outputs_training_batch_end(0, False, batch_end_outputs, -1, automatic_optimization=False) + == expected + ) def test_no_val_on_train_epoch_loop_restart(tmpdir): diff --git a/tests/loops/optimization/test_optimizer_loop.py b/tests/loops/optimization/test_optimizer_loop.py index 15fba4d0c194a..d04cc7343c9eb 100644 --- a/tests/loops/optimization/test_optimizer_loop.py +++ b/tests/loops/optimization/test_optimizer_loop.py @@ -109,16 +109,13 @@ class CurrentModel(BoringModel): def training_step(self, batch, batch_idx, optimizer_idx): return super().training_step(batch, batch_idx) - def training_epoch_end(self, outputs): - assert len(outputs[0]) == sum(idx == 0 for idx, _ in expected) - assert len(outputs[1]) == sum(idx == 1 for idx, _ in expected) - def configure_optimizers(self): opt0 = SGD(self.parameters(), lr=0.1) opt1 = Adam(self.parameters(), lr=0.1) return {"optimizer": opt0, "frequency": frequencies[0]}, {"optimizer": opt1, "frequency": frequencies[1]} model = CurrentModel() + model.training_epoch_end = None model.optimizer_step = Mock(wraps=model.optimizer_step) trainer = Trainer( default_root_dir=tmpdir, diff --git a/tests/loops/test_utilities.py b/tests/loops/test_utilities.py index f632ce6e215f2..c5d2e98d008b0 100644 --- a/tests/loops/test_utilities.py +++ b/tests/loops/test_utilities.py @@ -14,7 +14,7 @@ import pytest import torch -from pytorch_lightning.loops.utilities 
import _extract_hiddens +from pytorch_lightning.loops.utilities import _extract_hiddens, _v1_8_output_format from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -37,3 +37,27 @@ def test_extract_hiddens(): # tbptt enabled, no hiddens return with pytest.raises(MisconfigurationException, match="enabled `truncated_bptt_steps` but did not `return"): _extract_hiddens(None, 1) + + +def test_v1_8_output_format(): + # old format + def training_epoch_end(outputs): + ... + + assert not _v1_8_output_format(training_epoch_end) + + def training_epoch_end(outputs, new_format=1): + ... + + assert not _v1_8_output_format(training_epoch_end) + + def training_epoch_end(outputs, new_format=False): + ... + + assert not _v1_8_output_format(training_epoch_end) + + # new format + def training_epoch_end(outputs, new_format=True): + ... + + assert _v1_8_output_format(training_epoch_end) diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py index 30ee7c635b0bf..41d380b4aa8a7 100644 --- a/tests/trainer/optimization/test_multiple_optimizers.py +++ b/tests/trainer/optimization/test_multiple_optimizers.py @@ -163,8 +163,9 @@ def training_step(self, batch, batch_idx, optimizer_idx): return loss def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 + # outputs should be an array of batches with an entry per optimizer + assert len(outputs) == limit_train_batches + assert all(len(o) == 2 for o in outputs) def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, **_): # update first optimizer every step From c168db541de7bfb617cac377d811b375cd8cd207 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 9 Mar 2022 14:12:19 +0200 Subject: [PATCH 040/167] Address review comments Update doc string and remove unneeded calls, params Signed-off-by: Jerome --- pytorch_lightning/core/lightning.py | 8 -------- pytorch_lightning/lite/lite.py | 1 - pytorch_lightning/plugins/__init__.py | 1 + pytorch_lightning/strategies/hpu.py | 4 ---- pytorch_lightning/utilities/imports.py | 4 ++-- tests/accelerators/test_hpu.py | 2 +- tests/helpers/runif.py | 1 + 7 files changed, 5 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ca3e6d653c799..699e138fe407a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -223,14 +223,6 @@ def on_gpu(self): """ return self.device.type == "cuda" - @property - def on_hpu(self): - """True if your model is currently running on HPUs. - - Useful to set flags around the LightningModule for different CPU vs GPU vs HPU behavior. 
- """ - return self.device.type == "hpu" - @property def automatic_optimization(self) -> bool: """If set to ``False`` you are responsible for calling ``.backward()``, ``.step()``, ``.zero_grad()``.""" diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index 8994924e6ce66..117125f29d8db 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -86,7 +86,6 @@ def __init__( devices=devices, tpu_cores=tpu_cores, ipus=None, - hpus=None, accelerator=accelerator, strategy=strategy, gpus=gpus, diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 1d384412f252a..0c786776af981 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -41,6 +41,7 @@ "CheckpointIO", "TorchCheckpointIO", "XLACheckpointIO", + "HPUCheckpointIO", "ApexMixedPrecisionPlugin", "DataParallelPlugin", "DDP2Plugin", diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 651f56ea9a135..2614f02da5896 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -65,10 +65,6 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def model_to_device(self) -> None: self.model.to(self.root_device) - @property - def on_hpu(self) -> bool: - return True - def pre_dispatch(self) -> None: if isinstance(self.device, int): self.device = torch.device(self.device) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index f9fded925dff6..9e6778e46ba00 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -135,9 +135,9 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IPU_AVAILABLE = False if _HABANA_FRAMEWORK_AVAILABLE: - from habana_frameworks.torch.utils.library_loader import is_habana_available + from habana_frameworks.torch.utils.library_loader import is_habana_avaialble - _HPU_AVAILABLE = is_habana_available() + _HPU_AVAILABLE = is_habana_avaialble() else: _HPU_AVAILABLE = False diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 063cd6f8ef649..d34c16e3de4bd 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -221,7 +221,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = model.half() trainer = Trainer( strategy=HPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) ), default_root_dir=tmpdir, fast_dev_run=True, diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 247a5b018ae26..bd5536fa6f544 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -96,6 +96,7 @@ def __new__( amp_apex: Require that NVIDIA/apex is installed. tpu: Require that TPU is available. ipu: Require that IPU is available. + hpu: Require that HPU is available. horovod: Require that Horovod is installed. horovod_nccl: Require that Horovod is installed with NCCL support. skip_windows: Skip for Windows platform. 
From 831a672efff9da828ac6e684e5d1745706157f5c Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 9 Mar 2022 15:29:46 +0200 Subject: [PATCH 041/167] Review comment :Make use of Boring model Update test to make use of boring model without subclassing Signed-off-by: Jerome --- tests/accelerators/test_hpu.py | 88 ++++++++-------------------------- 1 file changed, 19 insertions(+), 69 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index d34c16e3de4bd..9b48907a42fff 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -38,62 +38,6 @@ os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" -class HPUModel(BoringModel): - def training_step(self, batch, batch_idx): - output = self(batch) - loss = self.loss(batch, output) - return loss - - def validation_step(self, batch, batch_idx): - output = self(batch) - loss = self.loss(batch, output) - return loss - - def test_step(self, batch, batch_idx): - output = self(batch) - loss = self.loss(batch, output) - return loss - - def training_epoch_end(self, outputs) -> None: - pass - - def validation_epoch_end(self, outputs) -> None: - pass - - def test_epoch_end(self, outputs) -> None: - pass - - -class HPUClassificationModel(ClassificationModel): - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.cross_entropy(logits, y) - return loss - - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - acc = self.accuracy(logits, y) - return acc - - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - acc = self.accuracy(logits, y) - return acc - - def accuracy(self, logits, y): - acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) - return acc - - def validation_epoch_end(self, outputs) -> None: - self.log("val_acc", torch.stack(outputs).mean()) - - def test_epoch_end(self, outputs) -> None: - self.log("test_acc", torch.stack(outputs).mean()) - - @RunIf(hpu=True) def test_availability(): assert HPUAccelerator.is_available() @@ -123,7 +67,7 @@ def test_no_warning_plugin(tmpdir): @RunIf(hpu=True) def test_all_stages(tmpdir, hpus): - model = HPUModel() + model = BoringModel() parallel_devices = hpus hpustrat_1 = HPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) @@ -150,7 +94,7 @@ def test_optimization(tmpdir): seed_everything(42) dm = ClassifDataModule(length=1024) - model = HPUClassificationModel() + model = ClassificationModel() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="hpu", devices=1) @@ -174,7 +118,7 @@ def test_optimization(tmpdir): model_path = os.path.join(tmpdir, "model.pt") trainer.save_checkpoint(model_path) - model = HPUClassificationModel.load_from_checkpoint(model_path) + model = ClassificationModel.load_from_checkpoint(model_path) trainer = Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1) @@ -190,7 +134,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st assert trainer.strategy.model.precision == "bf16" raise SystemExit - model = HPUModel() + model = BoringModel() trainer = Trainer( strategy=HPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) @@ -217,7 +161,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert param.dtype == torch.float16 raise SystemExit - model = HPUModel() + model = BoringModel() model = model.half() trainer = Trainer( 
strategy=HPUStrategy( @@ -242,20 +186,26 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: def test_stages_correct(tmpdir): """Ensure all stages correctly are traced correctly by asserting the output for each stage.""" - class StageModel(HPUModel): + class StageModel(BoringModel): def training_step(self, batch, batch_idx): loss = super().training_step(batch, batch_idx) + loss = loss.get("loss") # tracing requires a loss value that depends on the model. # force it to be a value but ensure we use the loss. - return (loss - loss) + torch.tensor(1) + loss = (loss - loss) + torch.tensor(1) + return {"loss": loss} def validation_step(self, batch, batch_idx): loss = super().validation_step(batch, batch_idx) - return (loss - loss) + torch.tensor(2) + x = loss.get("x") + x = (x - x) + torch.tensor(2) + return {"x": x} def test_step(self, batch, batch_idx): - loss = super().validation_step(batch, batch_idx) - return (loss - loss) + torch.tensor(3) + loss = super().test_step(batch, batch_idx) + y = loss.get("y") + y = (y - y) + torch.tensor(3) + return {"y": y} def predict_step(self, batch, batch_idx, dataloader_idx=None): output = super().predict_step(batch, batch_idx) @@ -266,10 +216,10 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> N assert outputs["loss"].item() == 1 def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: - assert outputs.item() == 2 + assert outputs["x"].item() == 2 def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: - assert outputs.item() == 3 + assert outputs["y"].item() == 3 def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: assert torch.all(outputs == 4).item() @@ -367,7 +317,7 @@ def test_devices_auto_choice_hpu(): @RunIf(hpu=True) @pytest.mark.parametrize("hpus", [1]) def test_inference_only(tmpdir, hpus): - model = HPUModel() + model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=hpus) trainer.validate(model) From 328329e1796e5e2d5d7e2024b0fd8ebee9179c81 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 9 Mar 2022 18:58:01 +0200 Subject: [PATCH 042/167] Update stats example trainer params Signed-off-by: Jerome --- pytorch_lightning/callbacks/hpu_stats_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py index e1b2ed091ba6a..c0bac416a5032 100644 --- a/pytorch_lightning/callbacks/hpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/hpu_stats_monitor.py @@ -49,7 +49,7 @@ class HPUStatsMonitor(Callback): >>> from pytorch_lightning import Trainer >>> from pytorch_lightning.callbacks import HPUStatsMonitor >>> hpu_stats = HPUStatsMonitor() - >>> trainer = Trainer(hpus=1, callbacks=[hpu_stats]) + >>> trainer = Trainer(accelerator="hpu", callbacks=[hpu_stats]) you can also optionally provide save_dir and exp_name in HPUStatsMonitor. No need to provide logger in Trainer. 
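
To make the updated `HPUStatsMonitor` docstring example concrete, a short sketch of wiring the callback into a `Trainer` with an explicit log directory and experiment name follows. Note that the constructor keyword is `log_save_dir` (the docstring's Args section refers to it as `save_dir`), that the callback installs its own TensorBoardLogger so the `Trainer` needs no logger of its own, and that `devices=1` here is illustrative; a later patch in this series removes the callback again.

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import HPUStatsMonitor

    # The callback creates a TensorBoardLogger in on_init_end and assigns it to
    # trainer.logger, so no logger is passed to the Trainer explicitly.
    hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_logs", exp_name="mnist")
    trainer = Trainer(accelerator="hpu", devices=1, callbacks=[hpu_stats])
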
From c8e331ef23e9d121e04585831247fa10eacd1315 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 9 Mar 2022 20:02:54 +0200 Subject: [PATCH 043/167] Correct flake8 errors Signed-off-by: Jerome --- pl_examples/hpu_examples/simple_mnist/mnist.py | 3 +-- pytorch_lightning/callbacks/hpu_stats_monitor.py | 3 +-- pytorch_lightning/overrides/torch_distributed.py | 1 + pytorch_lightning/plugins/io/hpu_io_plugin.py | 4 +--- pytorch_lightning/strategies/ddp.py | 4 ++-- pytorch_lightning/strategies/hpu.py | 7 +------ pytorch_lightning/strategies/hpu_parallel.py | 5 ++--- pytorch_lightning/trainer/trainer.py | 2 +- pytorch_lightning/utilities/distributed.py | 3 +-- tests/accelerators/test_hpu.py | 6 ++---- 10 files changed, 13 insertions(+), 25 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 7759a2a18a336..c411e01e8f74d 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os -import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.core as htcore # noqa: F401 import torch from torch.nn import functional as F diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py index c0bac416a5032..4e32ef7671bbc 100644 --- a/pytorch_lightning/callbacks/hpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/hpu_stats_monitor.py @@ -28,13 +28,12 @@ Monitor and logs hpu stats during training. """ -from typing import Any, Dict, List, Optional, Tuple +from typing import Optional import torch import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import rank_zero_only class HPUStatsMonitor(Callback): diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 865ed00213f32..06b45908b565f 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -21,6 +21,7 @@ # The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py` # the distributed backend and tensor type updates for habana backend is done here before broadcast + # https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L256 def _rank_not_in_group(group: ProcessGroup): """Helper that checks if the current process's rank is not in a given group.""" diff --git a/pytorch_lightning/plugins/io/hpu_io_plugin.py b/pytorch_lightning/plugins/io/hpu_io_plugin.py index 225b67bd17359..214f54ed71af4 100644 --- a/pytorch_lightning/plugins/io/hpu_io_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_io_plugin.py @@ -17,12 +17,10 @@ import torch -import pytorch_lightning as pl from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO -from pytorch_lightning.utilities import _HPU_AVAILABLE, rank_zero_warn -from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities.imports import _HPU_AVAILABLE from pytorch_lightning.utilities.types import _PATH diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 0b78d3eaddc04..fad0603e8ab47 100644 --- 
a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -199,7 +199,7 @@ def pre_configure_ddp(self): if self.root_device.type == "hpu": self._static_graph = False static_graph = self._ddp_kwargs.get("static_graph") - if static_graph == True: + if static_graph: # when _set_static_graph() is called find_unused_parameters does not have any significance. # Resetting the value of find_unused_parameters to False which is the default value to DDP self._ddp_kwargs["find_unused_parameters"] = False @@ -274,7 +274,7 @@ def configure_ddp(self) -> None: log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") self.pre_configure_ddp() self.model = self._setup_model(LightningDistributedModule(self.model)) - if self.root_device.type == "hpu" and self._static_graph == True: + if self.root_device.type == "hpu" and self._static_graph: self._model._set_static_graph() self._register_ddp_hooks() diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 2614f02da5896..5505d84934274 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -12,20 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import Any, Dict, Optional +from typing import Dict, Optional import torch import pytorch_lightning as pl from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy -from pytorch_lightning.utilities import _HPU_AVAILABLE -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.types import _PATH class HPUStrategy(SingleDeviceStrategy): diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 9cc3d6a18c3b1..8847d720cd956 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from typing import Any, Dict, List, Optional, Union +from typing import Dict, List, Optional -import __main__ import torch import torch.distributed @@ -54,7 +53,7 @@ def __init__( def setup_environment(self) -> None: - import habana_frameworks.torch.core.hccl + import habana_frameworks.torch.core.hccl # noqa: F401 os.environ["ID"] = str(self.local_rank) os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5dbd9d3cadc3a..7f5e4baa97302 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -2045,7 +2045,7 @@ def num_gpus(self) -> int: return self._accelerator_connector.num_gpus @property - def hpus(self) -> int: + def num_hpus(self) -> int: return self._accelerator_connector.num_hpus @property diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index ec9a9955752ca..d345194aadb3a 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -18,8 +18,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -from torch.distributed import Backend, get_backend -from torch.nn import Module +from torch.distributed import get_backend from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 9b48907a42fff..8686670b8607f 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from argparse import ArgumentParser from typing import Optional import pytest import torch -import torch.nn.functional as F from pytorch_lightning import Callback, seed_everything, Trainer -from pytorch_lightning.accelerators import CPUAccelerator, HPUAccelerator +from pytorch_lightning.accelerators import HPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy @@ -33,7 +31,7 @@ from tests.helpers.simple_models import ClassificationModel if _HPU_AVAILABLE: - import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.core as htcore # noqa: F401 os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" From 9a71bdcf91d5759bb0716aad97b6a8a8dc7c0959 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 9 Mar 2022 20:31:39 +0200 Subject: [PATCH 044/167] Remove docstring examples Signed-off-by: Jerome --- pytorch_lightning/callbacks/hpu_stats_monitor.py | 10 ---------- pytorch_lightning/overrides/torch_distributed.py | 14 -------------- 2 files changed, 24 deletions(-) diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py index 4e32ef7671bbc..7ed9abeb2ced2 100644 --- a/pytorch_lightning/callbacks/hpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/hpu_stats_monitor.py @@ -42,16 +42,6 @@ class HPUStatsMonitor(Callback): Args: save_dir: directory to save the logs. exp_name: name of the experiment. - - Example:: - - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.callbacks import HPUStatsMonitor - >>> hpu_stats = HPUStatsMonitor() - >>> trainer = Trainer(accelerator="hpu", callbacks=[hpu_stats]) - - you can also optionally provide save_dir and exp_name in HPUStatsMonitor. 
- No need to provide logger in Trainer. """ def __init__(self, log_save_dir: str = "habana_ptl_logs", exp_name: str = "default"): diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 06b45908b565f..ca5b156dcbed9 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -86,20 +86,6 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. - - Example:: - >>> # Note: Process group initialization omitted on each rank. - >>> import torch.distributed as dist - >>> if dist.get_rank() == 0: - >>> # Assumes world_size of 3. - >>> objects = ["foo", 12, {1: 2}] # any picklable object - >>> else: - >>> objects = [None, None, None] - >>> # Assumes backend is not NCCL - >>> device = torch.device("cpu") - >>> dist.broadcast_object_list(objects, src=0, device=device) - >>> broadcast_objects - ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return From 8efed0bf519ac64371bf2909bb41668260cfc403 Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Thu, 3 Mar 2022 11:28:44 +0100 Subject: [PATCH 045/167] Update hpu-tests.yml --- .azure-pipelines/hpu-tests.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index aaec3caabd0f6..4e3be2c318f70 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -30,3 +30,30 @@ jobs: - bash: | hwinfo --short displayName: 'Instance HW info' + + - bash: | + python setup.py bdist_wheel + pip install -U ./dist/pytorch_lightning* + pip install . --requirement requirements/hpu.txt + displayName: 'Install dependencies' + continueOnError: true + + - bash: | + python ".azure-pipelines/run_hpu_tests.py" + displayName: 'HPU Tests in parallel' + continueOnError: true + + - bash: | + python -m coverage report + python -m coverage xml + python -m coverage html + python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=pytest,hpu --name="HPU Test Coverage" + displayName: 'Statistics' + continueOnError: true + + - task: PublishTestResults@2 + inputs: + testResultsFiles: 'hpu*_test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() + displayName: 'Publish test results' From 90409a281b1bac49f46adb5e41286aaed8f259b1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 7 Mar 2022 17:05:19 +0100 Subject: [PATCH 046/167] prune --- .azure-pipelines/hpu-tests.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index 4e3be2c318f70..86f74c7a738ff 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -32,8 +32,6 @@ jobs: displayName: 'Instance HW info' - bash: | - python setup.py bdist_wheel - pip install -U ./dist/pytorch_lightning* pip install . 
--requirement requirements/hpu.txt displayName: 'Install dependencies' continueOnError: true @@ -43,14 +41,6 @@ jobs: displayName: 'HPU Tests in parallel' continueOnError: true - - bash: | - python -m coverage report - python -m coverage xml - python -m coverage html - python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=pytest,hpu --name="HPU Test Coverage" - displayName: 'Statistics' - continueOnError: true - - task: PublishTestResults@2 inputs: testResultsFiles: 'hpu*_test-results.xml' From 5bbc6dc81a8e8b891f1199f2085af28f99894df2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 8 Mar 2022 16:34:11 +0100 Subject: [PATCH 047/167] Update hpu-tests.yml --- .azure-pipelines/hpu-tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index 86f74c7a738ff..f53a34d2eb4ba 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -34,12 +34,10 @@ jobs: - bash: | pip install . --requirement requirements/hpu.txt displayName: 'Install dependencies' - continueOnError: true - bash: | python ".azure-pipelines/run_hpu_tests.py" displayName: 'HPU Tests in parallel' - continueOnError: true - task: PublishTestResults@2 inputs: From 85f535b0a720070b9bff794c67a276271cd63a45 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 9 Mar 2022 13:43:13 +0100 Subject: [PATCH 048/167] Apply suggestions from code review --- .azure-pipelines/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index f53a34d2eb4ba..e663703de58da 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -32,7 +32,7 @@ jobs: displayName: 'Instance HW info' - bash: | - pip install . --requirement requirements/hpu.txt + pip install . --requirement requirements/test.txt displayName: 'Install dependencies' - bash: | From 75227d91611222ef0dccc04116dd14ed679db347 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 9 Mar 2022 13:54:12 +0100 Subject: [PATCH 049/167] hwinfo --- .azure-pipelines/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index e663703de58da..53e933471cf3f 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -28,6 +28,7 @@ jobs: steps: - bash: | + apt-get install hwinfo hwinfo --short displayName: 'Instance HW info' From 711bbf3a83f38b0563892aac3da29c4b5ee104dd Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 07:36:56 +0200 Subject: [PATCH 050/167] Override mypy warnings Signed-off-by: Jerome --- pytorch_lightning/callbacks/hpu_stats_monitor.py | 7 ++----- pytorch_lightning/overrides/torch_distributed.py | 6 ++++-- pytorch_lightning/plugins/precision/hpu_precision.py | 8 ++++---- pytorch_lightning/strategies/hpu.py | 6 +++--- pytorch_lightning/strategies/hpu_parallel.py | 2 +- .../trainer/connectors/accelerator_connector.py | 6 +++--- requirements.txt | 3 +++ 7 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py index 7ed9abeb2ced2..50edf9b31a492 100644 --- a/pytorch_lightning/callbacks/hpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/hpu_stats_monitor.py @@ -28,7 +28,6 @@ Monitor and logs hpu stats during training. 
""" -from typing import Optional import torch @@ -58,10 +57,8 @@ def on_init_end(self, trainer: "pl.Trainer") -> None: def on_before_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", loss: torch.Tensor) -> None: pl_module.log("Model_Loss", loss, on_step=True, on_epoch=True, enable_graph=False, logger=True) - def on_train_epoch_end( - self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", unused: Optional = None - ) -> None: - tensor_board = trainer.logger.experiment + def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + tensor_board = trainer.logger.experiment # type: ignore dict = vars(pl_module) modules = dict["_modules"] for module_name in modules: diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index ca5b156dcbed9..9571d962bc3e6 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -1,3 +1,5 @@ +# type: ignore + import io import logging import os @@ -34,7 +36,7 @@ def _rank_not_in_group(group: ProcessGroup): def _object_to_tensor(obj): f = io.BytesIO() _pickler(f).dump(obj) - byte_storage = torch.ByteStorage.from_buffer(f.getvalue()) # type: ignore[attr-defined] + byte_storage = torch.ByteStorage.from_buffer(f.getvalue()) # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype. # Otherwise, it will casue 100X slowdown. # See: https://github.com/pytorch/pytorch/issues/65696 @@ -140,7 +142,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): object_tensor = torch.cat(tensor_list) else: object_tensor = torch.empty( - torch.sum(object_sizes_tensor).int().item(), # type: ignore[arg-type] + torch.sum(object_sizes_tensor).int().item(), dtype=torch.uint8, ) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index ec814f9582437..bf32c78f9ebe8 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -40,10 +40,10 @@ def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) - from habana_frameworks.torch.hpex import hmp - hmp_opt_level = hmp_params["level"] - hmp_bf16 = hmp_params["bf16_ops"] - hmp_fp32 = hmp_params["fp32_ops"] - hmp_verbose = hmp_params["verbose"] + hmp_opt_level = hmp_params["level"] # type: ignore + hmp_bf16 = hmp_params["bf16_ops"] # type: ignore + hmp_fp32 = hmp_params["fp32_ops"] # type: ignore + hmp_verbose = hmp_params["verbose"] # type: ignore hmp.convert( opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose ) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 5505d84934274..3c6f30159eacc 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -58,11 +58,11 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: raise MisconfigurationException("HPUs currently support only one optimizer.") def model_to_device(self) -> None: - self.model.to(self.root_device) + self.model.to(self.root_device) # type: ignore def pre_dispatch(self) -> None: - if isinstance(self.device, int): - self.device = torch.device(self.device) + if isinstance(self.device, int): # type: ignore + self.device = torch.device(self.device) # type: ignore @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: diff --git 
a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 8847d720cd956..73f66e25d478c 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -60,7 +60,7 @@ def setup_environment(self) -> None: super().setup_environment() - def broadcast(self, obj: object, src: int = 0) -> object: + def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore obj = [obj] if self.global_rank != src: obj = [None] diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6324cc5cd0b5b..07c36f6410224 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -551,9 +551,9 @@ def _choose_strategy(self) -> Union[Strategy, str]: return IPUStrategy.strategy_name if self._accelerator_flag == "hpu": if self._parallel_devices and len(self._parallel_devices) > 1: - return HPUParallelStrategy(parallel_devices=self.parallel_devices) + return HPUParallelStrategy(parallel_devices=self.parallel_devices) # type: ignore else: - return HPUStrategy(device=torch.device("hpu")) + return HPUStrategy(device=torch.device("hpu")) # type: ignore if self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: return TPUSpawnStrategy.strategy_name @@ -652,7 +652,7 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if isinstance(self.accelerator, IPUAccelerator): return IPUPrecisionPlugin(self._precision_flag) # type: ignore if isinstance(self.accelerator, HPUAccelerator): - return HPUPrecisionPlugin(self._precision_flag) + return HPUPrecisionPlugin(self._precision_flag) # type: ignore if isinstance(self.accelerator, TPUAccelerator): if self._precision_flag == 32: return TPUPrecisionPlugin() diff --git a/requirements.txt b/requirements.txt index 6aa080fc7e8fb..d1a543132f088 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,6 @@ torchmetrics>=0.4.1 pyDeprecate>=0.3.1, <0.4.0 packaging>=17.0 typing-extensions>=4.0.0 +pytest>=7.0.1 +pytest-forked>=1.4.0 +sklearn From bc174f6a6f891ea07f22890674896a82cc13ef7c Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 09:21:31 +0200 Subject: [PATCH 051/167] Update test and requirements file Signed-off-by: Jerome --- requirements.txt | 3 --- requirements/test.txt | 2 ++ tests/accelerators/test_accelerator_connector.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index d1a543132f088..6aa080fc7e8fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,3 @@ torchmetrics>=0.4.1 pyDeprecate>=0.3.1, <0.4.0 packaging>=17.0 typing-extensions>=4.0.0 -pytest>=7.0.1 -pytest-forked>=1.4.0 -sklearn diff --git a/requirements/test.txt b/requirements/test.txt index c7d2860ebee61..25e0f0f55a1c7 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,6 +6,8 @@ twine==3.2 mypy>=0.920 flake8>=3.9.2 pre-commit>=1.0 +pytest-forked +sklearn # needed in tests cloudpickle>=1.3 diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 6fdc2ea8c9d02..05641380ba869 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -935,8 +935,9 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): Trainer(accelerator="ipu", precision=64) 
+@mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) def test_unsupported_hpu_choice(monkeypatch): - import pytorch_lightning.plugins.training_type.hpu as hpu + import pytorch_lightning.strategies.hpu as hpu import pytorch_lightning.utilities.imports as imports from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector From b28c0ce1d4d075e75a236f7f7f22e7b755fb3a2d Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 12:04:16 +0200 Subject: [PATCH 052/167] Remove hpu stats monitor and deprecated API's Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 4 -- pytorch_lightning/callbacks/__init__.py | 2 - .../callbacks/hpu_stats_monitor.py | 66 ------------------- pytorch_lightning/strategies/hpu.py | 4 -- .../connectors/accelerator_connector.py | 6 -- 5 files changed, 82 deletions(-) delete mode 100644 pytorch_lightning/callbacks/hpu_stats_monitor.py diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index c411e01e8f74d..76390d5eabeb7 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -20,7 +20,6 @@ import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule -from pytorch_lightning.callbacks import HPUStatsMonitor from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy @@ -105,8 +104,6 @@ def configure_optimizers(self): hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" - hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist") - parallel_devices = args.hpus hpustrat_1 = HPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) @@ -120,7 +117,6 @@ def configure_optimizers(self): trainer = pl.Trainer( strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, devices=parallel_devices, - callbacks=[hpu_stats], max_epochs=args.epochs, default_root_dir=os.getcwd(), accelerator="hpu", diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index 6cc4e765b70b6..f47bc115ece51 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -17,7 +17,6 @@ from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler -from pytorch_lightning.callbacks.hpu_stats_monitor import HPUStatsMonitor from pytorch_lightning.callbacks.lambda_function import LambdaCallback from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint @@ -38,7 +37,6 @@ "DeviceStatsMonitor", "EarlyStopping", "GPUStatsMonitor", - "HPUStatsMonitor", "XLAStatsMonitor", "GradientAccumulationScheduler", "LambdaCallback", diff --git a/pytorch_lightning/callbacks/hpu_stats_monitor.py b/pytorch_lightning/callbacks/hpu_stats_monitor.py deleted file mode 100644 index 50edf9b31a492..0000000000000 --- a/pytorch_lightning/callbacks/hpu_stats_monitor.py +++ /dev/null @@ 
-1,66 +0,0 @@ -# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# - -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -hpu Stats Monitor -================= - -Monitor and logs hpu stats during training. - -""" - -import torch - -import pytorch_lightning as pl -from pytorch_lightning.callbacks.base import Callback - - -class HPUStatsMonitor(Callback): - """Automatically monitors and logs hpu stats during training stage. - - Args: - save_dir: directory to save the logs. - exp_name: name of the experiment. - """ - - def __init__(self, log_save_dir: str = "habana_ptl_logs", exp_name: str = "default"): - super().__init__() - self.log_save_dir = log_save_dir - self.exp_name = exp_name - - def on_init_end(self, trainer: "pl.Trainer") -> None: - from pytorch_lightning import loggers as pl_logger - - self.tb_logger = pl_logger.TensorBoardLogger(save_dir=self.log_save_dir, name=self.exp_name) - trainer.logger = self.tb_logger - - def on_before_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", loss: torch.Tensor) -> None: - pl_module.log("Model_Loss", loss, on_step=True, on_epoch=True, enable_graph=False, logger=True) - - def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - tensor_board = trainer.logger.experiment # type: ignore - dict = vars(pl_module) - modules = dict["_modules"] - for module_name in modules: - tensor_board.add_histogram(module_name + ".weight", modules[module_name].weight, pl_module.current_epoch) - tensor_board.add_histogram(module_name + ".bias", modules[module_name].bias, pl_module.current_epoch) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 3c6f30159eacc..c4bc5875131dd 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -60,10 +60,6 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def model_to_device(self) -> None: self.model.to(self.root_device) # type: ignore - def pre_dispatch(self) -> None: - if isinstance(self.device, int): # type: ignore - self.device = torch.device(self.device) # type: ignore - @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 07c36f6410224..6c192332e14a5 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -842,12 +842,6 @@ def num_ipus(self) -> int: return self.devices return 0 - @property - def num_hpus(self) -> int: - if 
isinstance(self.accelerator, HPUAccelerator): - return self.devices - return 0 - @property def num_gpus(self) -> int: if isinstance(self.accelerator, GPUAccelerator): From 3c08bf5ad47cd53fca474cdd536f00798995193e Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 12:46:13 +0200 Subject: [PATCH 053/167] Update non-hpu tests Signed-off-by: Jerome --- pytorch_lightning/strategies/hpu.py | 2 -- tests/accelerators/test_hpu.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index c4bc5875131dd..34c2b7df99764 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -14,8 +14,6 @@ from typing import Dict, Optional -import torch - import pytorch_lightning as pl from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 8686670b8607f..e15e97fb58f88 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -13,6 +13,7 @@ # limitations under the License. import os from typing import Optional +from unittest import mock import pytest import torch @@ -42,6 +43,7 @@ def test_availability(): @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") +@mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) def test_fail_if_no_hpus(tmpdir): with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1) @@ -261,6 +263,7 @@ def test_accelerator_hpu_with_single_device(): assert isinstance(trainer.accelerator, HPUAccelerator) +@RunIf(hpu=True) def test_accelerator_hpu_with_multiple_devices(): trainer = Trainer(accelerator="hpu", devices=8) From f857721d6b10ac397a1d75f6678aa51cbc628485 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 10 Mar 2022 12:02:36 +0100 Subject: [PATCH 054/167] Add hpu-tests.yml and run_hpu_tests.py to support HPU Testing This commit 0cc5ef9c --- .azure-pipelines/run_hpu_tests.py | 137 ++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 .azure-pipelines/run_hpu_tests.py diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py new file mode 100644 index 0000000000000..55f2796a30d1b --- /dev/null +++ b/.azure-pipelines/run_hpu_tests.py @@ -0,0 +1,137 @@ +"""This file is called from the hpu-tests.yml pipeline. +The following script run the hpu tests in parallel. +Tests run are: +1. test_inference_only is run on four cards +2. test_all_stages on two cards +3. complete hpu tests using one card +4. complete hpu tests using eight cards. 
+""" +import itertools +import subprocess +import sys + +HPU_TESTS_DICTIONARY = { + "hpu1_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ + --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \ + --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \ + --forked \ + --junitxml=hpu1_test-results.xml", + "hpu2_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ + -k test_all_stages \ + --hpus 2 \ + --verbose \ + --capture=no \ + --forked \ + --junitxml=hpu2_test-results.xml", + "hpu4_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ + -k test_inference_only \ + --hpus 4 \ + --capture=no \ + --verbose \ + --forked \ + --junitxml=hpu4_test-results.xml", + "hpu8_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ + --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \ + --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \ + --forked \ + --hpus 8 \ + --junitxml=hpu8_test-results.xml", +} + +HPU1_TEST = HPU_TESTS_DICTIONARY["hpu1_test"] +HPU2_TEST = HPU_TESTS_DICTIONARY["hpu2_test"] +HPU4_TEST = HPU_TESTS_DICTIONARY["hpu4_test"] +HPU8_TEST = HPU_TESTS_DICTIONARY["hpu8_test"] + +PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST]] +TIMEOUT = 60 +TIMEOUT_EXIT_CODE = -9 + + +def run_hpu_tests_parallel(timeout=TIMEOUT): + """This function is called to run the HPU tests in parallel. + We run the tests in sub process to utilize all the eight cards available in the DL1 instance + Considering the max time taken to run the HPU tests as 60 seconds, we kill the process if the time taken exceeds. + Return of this function will be the list of exit status of the HPU tests that were run in the subprocess. + Here, the exit_status 0 means the test run is successful. exit_status 1 means the test run is failed. + Args: + timeout: The threshold time to run the HPU tests in parallel. + Exception is logged if the threshold timeout gets expired. + TIMEOUT_EXIT_CODE will be returned as -9 in case of timeout, 0 in case of success and 4 in case of a failure. + """ + exit_status = [] + with open("stdout_log.txt", "w") as stdout_log, open("error_log.txt", "w") as error_log: + for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION: + process_list = [ + subprocess.Popen( + each_hpu_test, shell=True, stdout=stdout_log, stderr=error_log, universal_newlines=True + ) + for each_hpu_test in hpu_tests + ] + for process in process_list: + try: + exit_status.append(process.wait(timeout=TIMEOUT)) + except subprocess.TimeoutExpired as e: + print(e) + print("Killing the process....") + process.kill() + exit_status.append(TIMEOUT_EXIT_CODE) + return exit_status + + +def zip_cmd_exitcode(exit_status): + """This function is called to zip the tests that were executed with the exit status of the test. + Return of this function will be list of hpu tests called and their exit status. + Args: + exit_status: The returned exit_status after executing run_hpu_tests_parallel(). + """ + status_list = [] + hpu_tests_called = [] + for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION: + hpu_tests_called.append(hpu_tests) + status_list = list(zip(list(itertools.chain(*hpu_tests_called)), exit_status)) + return status_list + + +def print_logs(filename): + """This function is called to read the file and print the logs. + Args: + filename: Provide the log filename that need to be print on the console. 
+ """ + with open(filename) as f: + print(f.read()) + + +def print_subprocess_logs_and_return_status(exit_status): + """This function is called to print the logs of subprocess stdout and stderror and return the status of test + execution. + Args: + exit_status: The returned exit_status after executing run_hpu_tests_parallel(). + Return of this function will be the return to main(). + Based on the exit status of the HPU tests, we return success or failure to the main method. + """ + if all(v == 0 for v in exit_status): + print("All HPU tests passed") + file_name = "stdout_log.txt" + print_logs(file_name) + return 0 + else: + print("HPU tests are failing") + print("Printing stdout_log.txt...") + file_name = "stdout_log.txt" + print_logs(file_name) + print("Printing error_log.txt...") + file_name = "error_log.txt" + print_logs(file_name) + return 1 + + +def main(): + exit_status = run_hpu_tests_parallel(timeout=TIMEOUT) + status_list = zip_cmd_exitcode(exit_status) + print("HPU Tests executed and their exit status:", status_list) + return print_subprocess_logs_and_return_status(exit_status) + + +if __name__ == "__main__": + sys.exit(main()) From 7cb34bcc5cce7add2c8a38a7ddc0a7d13c80ba1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 11:38:53 +0000 Subject: [PATCH 055/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure-pipelines/run_hpu_tests.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 55f2796a30d1b..cd8531103529e 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -1,4 +1,5 @@ """This file is called from the hpu-tests.yml pipeline. + The following script run the hpu tests in parallel. Tests run are: 1. test_inference_only is run on four cards @@ -50,6 +51,7 @@ def run_hpu_tests_parallel(timeout=TIMEOUT): """This function is called to run the HPU tests in parallel. + We run the tests in sub process to utilize all the eight cards available in the DL1 instance Considering the max time taken to run the HPU tests as 60 seconds, we kill the process if the time taken exceeds. Return of this function will be the list of exit status of the HPU tests that were run in the subprocess. @@ -81,6 +83,7 @@ def run_hpu_tests_parallel(timeout=TIMEOUT): def zip_cmd_exitcode(exit_status): """This function is called to zip the tests that were executed with the exit status of the test. + Return of this function will be list of hpu tests called and their exit status. Args: exit_status: The returned exit_status after executing run_hpu_tests_parallel(). @@ -95,6 +98,7 @@ def zip_cmd_exitcode(exit_status): def print_logs(filename): """This function is called to read the file and print the logs. + Args: filename: Provide the log filename that need to be print on the console. """ @@ -105,6 +109,7 @@ def print_logs(filename): def print_subprocess_logs_and_return_status(exit_status): """This function is called to print the logs of subprocess stdout and stderror and return the status of test execution. + Args: exit_status: The returned exit_status after executing run_hpu_tests_parallel(). Return of this function will be the return to main(). 
From f6baf69356560bba3710b2b9a3fadea1d9c417f5 Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 14:34:57 +0200 Subject: [PATCH 056/167] Add exception for non-hpu tests Signed-off-by: Jerome --- pytorch_lightning/plugins/precision/hpu_precision.py | 3 ++- pytorch_lightning/strategies/hpu.py | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index bf32c78f9ebe8..2c6446ca1da80 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -33,7 +33,8 @@ class HPUPrecisionPlugin(PrecisionPlugin): def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU precision plugin requires HPU support") + raise MisconfigurationException("HPU Accelerator requires HPU devices to run ." + "HPU precision plugin requires HPU support ") super().__init__() self.precision = precision if hmp_params is not None: diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 34c2b7df99764..a091437d8e91e 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -19,6 +19,7 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities import _HPU_AVAILABLE class HPUStrategy(SingleDeviceStrategy): @@ -40,6 +41,10 @@ def __init__( super().__init__( accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin ) + if not _HPU_AVAILABLE: + raise MisconfigurationException( + "HPU Accelerator requires HPU devices to run" + ) @property def is_distributed(self) -> bool: From 21fc9a43e26ad96384130f71f7f58c3d4b7ea4d2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 12:48:50 +0000 Subject: [PATCH 057/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/plugins/precision/hpu_precision.py | 5 +++-- pytorch_lightning/strategies/hpu.py | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index 2c6446ca1da80..490eea19d579d 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -33,8 +33,9 @@ class HPUPrecisionPlugin(PrecisionPlugin): def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU Accelerator requires HPU devices to run ." - "HPU precision plugin requires HPU support ") + raise MisconfigurationException( + "HPU Accelerator requires HPU devices to run ." 
"HPU precision plugin requires HPU support " + ) super().__init__() self.precision = precision if hmp_params is not None: diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index a091437d8e91e..6903ec159ff25 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -18,8 +18,8 @@ from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy -from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException class HPUStrategy(SingleDeviceStrategy): @@ -42,9 +42,7 @@ def __init__( accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin ) if not _HPU_AVAILABLE: - raise MisconfigurationException( - "HPU Accelerator requires HPU devices to run" - ) + raise MisconfigurationException("HPU Accelerator requires HPU devices to run") @property def is_distributed(self) -> bool: From 3665ffc06b6a5be08c2da1e8ba8412a22ced326e Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 15:41:44 +0200 Subject: [PATCH 058/167] Throw exception when accelerator is not present Signed-off-by: Jerome --- pytorch_lightning/strategies/hpu.py | 8 +++++--- .../trainer/connectors/accelerator_connector.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 6903ec159ff25..b177bd8c8bc58 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -20,6 +20,7 @@ from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.types import _DEVICE class HPUStrategy(SingleDeviceStrategy): @@ -29,20 +30,21 @@ class HPUStrategy(SingleDeviceStrategy): def __init__( self, - device: int, + device: _DEVICE = "hpu", accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, checkpoint_io: Optional[HPUCheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, hmp_params: Optional[str] = None, ): + if not _HPU_AVAILABLE: + raise MisconfigurationException("HPU Accelerator requires HPU devices to run") + device = device checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__( accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin ) - if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU Accelerator requires HPU devices to run") @property def is_distributed(self) -> bool: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6c192332e14a5..60216cbd86f7d 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -550,6 +550,9 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._accelerator_flag == "ipu": return IPUStrategy.strategy_name if self._accelerator_flag == "hpu": + if not _HPU_AVAILABLE: + raise MisconfigurationException("HPU Accelerator requires HPU devices to run") + if self._parallel_devices and len(self._parallel_devices) > 1: return 
HPUParallelStrategy(parallel_devices=self.parallel_devices) # type: ignore else: From e0b4611248a4432585dacd6e6dc255c1921f7cc8 Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 18:00:35 +0200 Subject: [PATCH 059/167] Resolve mypy and error message Signed-off-by: Jerome --- .../trainer/connectors/accelerator_connector.py | 2 +- tests/accelerators/test_accelerator_connector.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 60216cbd86f7d..b72caffddbe21 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -556,7 +556,7 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._parallel_devices and len(self._parallel_devices) > 1: return HPUParallelStrategy(parallel_devices=self.parallel_devices) # type: ignore else: - return HPUStrategy(device=torch.device("hpu")) # type: ignore + return HPUStrategy(device=torch.device("hpu")) if self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: return TPUSpawnStrategy.strategy_name diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 05641380ba869..fb94dd93e501d 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -943,8 +943,10 @@ def test_unsupported_hpu_choice(monkeypatch): monkeypatch.setattr(imports, "_HPU_AVAILABLE", True) monkeypatch.setattr(hpu, "_HPU_AVAILABLE", True) - monkeypatch.setattr(AcceleratorConnector, "has_hpu", True) - with pytest.raises(MisconfigurationException, match=r"accelerator='hpu', precision=64\)` is not supported"): + monkeypatch.setattr(AcceleratorConnector, "_HPU_AVAILABLE", True) + with pytest.raises( + MisconfigurationException, match=r"accelerator='hpu', precision=64\)` is not supported|HPU Accelerator requires HPU devices to run" + ): Trainer(accelerator="hpu", precision=64) From 545ab6a847f8ffab4454b7889eda49ca284d1bb8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 16:03:40 +0000 Subject: [PATCH 060/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_accelerator_connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index fb94dd93e501d..9a664743f7747 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -945,8 +945,9 @@ def test_unsupported_hpu_choice(monkeypatch): monkeypatch.setattr(hpu, "_HPU_AVAILABLE", True) monkeypatch.setattr(AcceleratorConnector, "_HPU_AVAILABLE", True) with pytest.raises( - MisconfigurationException, match=r"accelerator='hpu', precision=64\)` is not supported|HPU Accelerator requires HPU devices to run" - ): + MisconfigurationException, + match=r"accelerator='hpu', precision=64\)` is not supported|HPU Accelerator requires HPU devices to run", + ): Trainer(accelerator="hpu", precision=64) From 96ed1cde1a3298022b6fc61a4cf83713fa822f28 Mon Sep 17 00:00:00 2001 From: Jerome Date: Thu, 10 Mar 2022 18:37:58 +0200 Subject: [PATCH 061/167] Disable hpu pl examples on CPU Signed-off-by: Jerome --- 
.../hpu_examples/simple_mnist/mnist.py | 83 ++++++++++--------- pytorch_lightning/strategies/hpu.py | 5 ++ pytorch_lightning/strategies/hpu_parallel.py | 10 ++- 3 files changed, 58 insertions(+), 40 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 76390d5eabeb7..b03e2b7d955fc 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -14,7 +14,6 @@ import os -import habana_frameworks.torch.core as htcore # noqa: F401 import torch from torch.nn import functional as F @@ -23,6 +22,7 @@ from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu import HPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy +from pytorch_lightning.utilities.imports import _HPU_AVAILABLE def parse_args(): @@ -88,41 +88,46 @@ def configure_optimizers(self): if __name__ == "__main__": - args = parse_args() - - # Init our model - model = LitClassifier() - - # Init DataLoader from MNIST Dataset - dm = MNISTDataModule(batch_size=args.batch_size) - - # TBD: import these keys from hmp - hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] - hmp_params = dict.fromkeys(hmp_keys) - hmp_params["level"] = args.hmp_opt_level - hmp_params["verbose"] = args.hmp_verbose - hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" - hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" - - parallel_devices = args.hpus - hpustrat_1 = HPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) - ) - hpustrat_8 = HPUParallelStrategy( - parallel_devices=[torch.device("hpu")] * parallel_devices, - precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params), - ) - - # Initialize a trainer - trainer = pl.Trainer( - strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, - devices=parallel_devices, - max_epochs=args.epochs, - default_root_dir=os.getcwd(), - accelerator="hpu", - ) - - # Train the model ⚡ - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) - trainer.validate(model, datamodule=dm) + if _HPU_AVAILABLE: + + args = parse_args() + + # Init our model + model = LitClassifier() + + # Init DataLoader from MNIST Dataset + dm = MNISTDataModule(batch_size=args.batch_size) + + # TBD: import these keys from hmp + hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] + hmp_params = dict.fromkeys(hmp_keys) + hmp_params["level"] = args.hmp_opt_level + hmp_params["verbose"] = args.hmp_verbose + hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" + hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" + + parallel_devices = args.hpus + hpustrat_1 = HPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) + ) + hpustrat_8 = HPUParallelStrategy( + parallel_devices=[torch.device("hpu")] * parallel_devices, + precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params), + ) + + # Initialize a trainer + trainer = pl.Trainer( + strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, + devices=parallel_devices, + max_epochs=args.epochs, + default_root_dir=os.getcwd(), + accelerator="hpu", + ) + + # Train the model ⚡ + trainer.fit(model, datamodule=dm) + trainer.test(model, 
datamodule=dm) + trainer.validate(model, datamodule=dm) + + else: + print("This example is supported only on HPU !") diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index b177bd8c8bc58..9ecb29e6e8700 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -40,6 +40,11 @@ def __init__( if not _HPU_AVAILABLE: raise MisconfigurationException("HPU Accelerator requires HPU devices to run") + from habana_frameworks.torch.utils.library_loader import load_habana_module + load_habana_module() + import habana_frameworks.torch.core # noqa: F401 + import habana_frameworks.torch.core.hccl # noqa: F401 + device = device checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__( diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 73f66e25d478c..25d4f5b2ae680 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -25,6 +25,8 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.enums import _StrategyType +from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException class HPUParallelStrategy(DDPStrategy): @@ -53,7 +55,13 @@ def __init__( def setup_environment(self) -> None: - import habana_frameworks.torch.core.hccl # noqa: F401 + if not _HPU_AVAILABLE: + raise MisconfigurationException("HPU Accelerator requires HPU devices to run") + + from habana_frameworks.torch.utils.library_loader import load_habana_module + load_habana_module() + import habana_frameworks.torch.core # noqa: F401 + import habana_frameworks.torch.core.hccl # noqa: F401 os.environ["ID"] = str(self.local_rank) os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" From c44b017e7aeb85095d3f15de65f1354db8d1e5bf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 16:44:50 +0000 Subject: [PATCH 062/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/hpu.py | 5 +++-- pytorch_lightning/strategies/hpu_parallel.py | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 9ecb29e6e8700..6668fa29c78de 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -41,9 +41,10 @@ def __init__( raise MisconfigurationException("HPU Accelerator requires HPU devices to run") from habana_frameworks.torch.utils.library_loader import load_habana_module + load_habana_module() - import habana_frameworks.torch.core # noqa: F401 - import habana_frameworks.torch.core.hccl # noqa: F401 + import habana_frameworks.torch.core + import habana_frameworks.torch.core.hccl # noqa: F401 device = device checkpoint_io = checkpoint_io or HPUCheckpointIO() diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 25d4f5b2ae680..77949b2ddc3ae 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -23,9 +23,9 @@ from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy +from 
pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.enums import _StrategyType -from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -59,9 +59,10 @@ def setup_environment(self) -> None: raise MisconfigurationException("HPU Accelerator requires HPU devices to run") from habana_frameworks.torch.utils.library_loader import load_habana_module + load_habana_module() - import habana_frameworks.torch.core # noqa: F401 - import habana_frameworks.torch.core.hccl # noqa: F401 + import habana_frameworks.torch.core + import habana_frameworks.torch.core.hccl # noqa: F401 os.environ["ID"] = str(self.local_rank) os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" From 410875c653290806619ef34effcfbdb48f992d3f Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 14 Mar 2022 14:47:12 +0200 Subject: [PATCH 063/167] Address review comments - Change strategy naming Signed-off-by: Jerome --- .../hpu_examples/simple_mnist/mnist.py | 4 ++-- pytorch_lightning/strategies/__init__.py | 2 +- pytorch_lightning/strategies/ddp.py | 2 +- pytorch_lightning/strategies/hpu.py | 2 +- .../connectors/accelerator_connector.py | 10 ++++---- pytorch_lightning/utilities/argparse.py | 2 +- tests/accelerators/test_hpu.py | 24 +++++++++---------- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index b03e2b7d955fc..0c309bf28f912 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -20,7 +20,7 @@ import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu import HPUStrategy +from pytorch_lightning.strategies.hpu import SingleHPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.utilities.imports import _HPU_AVAILABLE @@ -107,7 +107,7 @@ def configure_optimizers(self): hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" parallel_devices = args.hpus - hpustrat_1 = HPUStrategy( + hpustrat_1 = SingleHPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) ) hpustrat_8 = HPUParallelStrategy( diff --git a/pytorch_lightning/strategies/__init__.py b/pytorch_lightning/strategies/__init__.py index 0d39cdf4d4695..1cc928e489e13 100644 --- a/pytorch_lightning/strategies/__init__.py +++ b/pytorch_lightning/strategies/__init__.py @@ -21,7 +21,7 @@ from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 -from pytorch_lightning.strategies.hpu import HPUStrategy # noqa: F401 +from pytorch_lightning.strategies.hpu import SingleHPUStrategy # noqa: F401 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index fad0603e8ab47..3636bf56cf293 100644 --- 
a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -279,7 +279,7 @@ def configure_ddp(self) -> None: self._register_ddp_hooks() def determine_ddp_device_ids(self): - if self.root_device.type == "cpu" or self.root_device.type == "hpu": + if self.root_device.type in ("cpu", "hpu"): return None return [self.root_device.index] diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 6668fa29c78de..7e43ff94760f2 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -23,7 +23,7 @@ from pytorch_lightning.utilities.types import _DEVICE -class HPUStrategy(SingleDeviceStrategy): +class SingleHPUStrategy(SingleDeviceStrategy): """Strategy for training on HPU devices.""" strategy_name = "hpu_single" diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index b72caffddbe21..8cc2976e510df 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -59,7 +59,7 @@ DeepSpeedStrategy, HorovodStrategy, HPUParallelStrategy, - HPUStrategy, + SingleHPUStrategy, IPUStrategy, ParallelStrategy, SingleDeviceStrategy, @@ -467,7 +467,7 @@ def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: self._accelerator_flag = "ipu" def _set_accelerator_if_hpu_strategy_is_passed(self) -> None: - if isinstance(self._strategy_flag, HPUStrategy): + if isinstance(self._strategy_flag, SingleHPUStrategy): self._accelerator_flag = "hpu" def _choose_accelerator(self) -> str: @@ -556,7 +556,7 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._parallel_devices and len(self._parallel_devices) > 1: return HPUParallelStrategy(parallel_devices=self.parallel_devices) # type: ignore else: - return HPUStrategy(device=torch.device("hpu")) + return SingleHPUStrategy(device=torch.device("hpu")) if self._accelerator_flag == "tpu": if self._parallel_devices and len(self._parallel_devices) > 1: return TPUSpawnStrategy.strategy_name @@ -791,10 +791,10 @@ def _lazy_init_strategy(self) -> None: ) if isinstance(self.accelerator, HPUAccelerator) and not isinstance( - self.strategy, (HPUStrategy, HPUParallelStrategy) + self.strategy, (SingleHPUStrategy, HPUParallelStrategy) ): raise ValueError( - "The `TPUAccelerator` can only be used with a `HPUStrategy` or `HPUParallelStrategy`," + "The `HPUAccelerator` can only be used with a `SingleHPUStrategy` or `HPUParallelStrategy`," f" found {self.strategy}." 
) diff --git a/pytorch_lightning/utilities/argparse.py b/pytorch_lightning/utilities/argparse.py index 147e7c3458273..57cb6b5315e88 100644 --- a/pytorch_lightning/utilities/argparse.py +++ b/pytorch_lightning/utilities/argparse.py @@ -250,7 +250,7 @@ def add_argparse_args( else: use_type = arg_types[0] - if arg == "gpus" or arg == "tpu_cores" or arg == "hpus": + if arg in ("gpus", "tpu_cores", "hpus"): use_type = _gpus_allowed_type # hack for types in (int, float) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index e15e97fb58f88..cc1b892935fa1 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -22,7 +22,7 @@ from pytorch_lightning.accelerators import HPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu import HPUStrategy +from pytorch_lightning.strategies.hpu import SingleHPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,7 +61,7 @@ def test_accelerator_selected(tmpdir): @RunIf(hpu=True) def test_no_warning_plugin(tmpdir): with pytest.warns(None) as record: - Trainer(default_root_dir=tmpdir, max_epochs=1, strategy=HPUStrategy(device=torch.device("hpu"))) + Trainer(default_root_dir=tmpdir, max_epochs=1, strategy=SingleHPUStrategy(device=torch.device("hpu"))) assert len(record) == 0 @@ -69,7 +69,7 @@ def test_no_warning_plugin(tmpdir): def test_all_stages(tmpdir, hpus): model = BoringModel() parallel_devices = hpus - hpustrat_1 = HPUStrategy( + hpustrat_1 = SingleHPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) ) hpustrat_8 = HPUParallelStrategy( @@ -136,7 +136,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st model = BoringModel() trainer = Trainer( - strategy=HPUStrategy( + strategy=SingleHPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) ), default_root_dir=tmpdir, @@ -145,7 +145,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st devices=1, callbacks=TestCallback(), ) - assert isinstance(trainer.strategy, HPUStrategy) + assert isinstance(trainer.strategy, SingleHPUStrategy) assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == "bf16" with pytest.raises(SystemExit): @@ -164,7 +164,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: model = BoringModel() model = model.half() trainer = Trainer( - strategy=HPUStrategy( + strategy=SingleHPUStrategy( device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) ), default_root_dir=tmpdir, @@ -174,7 +174,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: callbacks=TestCallback(), ) - assert isinstance(trainer.strategy, HPUStrategy) + assert isinstance(trainer.strategy, SingleHPUStrategy) assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == 16 @@ -259,7 +259,7 @@ def test_accelerator_hpu_with_single_device(): trainer = Trainer(accelerator="hpu", devices=1) - assert isinstance(trainer.strategy, HPUStrategy) + assert isinstance(trainer.strategy, 
SingleHPUStrategy) assert isinstance(trainer.accelerator, HPUAccelerator) @@ -289,8 +289,8 @@ def test_set_devices_if_none_hpu(): @RunIf(hpu=True) def test_strategy_choice_hpu_plugin(tmpdir): - trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) - assert isinstance(trainer.strategy, HPUStrategy) + trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) + assert isinstance(trainer.strategy, SingleHPUStrategy) @RunIf(hpu=True) @@ -304,8 +304,8 @@ def test_strategy_choice_hpu_parallel_plugin(tmpdir): @RunIf(hpu=True) def test_device_type_when_training_plugin_hpu_passed(tmpdir): - trainer = Trainer(strategy=HPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) - assert isinstance(trainer.strategy, HPUStrategy) + trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) + assert isinstance(trainer.strategy, SingleHPUStrategy) assert isinstance(trainer.accelerator, HPUAccelerator) From 8efe56f8867786a3d50031b26a78993aabbcfb54 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Mar 2022 12:50:38 +0000 Subject: [PATCH 064/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 8cc2976e510df..8ff40d3a4b263 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -59,10 +59,10 @@ DeepSpeedStrategy, HorovodStrategy, HPUParallelStrategy, - SingleHPUStrategy, IPUStrategy, ParallelStrategy, SingleDeviceStrategy, + SingleHPUStrategy, SingleTPUStrategy, Strategy, StrategyRegistry, From 073b1700e10caf2db987790f7824f9f093397bce Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 15 Mar 2022 06:03:15 +0200 Subject: [PATCH 065/167] Add documentation for habana gaudi accelerator (HPU) Signed-off-by: Jerome --- docs/source/accelerators/hpu.rst | 219 +++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 docs/source/accelerators/hpu.rst diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst new file mode 100644 index 0000000000000..3d47b0a54369c --- /dev/null +++ b/docs/source/accelerators/hpu.rst @@ -0,0 +1,219 @@ +.. _hpu: + +Habana Gaudi AI Processor +========================= + +Habana® Gaudi® AI training processors have been architected from the ground up and optimized for deep learning training efficiency. +Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. + +You can use either the Gaudi-based AWS EC2 DL1 instances `` or the Supermicro X12 Gaudi server `< https://www.supermicro.com/en/solutions/habana-gaudi>` + +Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU). +With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. Lightning supports running on HPUs. +For more information, check out `` and ``_. 
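+
+As a quick start, the Habana backend can be selected directly through the ``Trainer`` flags.
+The snippet below is only a minimal sketch (it assumes the HPU accelerator support described in
+the sections that follow is installed); ``model`` stands in for any ``LightningModule``.
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+
+    # accelerator="hpu" with devices=1 enables the Habana backend and lets the
+    # accelerator connector pick the default single-HPU strategy.
+    trainer = pl.Trainer(accelerator="hpu", devices=1)
+
+    # `model` is assumed to be a LightningModule defined elsewhere.
+    trainer.fit(model)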
+ +PyTorch Lightning With Gaudi HPU +================================ + +Lightning supports training on a single HPU device or 8 HPU devices with the following plugins + + +.. _hpu_accelerator: + +HPU accelerator +--------------- + +The :code:`devices=1` with :code:`accelerator="hpu"` parameters in the trainer class enables the Habana backend. + + +.. _single_device_strategy: + +Training on Single HPU +---------------------- + +The :code:`devices=1` and :code:`accelerator="hpu"` with :code:`strategy=SingleHPUStrategy(device=torch.device("hpu"))` parameter in the trainer class enables the Habana backend for single Gaudi training. + + +.. _parallel_device_strategy: + +Distributed Training +--------------------- + + +The :code:`devices=8` and :code:`accelerator="hpu"` with :code:`strategy=HPUParallelStrategy( parallel_devices=[torch.device("hpu")] * devices)` parameter in the trainer class enables the Habana backend for distributed training with 8 Gaudis. + +The Habana parallel device strategy is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. + + +.. _mixed_precision_plugin: + +Mixed Precision Plugin +---------------------- + +The :code:`precision=16` and a :code:`hmp_params` parameter in the trainer class enables the Habana plugin for mixed precision using the Habana Mixed Precision (HMP) package. + +You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. +The default settings enable users to easily enable mixed precision training with minimal code. + +In addition to the default settings in HMP, users also have the option of overriding these defaults and providing your own BF16 and FP32 operator lists + +For more details, please refer ``_. + + +.. _pytorch_lightning_examples: + +Getting Started with Lightning on Gaudi +======================================= + +This section describes how to train models using Habana PyTorch with Gaudi. + +More Lightning HPU examples can be found in pl_examples (``) + +Enabling Lightning with Single Gaudi HPU +---------------------------------------- + +The below snippet shows an example model using MNIST with single Habana Gaudi. + +.. testcode:: + import habana_frameworks.torch.core as htcore + + class LitClassifier(pl.LightningModule): + + def __init__(self): + super(LitClassifier, self).__init__() + + ... + + + # Init our model + model = LitClassifier() + + # Init DataLoader from MNIST Dataset + dm = MNISTDataModule(batch_size=batch_size) + + ... + + num_hpus = 1 + + # enable HPU strategy for single device, with mixed precision using default HMP settings + hpustrat_1 = SingleHPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16) + ) + + # Initialize a trainer with 1 HPU accelerator + trainer = pl.Trainer( + accelerator="hpu", + devices=num_hpus, + strategy=hpustrat_1, + ... + ) + + # Train the model ⚡ + trainer.fit(model, datamodule=dm) + + +Enabling Lightning with 8 Gaudi HPUs (distributed) +-------------------------------------------------- + +The below snippet shows an example model using MNIST with 8 Habana Gaudis. + +.. testcode:: + import habana_frameworks.torch.core as htcore + + class LitClassifier(pl.LightningModule): + + def __init__(self): + super(LitClassifier, self).__init__() + + ... 
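+        # (The classifier body is elided in this snippet; see
+        # pl_examples/hpu_examples/simple_mnist/mnist.py in this patch series
+        # for a complete LitClassifier.)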
+ + + # Init our model + model = LitClassifier() + + # Init DataLoader from MNIST Dataset + dm = MNISTDataModule(batch_size=batch_size) + + ... + + num_hpus = 8 + + # setup parallel strategy for 8 HPU's + hpustrat_8 = HPUParallelStrategy( + parallel_devices=[torch.device("hpu")] * num_hpus, + precision_plugin=HPUPrecisionPlugin(precision=16), + ) + + # Initialize a trainer with 1 HPU accelerator + trainer = pl.Trainer( + accelerator="hpu", + devices=num_hpus, + strategy=hpustrat_8, + ... + ) + + # Train the model ⚡ + trainer.fit(model, datamodule=dm) + + +Enabling Mixed Precision Options +-------------------------------- + +The below snippet shows an example model using MNIST with single Habana Gaudi and making use of HMP by overriding the default parameters. +This enables advanced users to provide their own bf16 and fp32 operator list instead of using the HMP defaults. + +.. testcode:: + import habana_frameworks.torch.core as htcore + + class LitClassifier(pl.LightningModule): + + def __init__(self): + super(LitClassifier, self).__init__() + + ... + + + # Init our model + model = LitClassifier() + + # Init DataLoader from MNIST Dataset + dm = MNISTDataModule(batch_size=batch_size) + + ... + + num_hpus = 1 + + # Optional Habana mixed precision params to be set + hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] + hmp_params = dict.fromkeys(hmp_keys) + hmp_params["level"] = "O1" + hmp_params["verbose"] = False + hmp_params["bf16_ops"] = "ops_bf16_mnist.txt" + hmp_params["fp32_ops"] = "ops_fp32_mnist.txt" + + # enable HPU strategy for single device, with mixed precision using overidden HMP settings + hpustrat_1 = SingleHPUStrategy( + device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) + ) + + # Initialize a trainer with 1 HPU accelerator + trainer = pl.Trainer( + accelerator="hpu", + devices=num_hpus, + strategy=hpustrat_1, + ... + ) + + # Train the model ⚡ + trainer.fit(model, datamodule=dm) + + +.. _known-limitations: + +Known limitations +----------------- + +* Habana dataloader is not supported +* Device stats monitoring is not supported + + From 7bdcaf6e83b1cca4a3586b5fb538a77b826342d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Mar 2022 04:05:34 +0000 Subject: [PATCH 066/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/accelerators/hpu.rst | 51 ++++++++++---------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 3d47b0a54369c..57a8fdd62862e 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -77,12 +77,12 @@ The below snippet shows an example model using MNIST with single Habana Gaudi. .. testcode:: import habana_frameworks.torch.core as htcore - class LitClassifier(pl.LightningModule): - def __init__(self): - super(LitClassifier, self).__init__() + class LitClassifier(pl.LightningModule): + def __init__(self): + super(LitClassifier, self).__init__() - ... + ... # Init our model @@ -96,17 +96,10 @@ The below snippet shows an example model using MNIST with single Habana Gaudi. 
num_hpus = 1 # enable HPU strategy for single device, with mixed precision using default HMP settings - hpustrat_1 = SingleHPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16) - ) + hpustrat_1 = SingleHPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16)) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer( - accelerator="hpu", - devices=num_hpus, - strategy=hpustrat_1, - ... - ) + trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1, ...) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -120,12 +113,12 @@ The below snippet shows an example model using MNIST with 8 Habana Gaudis. .. testcode:: import habana_frameworks.torch.core as htcore - class LitClassifier(pl.LightningModule): - def __init__(self): - super(LitClassifier, self).__init__() + class LitClassifier(pl.LightningModule): + def __init__(self): + super(LitClassifier, self).__init__() - ... + ... # Init our model @@ -145,12 +138,7 @@ The below snippet shows an example model using MNIST with 8 Habana Gaudis. ) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer( - accelerator="hpu", - devices=num_hpus, - strategy=hpustrat_8, - ... - ) + trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_8, ...) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -165,12 +153,12 @@ This enables advanced users to provide their own bf16 and fp32 operator list ins .. testcode:: import habana_frameworks.torch.core as htcore - class LitClassifier(pl.LightningModule): - def __init__(self): - super(LitClassifier, self).__init__() + class LitClassifier(pl.LightningModule): + def __init__(self): + super(LitClassifier, self).__init__() - ... + ... # Init our model @@ -197,12 +185,7 @@ This enables advanced users to provide their own bf16 and fp32 operator list ins ) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer( - accelerator="hpu", - devices=num_hpus, - strategy=hpustrat_1, - ... - ) + trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1, ...) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -215,5 +198,3 @@ Known limitations * Habana dataloader is not supported * Device stats monitoring is not supported - - From da1037a958b76532a8fa6d3064933f031f5243cb Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 15 Mar 2022 06:55:51 +0200 Subject: [PATCH 067/167] Update test code syntax Signed-off-by: Jerome --- docs/source/accelerators/hpu.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 57a8fdd62862e..82ad812a9b57d 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -74,7 +74,8 @@ Enabling Lightning with Single Gaudi HPU The below snippet shows an example model using MNIST with single Habana Gaudi. -.. testcode:: +.. code-block:: python + import habana_frameworks.torch.core as htcore @@ -99,7 +100,7 @@ The below snippet shows an example model using MNIST with single Habana Gaudi. hpustrat_1 = SingleHPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16)) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1, ...) 
+ trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -110,7 +111,8 @@ Enabling Lightning with 8 Gaudi HPUs (distributed) The below snippet shows an example model using MNIST with 8 Habana Gaudis. -.. testcode:: +.. code-block:: python + import habana_frameworks.torch.core as htcore @@ -138,7 +140,7 @@ The below snippet shows an example model using MNIST with 8 Habana Gaudis. ) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_8, ...) + trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_8) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -150,7 +152,8 @@ Enabling Mixed Precision Options The below snippet shows an example model using MNIST with single Habana Gaudi and making use of HMP by overriding the default parameters. This enables advanced users to provide their own bf16 and fp32 operator list instead of using the HMP defaults. -.. testcode:: +.. code-block:: python + import habana_frameworks.torch.core as htcore @@ -185,7 +188,7 @@ This enables advanced users to provide their own bf16 and fp32 operator list ins ) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1, ...) + trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1) # Train the model ⚡ trainer.fit(model, datamodule=dm) From 5e7af01d5989d7b600713cb749c738fbf814d002 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 15 Mar 2022 10:10:46 +0200 Subject: [PATCH 068/167] Mitigate duplicate label error Signed-off-by: Jerome --- docs/source/accelerators/hpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 82ad812a9b57d..6e1061574fe48 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -194,7 +194,7 @@ This enables advanced users to provide their own bf16 and fp32 operator list ins trainer.fit(model, datamodule=dm) -.. _known-limitations: +.. _known-limitations_hpu: Known limitations ----------------- From 70d69939f83490889e59e78188c1faa0d148e2c5 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 16 Mar 2022 04:34:02 +0200 Subject: [PATCH 069/167] Add hpu to toctree Signed-off-by: Jerome --- docs/source/accelerators/hpu.rst | 21 +++++++++++---------- docs/source/index.rst | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 6e1061574fe48..c3faf5d2f8f6e 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -9,13 +9,14 @@ Gaudi offers substantial price/performance advantage -- so you get to do more de You can use either the Gaudi-based AWS EC2 DL1 instances `` or the Supermicro X12 Gaudi server `< https://www.supermicro.com/en/solutions/habana-gaudi>` Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU). -With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. Lightning supports running on HPUs. +With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. + For more information, check out `` and ``_. 
PyTorch Lightning With Gaudi HPU ================================ -Lightning supports training on a single HPU device or 8 HPU devices with the following plugins +Lightning supports training on a single HPU device or 8 HPU devices with the plugins described in the following sections .. _hpu_accelerator: @@ -55,9 +56,9 @@ The :code:`precision=16` and a :code:`hmp_params` parameter in the trainer class You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. The default settings enable users to easily enable mixed precision training with minimal code. -In addition to the default settings in HMP, users also have the option of overriding these defaults and providing your own BF16 and FP32 operator lists +In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists. -For more details, please refer ``_. +For more details, please refer ``_. .. _pytorch_lightning_examples: @@ -65,14 +66,14 @@ For more details, please refer ``) Enabling Lightning with Single Gaudi HPU ---------------------------------------- -The below snippet shows an example model using MNIST with single Habana Gaudi. +The below snippet shows an example model using MNIST with single Habana Gaudi device: .. code-block:: python @@ -109,7 +110,7 @@ The below snippet shows an example model using MNIST with single Habana Gaudi. Enabling Lightning with 8 Gaudi HPUs (distributed) -------------------------------------------------- -The below snippet shows an example model using MNIST with 8 Habana Gaudis. +The below snippet shows an example model using MNIST with 8 Habana Gaudi devices: .. code-block:: python @@ -150,7 +151,7 @@ Enabling Mixed Precision Options -------------------------------- The below snippet shows an example model using MNIST with single Habana Gaudi and making use of HMP by overriding the default parameters. -This enables advanced users to provide their own bf16 and fp32 operator list instead of using the HMP defaults. +This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults. .. code-block:: python @@ -199,5 +200,5 @@ This enables advanced users to provide their own bf16 and fp32 operator list ins Known limitations ----------------- -* Habana dataloader is not supported -* Device stats monitoring is not supported +* Habana dataloader is not supported. +* Device stats monitoring is not supported. diff --git a/docs/source/index.rst b/docs/source/index.rst index b3e080fbcc2fd..2b15b96874e16 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -88,6 +88,7 @@ Welcome to PyTorch Lightning accelerators/gpu accelerators/tpu accelerators/ipu + accelerators/hpu .. 
toctree:: :maxdepth: 1 From 5061d712278275cd33c98412880182cdc7feeb04 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 16 Mar 2022 15:07:51 +0400 Subject: [PATCH 070/167] Update pytorch_lightning/plugins/precision/hpu_precision.py --- pytorch_lightning/plugins/precision/hpu_precision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index 490eea19d579d..bdce48f7eb156 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -34,7 +34,7 @@ class HPUPrecisionPlugin(PrecisionPlugin): def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: if not _HPU_AVAILABLE: raise MisconfigurationException( - "HPU Accelerator requires HPU devices to run ." "HPU precision plugin requires HPU support " + "HPU precision plugin requires HPU support." ) super().__init__() self.precision = precision From f6c36cea39fb1600c76ed3f6d2ca6c7f06c421ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Mar 2022 11:09:08 +0000 Subject: [PATCH 071/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/plugins/precision/hpu_precision.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu_precision.py index bdce48f7eb156..565a90fb7c8ad 100644 --- a/pytorch_lightning/plugins/precision/hpu_precision.py +++ b/pytorch_lightning/plugins/precision/hpu_precision.py @@ -33,9 +33,7 @@ class HPUPrecisionPlugin(PrecisionPlugin): def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: if not _HPU_AVAILABLE: - raise MisconfigurationException( - "HPU precision plugin requires HPU support." 
- ) + raise MisconfigurationException("HPU precision plugin requires HPU support.") super().__init__() self.precision = precision if hmp_params is not None: From 798f137e368571a86eadb25e5eb809b74195669e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 16 Mar 2022 16:45:27 +0400 Subject: [PATCH 072/167] Update _broadvast_object_list --- pytorch_lightning/overrides/torch_distributed.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 9571d962bc3e6..71624076cd89e 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -146,9 +146,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): dtype=torch.uint8, ) - if is_nccl_backend: - object_tensor = object_tensor.to(current_device) - elif is_hpu_backend: + if is_nccl_backend or is_hpu_backend: object_tensor = object_tensor.to(current_device) broadcast(object_tensor, src=src, group=group) From 5e098cb3371ce5e45d173d6a0068c58bc633b65f Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 16 Mar 2022 17:19:06 +0400 Subject: [PATCH 073/167] Update broadcast for HPUParallelStrategy --- pytorch_lightning/strategies/hpu_parallel.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 77949b2ddc3ae..64f3ccc4fa0d1 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -61,7 +61,6 @@ def setup_environment(self) -> None: from habana_frameworks.torch.utils.library_loader import load_habana_module load_habana_module() - import habana_frameworks.torch.core import habana_frameworks.torch.core.hccl # noqa: F401 os.environ["ID"] = str(self.local_rank) @@ -73,11 +72,8 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore obj = [obj] if self.global_rank != src: obj = [None] - if self.root_device.type == "hpu": - broadcast_object_list(obj, src, group=_group.WORLD) - else: - torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) - + + broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] @classmethod From 093056cfbd09f56aea300a26bc806b6df89130dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Mar 2022 13:21:03 +0000 Subject: [PATCH 074/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/hpu_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 64f3ccc4fa0d1..643e826aa9339 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -72,7 +72,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore obj = [obj] if self.global_rank != src: obj = [None] - + broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] From 056331060ff7174fe793d8c7280312368666e9ce Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Mar 2022 10:00:28 +0400 Subject: [PATCH 075/167] Update reference links --- pytorch_lightning/overrides/torch_distributed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/overrides/torch_distributed.py 
b/pytorch_lightning/overrides/torch_distributed.py index 71624076cd89e..c2e0924abb9bf 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -24,7 +24,7 @@ # the distributed backend and tensor type updates for habana backend is done here before broadcast -# https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L256 +# https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L256 def _rank_not_in_group(group: ProcessGroup): """Helper that checks if the current process's rank is not in a given group.""" if group is None: @@ -32,7 +32,7 @@ def _rank_not_in_group(group: ProcessGroup): return group == GroupMember.NON_GROUP_MEMBER -# Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1518 +# Taken from https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L1518 def _object_to_tensor(obj): f = io.BytesIO() _pickler(f).dump(obj) @@ -45,13 +45,13 @@ def _object_to_tensor(obj): return byte_tensor, local_size -# Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1530 +# Taken from https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L1530 def _tensor_to_object(tensor, tensor_size): buf = tensor.numpy().tobytes()[:tensor_size] return _unpickler(io.BytesIO(buf)).load() -# Taken from https://github.com/pytorch/pytorch/blob/1.10.1/torch/distributed/distributed_c10d.py#L1729 +# Taken from https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L1729 def _broadcast_object_list(object_list, src=0, group=None, device=None): """Broadcasts picklable objects in ``object_list`` to the whole group. Similar to :func:`broadcast`, but Python objects can be passed in. 
Note that all objects in ``object_list`` must be picklable in order to be From 65886ba123f01b30f98cf8fe1676fb4788de994e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Mar 2022 10:34:10 +0400 Subject: [PATCH 076/167] Update Strategies --- pytorch_lightning/strategies/ddp.py | 4 +--- pytorch_lightning/strategies/hpu.py | 2 +- pytorch_lightning/strategies/hpu_parallel.py | 24 ++++++++++++++++---- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 3636bf56cf293..8b499055227af 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -274,12 +274,10 @@ def configure_ddp(self) -> None: log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") self.pre_configure_ddp() self.model = self._setup_model(LightningDistributedModule(self.model)) - if self.root_device.type == "hpu" and self._static_graph: - self._model._set_static_graph() self._register_ddp_hooks() def determine_ddp_device_ids(self): - if self.root_device.type in ("cpu", "hpu"): + if self.root_device.type == "cpu": return None return [self.root_device.index] diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/hpu.py index 7e43ff94760f2..df1c048886dc6 100644 --- a/pytorch_lightning/strategies/hpu.py +++ b/pytorch_lightning/strategies/hpu.py @@ -38,7 +38,7 @@ def __init__( ): if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU Accelerator requires HPU devices to run") + raise MisconfigurationException("`SingleHPUStrategy` requires HPU devices to run") from habana_frameworks.torch.utils.library_loader import load_habana_module diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 643e826aa9339..115f6df284fee 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from typing import Dict, List, Optional @@ -18,6 +19,7 @@ import torch.distributed import pytorch_lightning as pl +from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO @@ -25,9 +27,10 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group -from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException +log = logging.getLogger(__name__) + class HPUParallelStrategy(DDPStrategy): """Plugin for multi-process single-device training on one or multiple nodes. @@ -36,7 +39,6 @@ class HPUParallelStrategy(DDPStrategy): devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. 
""" - distributed_backend = _StrategyType.HPU_PARALLEL strategy_name = "hpu_parallel" def __init__( @@ -46,6 +48,10 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ) -> None: + + if not _HPU_AVAILABLE: + raise MisconfigurationException("`HPUParallelStrategy` requires HPU devices to run") + super().__init__( accelerator=accelerator, parallel_devices=parallel_devices, @@ -55,9 +61,6 @@ def __init__( def setup_environment(self) -> None: - if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU Accelerator requires HPU devices to run") - from habana_frameworks.torch.utils.library_loader import load_habana_module load_habana_module() @@ -68,6 +71,17 @@ def setup_environment(self) -> None: super().setup_environment() + def determine_ddp_device_ids(self) -> None: + return None + + def configure_ddp(self) -> None: + log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") + self.pre_configure_ddp() + self.model = self._setup_model(LightningDistributedModule(self.model)) + if self.root_device.type == "hpu" and self._static_graph: + self._model._set_static_graph() + self._register_ddp_hooks() + def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore obj = [obj] if self.global_rank != src: From d837ef35afc39e4f63fb7107a8f7fad551d7d9ae Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Mar 2022 12:13:48 +0400 Subject: [PATCH 077/167] Address reviews --- pl_examples/hpu_examples/simple_mnist/mnist.py | 2 +- pytorch_lightning/plugins/__init__.py | 2 +- .../plugins/precision/{hpu_precision.py => hpu.py} | 0 pytorch_lightning/strategies/__init__.py | 2 +- pytorch_lightning/strategies/{hpu.py => single_hpu.py} | 0 tests/accelerators/test_accelerator_connector.py | 4 ++-- tests/accelerators/test_hpu.py | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename pytorch_lightning/plugins/precision/{hpu_precision.py => hpu.py} (100%) rename pytorch_lightning/strategies/{hpu.py => single_hpu.py} (100%) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 0c309bf28f912..c2a209a5d2d55 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -20,8 +20,8 @@ import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu import SingleHPUStrategy from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy +from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy from pytorch_lightning.utilities.imports import _HPU_AVAILABLE diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 0c786776af981..2f7a5dd4e05e5 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -10,7 +10,7 @@ from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin -from pytorch_lightning.plugins.precision.hpu_precision import HPUPrecisionPlugin +from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import 
NativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin diff --git a/pytorch_lightning/plugins/precision/hpu_precision.py b/pytorch_lightning/plugins/precision/hpu.py similarity index 100% rename from pytorch_lightning/plugins/precision/hpu_precision.py rename to pytorch_lightning/plugins/precision/hpu.py diff --git a/pytorch_lightning/strategies/__init__.py b/pytorch_lightning/strategies/__init__.py index 1cc928e489e13..3c8f30b8f30c5 100644 --- a/pytorch_lightning/strategies/__init__.py +++ b/pytorch_lightning/strategies/__init__.py @@ -21,13 +21,13 @@ from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 from pytorch_lightning.strategies.fully_sharded import DDPFullyShardedStrategy # noqa: F401 from pytorch_lightning.strategies.horovod import HorovodStrategy # noqa: F401 -from pytorch_lightning.strategies.hpu import SingleHPUStrategy # noqa: F401 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy # noqa: F401 from pytorch_lightning.strategies.ipu import IPUStrategy # noqa: F401 from pytorch_lightning.strategies.parallel import ParallelStrategy # noqa: F401 from pytorch_lightning.strategies.sharded import DDPShardedStrategy # noqa: F401 from pytorch_lightning.strategies.sharded_spawn import DDPSpawnShardedStrategy # noqa: F401 from pytorch_lightning.strategies.single_device import SingleDeviceStrategy # noqa: F401 +from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy # noqa: F401 from pytorch_lightning.strategies.single_tpu import SingleTPUStrategy # noqa: F401 from pytorch_lightning.strategies.strategy import Strategy # noqa: F401 from pytorch_lightning.strategies.strategy_registry import call_register_strategies, StrategyRegistry # noqa: F401 diff --git a/pytorch_lightning/strategies/hpu.py b/pytorch_lightning/strategies/single_hpu.py similarity index 100% rename from pytorch_lightning/strategies/hpu.py rename to pytorch_lightning/strategies/single_hpu.py diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 9a664743f7747..1c3e9f2f2e9b4 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -937,12 +937,12 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): @mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) def test_unsupported_hpu_choice(monkeypatch): - import pytorch_lightning.strategies.hpu as hpu + import pytorch_lightning.strategies.single_hpu as single_hpu import pytorch_lightning.utilities.imports as imports from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector monkeypatch.setattr(imports, "_HPU_AVAILABLE", True) - monkeypatch.setattr(hpu, "_HPU_AVAILABLE", True) + monkeypatch.setattr(single_hpu, "_HPU_AVAILABLE", True) monkeypatch.setattr(AcceleratorConnector, "_HPU_AVAILABLE", True) with pytest.raises( MisconfigurationException, diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index cc1b892935fa1..432980569d8e2 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -22,8 +22,8 @@ from pytorch_lightning.accelerators import HPUAccelerator from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu import SingleHPUStrategy from pytorch_lightning.strategies.hpu_parallel import 
HPUParallelStrategy +from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel From 37e00003caaf11995058eaea58a816002509c985 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 17 Mar 2022 12:16:46 +0400 Subject: [PATCH 078/167] Address reviews --- pytorch_lightning/plugins/__init__.py | 2 +- pytorch_lightning/plugins/io/__init__.py | 2 +- .../plugins/io/{hpu_io_plugin.py => hpu_plugin.py} | 0 pytorch_lightning/strategies/hpu_parallel.py | 2 +- pytorch_lightning/strategies/single_hpu.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename pytorch_lightning/plugins/io/{hpu_io_plugin.py => hpu_plugin.py} (100%) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 2f7a5dd4e05e5..0f1c4ca85ed5a 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -2,7 +2,7 @@ from pytorch_lightning.plugins.environments import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO +from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm diff --git a/pytorch_lightning/plugins/io/__init__.py b/pytorch_lightning/plugins/io/__init__.py index 0671d26a175e1..abd196eb2b1e3 100644 --- a/pytorch_lightning/plugins/io/__init__.py +++ b/pytorch_lightning/plugins/io/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO # noqa: F401 -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO # noqa: F401 +from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO # noqa: F401 from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO # noqa: F401 from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO # noqa: F401 diff --git a/pytorch_lightning/plugins/io/hpu_io_plugin.py b/pytorch_lightning/plugins/io/hpu_plugin.py similarity index 100% rename from pytorch_lightning/plugins/io/hpu_io_plugin.py rename to pytorch_lightning/plugins/io/hpu_plugin.py diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 115f6df284fee..1d8fae7805848 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -22,7 +22,7 @@ from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO +from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE diff --git a/pytorch_lightning/strategies/single_hpu.py b/pytorch_lightning/strategies/single_hpu.py index df1c048886dc6..1a1615f4acc68 100644 --- a/pytorch_lightning/strategies/single_hpu.py +++ b/pytorch_lightning/strategies/single_hpu.py @@ -15,7 +15,7 @@ from typing import Dict, Optional import pytorch_lightning as pl -from pytorch_lightning.plugins.io.hpu_io_plugin import HPUCheckpointIO +from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE From 07c60b412061f5b420da88d170053b98e6d9d469 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 18 Mar 2022 12:01:59 +0200 Subject: [PATCH 079/167] Address reviews Signed-off-by: Jerome --- pytorch_lightning/accelerators/hpu.py | 2 + pytorch_lightning/strategies/ddp.py | 12 ----- pytorch_lightning/strategies/hpu_parallel.py | 55 ++++++++++++++++---- pytorch_lightning/strategies/single_hpu.py | 2 +- pytorch_lightning/utilities/imports.py | 1 + 5 files changed, 49 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 38382aeab7fc5..1b9b7c2c56d14 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -18,6 +18,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities.rank_zero import rank_zero_debug class HPUAccelerator(Accelerator): @@ -30,6 +31,7 @@ def name() -> str: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """HPU device stats aren't supported yet.""" + rank_zero_debug("HPU device stats aren't supported yet.") return {} @staticmethod diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 8b499055227af..536a272bb48df 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -196,18 +196,6 
@@ def pre_configure_ddp(self): ) self._ddp_kwargs["find_unused_parameters"] = True - if self.root_device.type == "hpu": - self._static_graph = False - static_graph = self._ddp_kwargs.get("static_graph") - if static_graph: - # when _set_static_graph() is called find_unused_parameters does not have any significance. - # Resetting the value of find_unused_parameters to False which is the default value to DDP - self._ddp_kwargs["find_unused_parameters"] = False - self._static_graph = True - if static_graph is not None: - # DDP does not accept static_graph as a parameter, hence removing it from the list - del self._ddp_kwargs["static_graph"] - def _register_ddp_hooks(self) -> None: log.detail(f"{self.__class__.__name__}: registering ddp hooks") # In 1.8, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 1d8fae7805848..dd3f96e222c3e 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -28,16 +28,19 @@ from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.rank_zero import rank_zero_warn +from pytorch_lightning.utilities.imports import _TORCH_LESSER_EQUAL_1_10_2 log = logging.getLogger(__name__) class HPUParallelStrategy(DDPStrategy): - """Plugin for multi-process single-device training on one or multiple nodes. + """ Strategy for distributed training on multiple HPU devices """ - The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of - devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. - """ + # The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of + # devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. + + # Multi-device per process is not supported with habana strategy_name = "hpu_parallel" @@ -74,13 +77,45 @@ def setup_environment(self) -> None: def determine_ddp_device_ids(self) -> None: return None + def pre_configure_ddp(self): # type: ignore + # if unset, default `find_unused_parameters` `True` + # Many models require setting this parameter to True, as there are corner cases + # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. + # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) + if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False + ): + # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization + rank_zero_warn( + "From PyTorch 1.7.0, Lightning `manual_optimization` needs to set `find_unused_parameters=True` to" + " properly work with DDP. Using `find_unused_parameters=True`." + ) + self._ddp_kwargs["find_unused_parameters"] = True + + if self.root_device.type == "hpu": + self._static_graph = False + static_graph = self._ddp_kwargs.get("static_graph") + if static_graph: + # when _set_static_graph() is called find_unused_parameters does not have any significance. 
+ # Resetting the value of find_unused_parameters to False which is the default value to DDP + self._ddp_kwargs["find_unused_parameters"] = False + self._static_graph = True + if static_graph is not None: + # DDP does not accept static_graph as a parameter, hence removing it from the list + del self._ddp_kwargs["static_graph"] + def configure_ddp(self) -> None: - log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") - self.pre_configure_ddp() - self.model = self._setup_model(LightningDistributedModule(self.model)) - if self.root_device.type == "hpu" and self._static_graph: - self._model._set_static_graph() - self._register_ddp_hooks() + # DDP does not accept static graph as param with torch < 1.11 + if _TORCH_LESSER_EQUAL_1_10_2: + log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") + self.pre_configure_ddp() + self.model = self._setup_model(LightningDistributedModule(self.model)) # type: ignore + if self.root_device.type == "hpu" and self._static_graph: + self._model._set_static_graph() # type: ignore + self._register_ddp_hooks() + else: + self.configure_ddp() def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore obj = [obj] diff --git a/pytorch_lightning/strategies/single_hpu.py b/pytorch_lightning/strategies/single_hpu.py index 1a1615f4acc68..a9e62bbf2100d 100644 --- a/pytorch_lightning/strategies/single_hpu.py +++ b/pytorch_lightning/strategies/single_hpu.py @@ -24,7 +24,7 @@ class SingleHPUStrategy(SingleDeviceStrategy): - """Strategy for training on HPU devices.""" + """ Strategy for training on single HPU device """ strategy_name = "hpu_single" diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 9e6778e46ba00..31f7dfbc1f810 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -93,6 +93,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") +_TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") # _TORCH_GREATER_EQUAL_DEV_1_11 = _compare_version("torch", operator.ge, "1.11.0", use_base_version=True) _APEX_AVAILABLE = _module_available("apex.amp") From 12dc3ca0620f0c1c5e668bd49f8e97553dc49067 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Mar 2022 11:17:25 +0000 Subject: [PATCH 080/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/hpu_parallel.py | 10 +++++----- pytorch_lightning/strategies/single_hpu.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index d169703a0efb6..aa97002bb11d2 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -28,14 +28,14 @@ from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.imports import _TORCH_LESSER_EQUAL_1_10_2 +from 
pytorch_lightning.utilities.rank_zero import rank_zero_warn log = logging.getLogger(__name__) class HPUParallelStrategy(DDPStrategy): - """ Strategy for distributed training on multiple HPU devices """ + """Strategy for distributed training on multiple HPU devices.""" # The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of # devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. @@ -79,7 +79,7 @@ def setup_environment(self) -> None: def determine_ddp_device_ids(self) -> None: return None - def pre_configure_ddp(self): # type: ignore + def pre_configure_ddp(self): # type: ignore # if unset, default `find_unused_parameters` `True` # Many models require setting this parameter to True, as there are corner cases # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. @@ -112,9 +112,9 @@ def configure_ddp(self) -> None: if _TORCH_LESSER_EQUAL_1_10_2: log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") self.pre_configure_ddp() - self.model = self._setup_model(LightningDistributedModule(self.model)) # type: ignore + self.model = self._setup_model(LightningDistributedModule(self.model)) # type: ignore if self.root_device.type == "hpu" and self._static_graph: - self._model._set_static_graph() # type: ignore + self._model._set_static_graph() # type: ignore self._register_ddp_hooks() else: self.configure_ddp() diff --git a/pytorch_lightning/strategies/single_hpu.py b/pytorch_lightning/strategies/single_hpu.py index a9e62bbf2100d..120adc9bb8190 100644 --- a/pytorch_lightning/strategies/single_hpu.py +++ b/pytorch_lightning/strategies/single_hpu.py @@ -24,7 +24,7 @@ class SingleHPUStrategy(SingleDeviceStrategy): - """ Strategy for training on single HPU device """ + """Strategy for training on single HPU device.""" strategy_name = "hpu_single" From 30645443c171a529382bdbbb0bfdca360cd6f372 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 19 Mar 2022 15:54:35 +0900 Subject: [PATCH 081/167] Remove too many sections from sidebar --- docs/source/accelerators/hpu.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index c3faf5d2f8f6e..4b8e66aeb01b9 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -1,7 +1,7 @@ .. _hpu: -Habana Gaudi AI Processor -========================= +Habana Gaudi AI Processor (HPU) +=============================== Habana® Gaudi® AI training processors have been architected from the ground up and optimized for deep learning training efficiency. Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. @@ -14,7 +14,7 @@ With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're For more information, check out `` and ``_. 
PyTorch Lightning With Gaudi HPU -================================ +-------------------------------- Lightning supports training on a single HPU device or 8 HPU devices with the plugins described in the following sections @@ -64,7 +64,7 @@ For more details, please refer ` Date: Sat, 19 Mar 2022 16:28:02 +0900 Subject: [PATCH 082/167] Fix invalid formatting and links --- docs/source/accelerators/hpu.rst | 42 +++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 4b8e66aeb01b9..1e0f56d103af6 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -6,12 +6,14 @@ Habana Gaudi AI Processor (HPU) Habana® Gaudi® AI training processors have been architected from the ground up and optimized for deep learning training efficiency. Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. -You can use either the Gaudi-based AWS EC2 DL1 instances `` or the Supermicro X12 Gaudi server `< https://www.supermicro.com/en/solutions/habana-gaudi>` +You can use either `the Gaudi-based AWS EC2 DL1 instances `_ or `the Supermicro X12 Gaudi server `_. -Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU). +Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU). With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. -For more information, check out `` and ``_. +For more information, check out ``_ and ``_. + +---------------- PyTorch Lightning With Gaudi HPU -------------------------------- @@ -19,48 +21,58 @@ PyTorch Lightning With Gaudi HPU Lightning supports training on a single HPU device or 8 HPU devices with the plugins described in the following sections +---------------- + .. _hpu_accelerator: HPU accelerator --------------- -The :code:`devices=1` with :code:`accelerator="hpu"` parameters in the trainer class enables the Habana backend. +The ``devices=1`` with ``accelerator="hpu"`` parameters in the trainer class enables the Habana backend. + +---------------- .. _single_device_strategy: Training on Single HPU ---------------------- -The :code:`devices=1` and :code:`accelerator="hpu"` with :code:`strategy=SingleHPUStrategy(device=torch.device("hpu"))` parameter in the trainer class enables the Habana backend for single Gaudi training. +The ``devices=1`` and ``accelerator="hpu"`` with ``strategy=SingleHPUStrategy(device=torch.device("hpu"))`` parameter in the trainer class enables the Habana backend for single Gaudi training. +---------------- + .. _parallel_device_strategy: Distributed Training --------------------- -The :code:`devices=8` and :code:`accelerator="hpu"` with :code:`strategy=HPUParallelStrategy( parallel_devices=[torch.device("hpu")] * devices)` parameter in the trainer class enables the Habana backend for distributed training with 8 Gaudis. +The ``devices=8`` and ``accelerator="hpu"`` with ``strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")]*devices)`` parameter in the trainer class enables the Habana backend for distributed training with 8 Gaudis. 
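
As a quick illustration of the flags described above, a minimal sketch of an 8-device setup could look like the following; ``model`` and ``dm`` are stand-ins for any ``LightningModule`` and datamodule and are assumed to be defined elsewhere, and the strategy arguments simply mirror the ones named in this section:

.. code-block:: python

    import torch
    import pytorch_lightning as pl
    from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy

    num_hpus = 8

    # one HPU device entry per training process
    parallel_hpus = [torch.device("hpu")] * num_hpus

    trainer = pl.Trainer(
        accelerator="hpu",
        devices=num_hpus,
        strategy=HPUParallelStrategy(parallel_devices=parallel_hpus),
    )

    trainer.fit(model, datamodule=dm)
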
+ +The Habana parallel device strategy is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. -The Habana parallel device strategy is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. +---------------- .. _mixed_precision_plugin: Mixed Precision Plugin ---------------------- -The :code:`precision=16` and a :code:`hmp_params` parameter in the trainer class enables the Habana plugin for mixed precision using the Habana Mixed Precision (HMP) package. +The ``precision=16`` and a ``hmp_params`` parameter in the trainer class enables the Habana plugin for mixed precision using the Habana Mixed Precision (HMP) package. You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. The default settings enable users to easily enable mixed precision training with minimal code. -In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists. +In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists. -For more details, please refer ``_. +For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `_. +---------------- + .. _pytorch_lightning_examples: Getting Started with Lightning on Gaudi @@ -68,7 +80,9 @@ Getting Started with Lightning on Gaudi This section describes how to train models using PyTorch Lightning with Habana Gaudi. -More Lightning HPU examples can be found in pl_examples (``) +More Lightning HPU examples can be found in pl_examples (``_) + +---------------- Enabling Lightning with Single Gaudi HPU ---------------------------------------- @@ -107,6 +121,8 @@ The below snippet shows an example model using MNIST with single Habana Gaudi de trainer.fit(model, datamodule=dm) +---------------- + Enabling Lightning with 8 Gaudi HPUs (distributed) -------------------------------------------------- @@ -147,6 +163,8 @@ The below snippet shows an example model using MNIST with 8 Habana Gaudi devices trainer.fit(model, datamodule=dm) +---------------- + Enabling Mixed Precision Options -------------------------------- @@ -195,6 +213,8 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins trainer.fit(model, datamodule=dm) +---------------- + .. _known-limitations_hpu: Known limitations From e6eaa9f62bb37c312cc5080c5656a979c82cda04 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 11:37:18 +0530 Subject: [PATCH 083/167] Address reviews for HPUCHeckpointIO --- pytorch_lightning/plugins/io/hpu_plugin.py | 37 +--------------------- 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/pytorch_lightning/plugins/io/hpu_plugin.py b/pytorch_lightning/plugins/io/hpu_plugin.py index 214f54ed71af4..77ce1eff079d6 100644 --- a/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_plugin.py @@ -13,13 +13,12 @@ # limitations under the License. 
import os -from typing import Any, Callable, Dict, Optional +from typing import Any, Dict, Optional import torch from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem -from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.imports import _HPU_AVAILABLE from pytorch_lightning.utilities.types import _PATH @@ -37,37 +36,3 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) # write the checkpoint dictionary on the file atomic_save(checkpoint, path) - - def load_checkpoint( - self, path: _PATH, map_location: Optional[Callable] = lambda storage, loc: storage - ) -> Dict[str, Any]: - """Loads checkpoint using :func:`torch.load`, with additional handling for ``fsspec`` remote loading of - files. - - Args: - path: Path to checkpoint - map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage - locations. - - Returns: The loaded checkpoint. - - Raises: - FileNotFoundError: If ``path`` is not found by the ``fsspec`` filesystem - """ - - # Try to read the checkpoint at `path`. If not exist, do not restore checkpoint. - fs = get_filesystem(path) - if not fs.exists(path): - raise FileNotFoundError(f"Checkpoint at {path} not found. Aborting training.") - - return pl_load(path, map_location=map_location) - - def remove_checkpoint(self, path: _PATH) -> None: - """Remove checkpoint file from the filesystem. - - Args: - path: Path to checkpoint - """ - fs = get_filesystem(path) - if fs.exists(path): - fs.rm(path, recursive=True) From 33beabdbf493d29797c36c340457b78effa91b12 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 12:16:53 +0530 Subject: [PATCH 084/167] Address reviews for HPU + AcceleratorConnector --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index b1cc76a70ab7d..329c96de770a7 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -563,9 +563,6 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._accelerator_flag == "ipu": return IPUStrategy.strategy_name if self._accelerator_flag == "hpu": - if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU Accelerator requires HPU devices to run") - if self._parallel_devices and len(self._parallel_devices) > 1: return HPUParallelStrategy(parallel_devices=self.parallel_devices) # type: ignore else: From 759804e78b9f83e3dd6cd39a6f0ee774a6c1ba61 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 12:48:02 +0530 Subject: [PATCH 085/167] Fix tests --- .../trainer/connectors/accelerator_connector.py | 6 ++++++ pytorch_lightning/trainer/trainer.py | 2 +- tests/accelerators/test_hpu.py | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 329c96de770a7..ea2fb54309cf6 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -853,6 +853,12 @@ def num_gpus(self) -> int: return self.devices return 0 + @property + def num_hpus(self) -> 
int: + if isinstance(self.accelerator, HPUAccelerator): + return self.devices + return 0 + @property def gpus(self) -> Optional[Union[List[int], str, int]]: return self._gpus diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a9f1fab63e236..c502199b69144 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1796,7 +1796,7 @@ def _log_device_info(self) -> None: num_ipus = self.ipus if self.ipus is not None else 0 rank_zero_info(f"IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs") - num_hpus = self.devices if self.devices is not None else 0 + num_hpus = self.hpus if self.hpus is not None else 0 rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") if torch.cuda.is_available() and not isinstance(self.accelerator, GPUAccelerator): diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 432980569d8e2..0b4112501cc6e 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -284,7 +284,7 @@ def test_accelerator_auto_with_devices_hpu(): def test_set_devices_if_none_hpu(): trainer = Trainer(accelerator="hpu", devices=8) - assert trainer.devices == 8 + assert trainer.num_devices == 8 @RunIf(hpu=True) @@ -312,7 +312,7 @@ def test_device_type_when_training_plugin_hpu_passed(tmpdir): @RunIf(hpu=True) def test_devices_auto_choice_hpu(): trainer = Trainer(accelerator="auto", devices="auto") - assert trainer.devices == 8 + assert trainer.num_devices == 8 @RunIf(hpu=True) From bda7e369432fdf281fda0ade6deca57812a82c36 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 13:06:04 +0530 Subject: [PATCH 086/167] Address reviews --- tests/accelerators/test_common.py | 10 +--------- tests/accelerators/test_hpu.py | 4 ++++ tests/plugins/precision/test_hpu.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 tests/plugins/precision/test_hpu.py diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index 12aa1ea866015..ef8780f6986da 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -14,14 +14,7 @@ from unittest import mock from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import ( - Accelerator, - CPUAccelerator, - GPUAccelerator, - HPUAccelerator, - IPUAccelerator, - TPUAccelerator, -) +from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator from pytorch_lightning.strategies import DDPStrategy @@ -31,7 +24,6 @@ def test_auto_device_count(device_count_mock): assert GPUAccelerator.auto_device_count() == 2 assert TPUAccelerator.auto_device_count() == 8 assert IPUAccelerator.auto_device_count() == 4 - assert HPUAccelerator.auto_device_count() == 8 def test_pluggable_accelerator(): diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 0b4112501cc6e..7519be85cdb1b 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -324,3 +324,7 @@ def test_inference_only(tmpdir, hpus): trainer.validate(model) trainer.test(model) trainer.predict(model) + + +def test_hpu_auto_device_count(): + assert HPUAccelerator.auto_device_count() == 8 diff --git a/tests/plugins/precision/test_hpu.py b/tests/plugins/precision/test_hpu.py new file mode 100644 index 0000000000000..d7aa17d7f8468 --- /dev/null +++ b/tests/plugins/precision/test_hpu.py @@ -0,0 +1,13 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From bdc19be1ae18b8b6bea1e379bddc5d8ac9ed59da Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 13:10:00 +0530 Subject: [PATCH 087/167] Remove setting hpu accelerator by just strategy --- .../trainer/connectors/accelerator_connector.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index ea2fb54309cf6..59db2bee9d45e 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -195,7 +195,6 @@ def __init__( # 2. Instantiate Accelerator # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() - self._set_accelerator_if_hpu_strategy_is_passed() if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_accelerator() self._set_parallel_devices_and_init_accelerator() @@ -479,10 +478,6 @@ def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: if isinstance(self._strategy_flag, IPUStrategy): self._accelerator_flag = "ipu" - def _set_accelerator_if_hpu_strategy_is_passed(self) -> None: - if isinstance(self._strategy_flag, SingleHPUStrategy): - self._accelerator_flag = "hpu" - def _choose_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if self._accelerator_flag == "auto": From 2d34cc5156f35a1c90f9d84d7496391b910ccd31 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 13:19:19 +0530 Subject: [PATCH 088/167] Remove unnecessary properties for HPU --- .../trainer/connectors/accelerator_connector.py | 6 ------ pytorch_lightning/trainer/trainer.py | 10 +--------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 59db2bee9d45e..86cf4d579b766 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -848,12 +848,6 @@ def num_gpus(self) -> int: return self.devices return 0 - @property - def num_hpus(self) -> int: - if isinstance(self.accelerator, HPUAccelerator): - return self.devices - return 0 - @property def gpus(self) -> Optional[Union[List[int], str, int]]: return self._gpus diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c502199b69144..ed9c1cb0f1444 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1796,7 +1796,7 @@ def _log_device_info(self) -> None: num_ipus = self.ipus if self.ipus is not None else 0 rank_zero_info(f"IPU available: {_IPU_AVAILABLE}, using: {num_ipus} IPUs") - num_hpus = self.hpus if self.hpus is not None else 0 + num_hpus = self.num_devices if isinstance(self.accelerator, HPUAccelerator) else 0 rank_zero_info(f"HPU available: 
{_HPU_AVAILABLE}, using: {num_hpus} HPUs") if torch.cuda.is_available() and not isinstance(self.accelerator, GPUAccelerator): @@ -2079,18 +2079,10 @@ def tpu_cores(self) -> int: def ipus(self) -> int: return self._accelerator_connector.num_ipus - @property - def hpus(self) -> int: - return self._accelerator_connector.num_hpus - @property def num_gpus(self) -> int: return self._accelerator_connector.num_gpus - @property - def num_hpus(self) -> int: - return self._accelerator_connector.num_hpus - @property def devices(self) -> int: rank_zero_deprecation( From c32601adae0f05c20a628c750c40efa43cd28f69 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 15:06:23 +0530 Subject: [PATCH 089/167] Fix HPU tests --- tests/accelerators/test_hpu.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 7519be85cdb1b..be22dcb1d2683 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -44,27 +44,17 @@ def test_availability(): @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") @mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) -def test_fail_if_no_hpus(tmpdir): +def test_fail_if_no_hpus(): with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): - Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1) - - with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): - Trainer(default_root_dir=tmpdir, devices=1, accelerator="hpu") + Trainer(accelerator="hpu", devices=1) @RunIf(hpu=True) -def test_accelerator_selected(tmpdir): - trainer = Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1) +def test_accelerator_selected(): + trainer = Trainer(accelerator="hpu") assert isinstance(trainer.accelerator, HPUAccelerator) -@RunIf(hpu=True) -def test_no_warning_plugin(tmpdir): - with pytest.warns(None) as record: - Trainer(default_root_dir=tmpdir, max_epochs=1, strategy=SingleHPUStrategy(device=torch.device("hpu"))) - assert len(record) == 0 - - @RunIf(hpu=True) def test_all_stages(tmpdir, hpus): model = BoringModel() From f43750e0b5c6184a1aa19877ce479063beec8bd9 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 15:23:43 +0530 Subject: [PATCH 090/167] Move tests --- tests/accelerators/test_hpu.py | 79 ++++------------------------- tests/plugins/precision/test_hpu.py | 68 +++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 69 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index be22dcb1d2683..466b03889051a 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from typing import Optional from unittest import mock import pytest @@ -20,7 +19,6 @@ from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.accelerators import HPUAccelerator -from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy @@ -117,61 +115,6 @@ def test_optimization(tmpdir): assert saved_result == test_result -@RunIf(hpu=True) -def test_mixed_precision(tmpdir, hmp_params): - class TestCallback(Callback): - def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: - assert trainer.strategy.model.precision == "bf16" - raise SystemExit - - model = BoringModel() - trainer = Trainer( - strategy=SingleHPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) - ), - default_root_dir=tmpdir, - fast_dev_run=True, - accelerator="hpu", - devices=1, - callbacks=TestCallback(), - ) - assert isinstance(trainer.strategy, SingleHPUStrategy) - assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) - assert trainer.strategy.precision_plugin.precision == "bf16" - with pytest.raises(SystemExit): - trainer.fit(model) - - -@RunIf(hpu=True) -def test_pure_half_precision(tmpdir, hmp_params): - class TestCallback(Callback): - def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - assert trainer.strategy.model.precision == 16 - for param in trainer.strategy.model.parameters(): - assert param.dtype == torch.float16 - raise SystemExit - - model = BoringModel() - model = model.half() - trainer = Trainer( - strategy=SingleHPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) - ), - default_root_dir=tmpdir, - fast_dev_run=True, - accelerator="hpu", - devices=1, - callbacks=TestCallback(), - ) - - assert isinstance(trainer.strategy, SingleHPUStrategy) - assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) - assert trainer.strategy.precision_plugin.precision == 16 - - with pytest.raises(SystemExit): - trainer.fit(model) - - @RunIf(hpu=True) def test_stages_correct(tmpdir): """Ensure all stages correctly are traced correctly by asserting the output for each stage.""" @@ -224,13 +167,6 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da trainer.predict(model, model.test_dataloader()) -@RunIf(hpu=True) -def test_precision_plugin(tmpdir, hmp_params): - - plugin = HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) - assert plugin.precision == "bf16" - - @RunIf(hpu=True) def test_accelerator_hpu(): @@ -278,24 +214,29 @@ def test_set_devices_if_none_hpu(): @RunIf(hpu=True) -def test_strategy_choice_hpu_plugin(tmpdir): +def test_strategy_choice_hpu_plugin(): trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) assert isinstance(trainer.strategy, SingleHPUStrategy) + trainer = Trainer(accelerator="hpu", devices=1) + assert isinstance(trainer.strategy, SingleHPUStrategy) + @RunIf(hpu=True) -def test_strategy_choice_hpu_parallel_plugin(tmpdir): +def test_strategy_choice_hpu_parallel_plugin(): trainer = Trainer( strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * 8), accelerator="hpu", devices=8 ) assert isinstance(trainer.strategy, 
HPUParallelStrategy) + trainer = Trainer(accelerator="hpu", devices=8) + assert isinstance(trainer.strategy, HPUParallelStrategy) + @RunIf(hpu=True) -def test_device_type_when_training_plugin_hpu_passed(tmpdir): +def test_hpu_accelerator_type(): - trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) - assert isinstance(trainer.strategy, SingleHPUStrategy) + trainer = Trainer(accelerator="hpu", devices=1) assert isinstance(trainer.accelerator, HPUAccelerator) diff --git a/tests/plugins/precision/test_hpu.py b/tests/plugins/precision/test_hpu.py index d7aa17d7f8468..74d2cd259f053 100644 --- a/tests/plugins/precision/test_hpu.py +++ b/tests/plugins/precision/test_hpu.py @@ -11,3 +11,71 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + +import pytest +import torch + +from pytorch_lightning import Callback, LightningModule, Trainer +from pytorch_lightning.plugins import HPUPrecisionPlugin +from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy +from tests.helpers.boring_model import BoringModel +from tests.helpers.runif import RunIf + + +@RunIf(hpu=True) +def test_precision_plugin(hmp_params): + + plugin = HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) + assert plugin.precision == "bf16" + + +@RunIf(hpu=True) +def test_mixed_precision(tmpdir, hmp_params): + class TestCallback(Callback): + def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: + assert trainer.strategy.model.precision == "bf16" + raise SystemExit + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="hpu", + devices=1, + plugins=[HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params)], + callbacks=TestCallback(), + ) + assert isinstance(trainer.strategy, SingleHPUStrategy) + assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) + assert trainer.strategy.precision_plugin.precision == "bf16" + with pytest.raises(SystemExit): + trainer.fit(model) + + +@RunIf(hpu=True) +def test_pure_half_precision(tmpdir, hmp_params): + class TestCallback(Callback): + def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: + assert trainer.strategy.model.precision == 16 + for param in trainer.strategy.model.parameters(): + assert param.dtype == torch.float16 + raise SystemExit + + model = BoringModel() + model = model.half() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="hpu", + devices=1, + plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)], + callbacks=TestCallback(), + ) + + assert isinstance(trainer.strategy, SingleHPUStrategy) + assert isinstance(trainer.strategy.precision_plugin, HPUPrecisionPlugin) + assert trainer.strategy.precision_plugin.precision == 16 + + with pytest.raises(SystemExit): + trainer.fit(model) From 4e092864f30367b4b828d925b4dbf5e83644d637 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 15:46:37 +0530 Subject: [PATCH 091/167] Improve docs --- docs/source/accelerators/hpu.rst | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 1e0f56d103af6..d9d8395037800 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -28,7 +28,7 @@ 
Lightning supports training on a single HPU device or 8 HPU devices with the plu HPU accelerator --------------- -The ``devices=1`` with ``accelerator="hpu"`` parameters in the trainer class enables the Habana backend. +The ``accelerator="hpu"`` parameters in the trainer class enables the Habana backend. ---------------- @@ -112,10 +112,10 @@ The below snippet shows an example model using MNIST with single Habana Gaudi de num_hpus = 1 # enable HPU strategy for single device, with mixed precision using default HMP settings - hpustrat_1 = SingleHPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16)) + hpu_strategy = SingleHPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16)) # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1) + trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpu_strategy) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -148,16 +148,8 @@ The below snippet shows an example model using MNIST with 8 Habana Gaudi devices ... - num_hpus = 8 - - # setup parallel strategy for 8 HPU's - hpustrat_8 = HPUParallelStrategy( - parallel_devices=[torch.device("hpu")] * num_hpus, - precision_plugin=HPUPrecisionPlugin(precision=16), - ) - - # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_8) + # Initialize a trainer with HPU accelerator with 8 devices + trainer = pl.Trainer(accelerator="hpu", devices=8, plugins=[HPUPrecisionPlugin(precision=16)]) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -201,13 +193,9 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins hmp_params["bf16_ops"] = "ops_bf16_mnist.txt" hmp_params["fp32_ops"] = "ops_fp32_mnist.txt" - # enable HPU strategy for single device, with mixed precision using overidden HMP settings - hpustrat_1 = SingleHPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) - ) - - # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpustrat_1) + # Initialize a trainer with HPU accelerator for HPU strategy for single device, + # with mixed precision using overidden HMP settings + trainer = pl.Trainer(accelerator="hpu", devices=1, plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)]) # Train the model ⚡ trainer.fit(model, datamodule=dm) From ab2f595b2b87d4198f3cebe4b85f61796adfe502 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 15:54:19 +0530 Subject: [PATCH 092/167] Improve tests --- tests/accelerators/test_hpu.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 466b03889051a..c7f4e2279f913 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -56,20 +56,13 @@ def test_accelerator_selected(): @RunIf(hpu=True) def test_all_stages(tmpdir, hpus): model = BoringModel() - parallel_devices = hpus - hpustrat_1 = SingleHPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None) - ) - hpustrat_8 = HPUParallelStrategy( - parallel_devices=[torch.device("hpu")] * parallel_devices, - precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=None), - ) + trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", - 
devices=parallel_devices, - strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, + devices=hpus, + plugins=[HPUPrecisionPlugin(precision=16, hmp_params=None)], ) trainer.fit(model) trainer.validate(model) From 549d7846db4eda3eed64e9567a48d2fab22e8ab3 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 15:56:12 +0530 Subject: [PATCH 093/167] Update Changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb39df02a7b23..fc020ce56c0df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -158,6 +158,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `Callback.state_dict()` and `Callback.load_state_dict()` methods ([#12232](https://github.com/PyTorchLightning/pytorch-lightning/pull/12232)) +- Added support for Habana Accelerator (HPU) ([#11808](https://github.com/PyTorchLightning/pytorch-lightning/pull/11808)) + + ### Changed - Drop PyTorch 1.7 support ([#12191](https://github.com/PyTorchLightning/pytorch-lightning/pull/12191)) From ec929df02a198af6704c23b05d725e2b3b63b854 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 16:16:36 +0530 Subject: [PATCH 094/167] Fix test for the rigth device type --- pytorch_lightning/strategies/hpu_parallel.py | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index aa97002bb11d2..8590dc7b2deac 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -17,6 +17,7 @@ import torch import torch.distributed +from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule @@ -25,6 +26,7 @@ from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -127,6 +129,26 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] + def teardown(self) -> None: + log.detail(f"{self.__class__.__name__}: tearing down DDP plugin") + super().teardown() + if isinstance(self.model, DistributedDataParallel): + self.model = self.lightning_module + + if ( + self.lightning_module.trainer is not None + and self.lightning_module.trainer.state.fn == TrainerFn.FITTING + and self._layer_sync + ): + # `self.lightning_module.trainer` can be None if teardown gets called on an exception before + # the trainer gets set on the LightningModule + self.model = self._layer_sync.revert(self.model) + + if self.root_device.type == "hpu": + # GPU teardown + log.detail(f"{self.__class__.__name__}: moving model to CPU") + self.lightning_module.cpu() + @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( From c55a82fb99bdb8ab7bf09478de977be6093ae341 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 16:37:16 +0530 Subject: [PATCH 095/167] Fix tests --- tests/accelerators/test_accelerator_connector.py | 16 ---------------- tests/accelerators/test_hpu.py | 4 +--- 2 files changed, 1 insertion(+), 
19 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index ca32609b39989..ff0e285f19085 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -936,22 +936,6 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): Trainer(accelerator="ipu", precision=64) -@mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) -def test_unsupported_hpu_choice(monkeypatch): - import pytorch_lightning.strategies.single_hpu as single_hpu - import pytorch_lightning.utilities.imports as imports - from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector - - monkeypatch.setattr(imports, "_HPU_AVAILABLE", True) - monkeypatch.setattr(single_hpu, "_HPU_AVAILABLE", True) - monkeypatch.setattr(AcceleratorConnector, "_HPU_AVAILABLE", True) - with pytest.raises( - MisconfigurationException, - match=r"accelerator='hpu', precision=64\)` is not supported|HPU Accelerator requires HPU devices to run", - ): - Trainer(accelerator="hpu", precision=64) - - @mock.patch("torch.cuda.is_available", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index c7f4e2279f913..baa8a98b23f05 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from unittest import mock import pytest import torch @@ -41,9 +40,8 @@ def test_availability(): @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine") -@mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) def test_fail_if_no_hpus(): - with pytest.raises(MisconfigurationException, match="HPU Accelerator requires HPU devices to run"): + with pytest.raises(MisconfigurationException, match="HPUAccelerator can not run on your system"): Trainer(accelerator="hpu", devices=1) From 05dcc1c55c95bea633bdc88e0b255e5de57ffd2f Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 17:15:30 +0530 Subject: [PATCH 096/167] Fix tests --- pytorch_lightning/accelerators/hpu.py | 4 ++-- pytorch_lightning/strategies/hpu_parallel.py | 16 +--------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 1b9b7c2c56d14..5be4e96e1983b 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -40,9 +40,9 @@ def parse_devices(devices: int) -> int: return devices @staticmethod - def get_parallel_devices(devices: int) -> List[int]: + def get_parallel_devices(devices: int) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" - return list(range(devices)) + return [torch.device("hpu")] * devices @staticmethod def auto_device_count() -> int: diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 8590dc7b2deac..c968b1f04482d 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -17,7 +17,6 @@ import torch import torch.distributed -from torch.nn.parallel.distributed import DistributedDataParallel import 
pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule @@ -26,7 +25,6 @@ from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -130,22 +128,10 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore return obj[0] def teardown(self) -> None: - log.detail(f"{self.__class__.__name__}: tearing down DDP plugin") + log.detail(f"{self.__class__.__name__}: tearing down `HPUParallel` Strategy") super().teardown() - if isinstance(self.model, DistributedDataParallel): - self.model = self.lightning_module - - if ( - self.lightning_module.trainer is not None - and self.lightning_module.trainer.state.fn == TrainerFn.FITTING - and self._layer_sync - ): - # `self.lightning_module.trainer` can be None if teardown gets called on an exception before - # the trainer gets set on the LightningModule - self.model = self._layer_sync.revert(self.model) if self.root_device.type == "hpu": - # GPU teardown log.detail(f"{self.__class__.__name__}: moving model to CPU") self.lightning_module.cpu() From f5a333b41f40d9d05c19723a1885e44ff6acbe50 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 21 Mar 2022 20:07:16 +0530 Subject: [PATCH 097/167] Address reviews --- pytorch_lightning/plugins/io/hpu_plugin.py | 7 ++----- pytorch_lightning/plugins/precision/hpu.py | 17 ++++++++--------- pytorch_lightning/strategies/single_hpu.py | 8 ++++---- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/plugins/io/hpu_plugin.py b/pytorch_lightning/plugins/io/hpu_plugin.py index 77ce1eff079d6..fc831cdc99d76 100644 --- a/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_plugin.py @@ -18,8 +18,8 @@ import torch from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO +from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem -from pytorch_lightning.utilities.imports import _HPU_AVAILABLE from pytorch_lightning.utilities.types import _PATH @@ -30,9 +30,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio fs = get_filesystem(path) fs.makedirs(os.path.dirname(path), exist_ok=True) - if _HPU_AVAILABLE: - from pytorch_lightning.utilities.apply_func import move_data_to_device - - checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) + checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) # write the checkpoint dictionary on the file atomic_save(checkpoint, path) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index 565a90fb7c8ad..f947062763e4e 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -36,14 +36,13 @@ def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) - raise MisconfigurationException("HPU precision plugin requires HPU support.") super().__init__() self.precision = precision - if hmp_params is not None: + if not hmp_params: + return - from habana_frameworks.torch.hpex import hmp + from habana_frameworks.torch.hpex import hmp - 
hmp_opt_level = hmp_params["level"] # type: ignore - hmp_bf16 = hmp_params["bf16_ops"] # type: ignore - hmp_fp32 = hmp_params["fp32_ops"] # type: ignore - hmp_verbose = hmp_params["verbose"] # type: ignore - hmp.convert( - opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose - ) + hmp_opt_level = hmp_params["level"] # type: ignore + hmp_bf16 = hmp_params["bf16_ops"] # type: ignore + hmp_fp32 = hmp_params["fp32_ops"] # type: ignore + hmp_verbose = hmp_params["verbose"] # type: ignore + hmp.convert(opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose) diff --git a/pytorch_lightning/strategies/single_hpu.py b/pytorch_lightning/strategies/single_hpu.py index 120adc9bb8190..eeb4b2aebd583 100644 --- a/pytorch_lightning/strategies/single_hpu.py +++ b/pytorch_lightning/strategies/single_hpu.py @@ -34,7 +34,6 @@ def __init__( accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, checkpoint_io: Optional[HPUCheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, - hmp_params: Optional[str] = None, ): if not _HPU_AVAILABLE: @@ -46,10 +45,11 @@ def __init__( import habana_frameworks.torch.core import habana_frameworks.torch.core.hccl # noqa: F401 - device = device - checkpoint_io = checkpoint_io or HPUCheckpointIO() super().__init__( - accelerator=accelerator, device=device, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin + accelerator=accelerator, + device=device, + checkpoint_io=checkpoint_io or HPUCheckpointIO(), + precision_plugin=precision_plugin, ) @property From 57b9c243d50172d739ea3750dbf6b9e2d73a0d7c Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 22 Mar 2022 00:18:48 +0530 Subject: [PATCH 098/167] Update plugins --- pytorch_lightning/accelerators/hpu.py | 2 +- pytorch_lightning/plugins/io/hpu_plugin.py | 2 +- pytorch_lightning/strategies/hpu_parallel.py | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 5be4e96e1983b..653925b696c16 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -47,7 +47,7 @@ def get_parallel_devices(devices: int) -> List[torch.device]: @staticmethod def auto_device_count() -> int: """Get the devices when set to auto.""" - # TODO: Update this when api is exposed by the Habana team + # TODO(@kaushikb11): Update this when api is exposed by the Habana team return 8 @staticmethod diff --git a/pytorch_lightning/plugins/io/hpu_plugin.py b/pytorch_lightning/plugins/io/hpu_plugin.py index fc831cdc99d76..7ff2a4c1a63fa 100644 --- a/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_plugin.py @@ -31,5 +31,5 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio fs.makedirs(os.path.dirname(path), exist_ok=True) checkpoint = move_data_to_device(checkpoint, torch.device("cpu")) - # write the checkpoint dictionary on the file + # write the checkpoint dictionary to the provided path atomic_save(checkpoint, path) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index c968b1f04482d..3e6b95a62c410 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -25,12 +25,15 @@ from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.utilities import _HPU_AVAILABLE from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_LESSER_EQUAL_1_10_2 +from pytorch_lightning.utilities.imports import _HPU_AVAILABLE, _TORCH_LESSER_EQUAL_1_10_2 from pytorch_lightning.utilities.rank_zero import rank_zero_warn +if _HPU_AVAILABLE: + import habana_frameworks.torch.core.hccl # noqa: F401 + from habana_frameworks.torch.utils.library_loader import load_habana_module + log = logging.getLogger(__name__) @@ -65,11 +68,7 @@ def __init__( ) def setup_environment(self) -> None: - - from habana_frameworks.torch.utils.library_loader import load_habana_module - load_habana_module() - import habana_frameworks.torch.core.hccl # noqa: F401 os.environ["ID"] = str(self.local_rank) os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" From 3dd763c6f9964fba2c655721ced577261902f352 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Tue, 22 Mar 2022 09:20:22 +0530 Subject: [PATCH 099/167] Update docs/source/accelerators/hpu.rst Co-authored-by: thomas chaton --- docs/source/accelerators/hpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index d9d8395037800..c9a170700a47b 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -28,7 +28,7 @@ Lightning supports training on a single HPU device or 8 HPU devices with the plu HPU accelerator --------------- -The ``accelerator="hpu"`` parameters in the trainer class enables the Habana backend. +To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``Trainer(accelerator="hpu")`` parameters in the trainer class. 
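
For instance, on a machine with the Habana software stack installed, a minimal sketch (mirroring the tests added in this series) looks like the snippet below; with ``devices=1`` Lightning selects the single-device HPU strategy, while ``devices=8`` selects the parallel HPU strategy:

.. code-block:: python

    import pytorch_lightning as pl

    # single Gaudi device
    trainer = pl.Trainer(accelerator="hpu", devices=1)

    # all eight Gaudi devices on the node
    trainer = pl.Trainer(accelerator="hpu", devices=8)
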
---------------- From 773a7a0ab15fb8abab6614ab5010e89fdd9e5f25 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 22 Mar 2022 09:31:36 +0530 Subject: [PATCH 100/167] Update HPU mnist example --- .../hpu_examples/simple_mnist/mnist.py | 74 +++++++------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index c2a209a5d2d55..71a22f3ef41e3 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -20,9 +20,6 @@ import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule from pytorch_lightning.plugins import HPUPrecisionPlugin -from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy -from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy -from pytorch_lightning.utilities.imports import _HPU_AVAILABLE def parse_args(): @@ -87,47 +84,32 @@ def configure_optimizers(self): if __name__ == "__main__": + args = parse_args() + + # Init our model + model = LitClassifier() + + # Init DataLoader from MNIST Dataset + dm = MNISTDataModule(batch_size=args.batch_size) + + # TBD: import these keys from hmp + hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] + hmp_params = dict.fromkeys(hmp_keys) + hmp_params["level"] = args.hmp_opt_level + hmp_params["verbose"] = args.hmp_verbose + hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" + hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" + + # Initialize a trainer + trainer = pl.Trainer( + default_root_dir=os.getcwd(), + accelerator="hpu", + devices=args.hpus, + plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)], + max_epochs=args.epochs, + ) - if _HPU_AVAILABLE: - - args = parse_args() - - # Init our model - model = LitClassifier() - - # Init DataLoader from MNIST Dataset - dm = MNISTDataModule(batch_size=args.batch_size) - - # TBD: import these keys from hmp - hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] - hmp_params = dict.fromkeys(hmp_keys) - hmp_params["level"] = args.hmp_opt_level - hmp_params["verbose"] = args.hmp_verbose - hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" - hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" - - parallel_devices = args.hpus - hpustrat_1 = SingleHPUStrategy( - device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params) - ) - hpustrat_8 = HPUParallelStrategy( - parallel_devices=[torch.device("hpu")] * parallel_devices, - precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params), - ) - - # Initialize a trainer - trainer = pl.Trainer( - strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1, - devices=parallel_devices, - max_epochs=args.epochs, - default_root_dir=os.getcwd(), - accelerator="hpu", - ) - - # Train the model ⚡ - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) - trainer.validate(model, datamodule=dm) - - else: - print("This example is supported only on HPU !") + # Train the model ⚡ + trainer.fit(model, datamodule=dm) + trainer.test(model, datamodule=dm) + trainer.validate(model, datamodule=dm) From 9378c8781f3b0edb668a78c3c82e4a97dc2725cd Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 22 Mar 2022 09:44:48 +0530 Subject: [PATCH 101/167] Update 
strategy --- pytorch_lightning/strategies/single_hpu.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/strategies/single_hpu.py b/pytorch_lightning/strategies/single_hpu.py index eeb4b2aebd583..a5dac56dcf867 100644 --- a/pytorch_lightning/strategies/single_hpu.py +++ b/pytorch_lightning/strategies/single_hpu.py @@ -22,6 +22,10 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _DEVICE +if _HPU_AVAILABLE: + import habana_frameworks.torch.core.hccl # noqa: F401 + from habana_frameworks.torch.utils.library_loader import load_habana_module + class SingleHPUStrategy(SingleDeviceStrategy): """Strategy for training on single HPU device.""" @@ -39,12 +43,7 @@ def __init__( if not _HPU_AVAILABLE: raise MisconfigurationException("`SingleHPUStrategy` requires HPU devices to run") - from habana_frameworks.torch.utils.library_loader import load_habana_module - load_habana_module() - import habana_frameworks.torch.core - import habana_frameworks.torch.core.hccl # noqa: F401 - super().__init__( accelerator=accelerator, device=device, From 9aefcd288cdea424850392140493d18f805379d4 Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 22 Mar 2022 10:36:14 +0200 Subject: [PATCH 102/167] Address reviews Signed-off-by: Jerome --- pytorch_lightning/strategies/hpu_parallel.py | 2 +- tests/accelerators/test_hpu.py | 6 +----- tests/conftest.py | 1 + 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 3e6b95a62c410..3054957c1b143 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -132,7 +132,7 @@ def teardown(self) -> None: if self.root_device.type == "hpu": log.detail(f"{self.__class__.__name__}: moving model to CPU") - self.lightning_module.cpu() + self.lightning_module.cpu() # type: ignore @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index baa8a98b23f05..5ea7ea73d21cf 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -28,11 +28,6 @@ from tests.helpers.runif import RunIf from tests.helpers.simple_models import ClassificationModel -if _HPU_AVAILABLE: - import habana_frameworks.torch.core as htcore # noqa: F401 - - os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" - @RunIf(hpu=True) def test_availability(): @@ -53,6 +48,7 @@ def test_accelerator_selected(): @RunIf(hpu=True) def test_all_stages(tmpdir, hpus): + """Tests all the model stages using BoringModel on HPU""" model = BoringModel() trainer = Trainer( diff --git a/tests/conftest.py b/tests/conftest.py index fd4573c33cbcb..f8c0e1d53e535 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -63,6 +63,7 @@ def restore_env_variables(): "MASTER_PORT", "PL_GLOBAL_SEED", "PL_SEED_WORKERS", + "PL_TORCH_DISTRIBUTED_BACKEND", "WANDB_MODE", "WANDB_REQUIRE_SERVICE", "WANDB_SERVICE", From 1f0b18723bdc115bd1200e164f156532183d9702 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Mar 2022 08:38:08 +0000 Subject: [PATCH 103/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/hpu_parallel.py | 2 +- tests/accelerators/test_hpu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 3054957c1b143..98db1576bea14 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -132,7 +132,7 @@ def teardown(self) -> None: if self.root_device.type == "hpu": log.detail(f"{self.__class__.__name__}: moving model to CPU") - self.lightning_module.cpu() # type: ignore + self.lightning_module.cpu() # type: ignore @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 5ea7ea73d21cf..cc8c2c2277712 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -48,7 +48,7 @@ def test_accelerator_selected(): @RunIf(hpu=True) def test_all_stages(tmpdir, hpus): - """Tests all the model stages using BoringModel on HPU""" + """Tests all the model stages using BoringModel on HPU.""" model = BoringModel() trainer = Trainer( From 1d30ef90a564253ef4a270fadc2604fad5775bef Mon Sep 17 00:00:00 2001 From: Jerome Date: Tue, 22 Mar 2022 12:17:59 +0200 Subject: [PATCH 104/167] Add precision tests to azure pipeline Signed-off-by: Jerome --- .azure-pipelines/run_hpu_tests.py | 12 +++++++----- .../precision/hpu/ops_bf16.txt} | 0 .../precision/hpu/ops_fp32.txt} | 0 tests/plugins/precision/{ => hpu}/test_hpu.py | 0 4 files changed, 7 insertions(+), 5 deletions(-) rename tests/{accelerators/ops_bf16_mnist.txt => plugins/precision/hpu/ops_bf16.txt} (100%) rename tests/{accelerators/ops_fp32_mnist.txt => plugins/precision/hpu/ops_fp32.txt} (100%) rename tests/plugins/precision/{ => hpu}/test_hpu.py (100%) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index cd8531103529e..026277b5f655c 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -13,8 +13,6 @@ HPU_TESTS_DICTIONARY = { "hpu1_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ - --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \ - --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \ --forked \ --junitxml=hpu1_test-results.xml", "hpu2_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ @@ -32,19 +30,23 @@ --forked \ --junitxml=hpu4_test-results.xml", "hpu8_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \ - --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \ - --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \ --forked \ --hpus 8 \ --junitxml=hpu8_test-results.xml", + "hpu1_precision_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/plugins/precision/hpu/test_hpu.py \ + --hmp-bf16 'tests/plugins/precision/hpu/ops_bf16.txt' \ + --hmp-fp32 'tests/plugins/precision/hpu/ops_fp32.txt' \ + --forked \ + --junitxml=hpu1_precision_test-results.xml", } HPU1_TEST = HPU_TESTS_DICTIONARY["hpu1_test"] HPU2_TEST = HPU_TESTS_DICTIONARY["hpu2_test"] HPU4_TEST = HPU_TESTS_DICTIONARY["hpu4_test"] HPU8_TEST = HPU_TESTS_DICTIONARY["hpu8_test"] +HPU1_PRECISION_TEST = HPU_TESTS_DICTIONARY["hpu1_precision_test"] -PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST]] +PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST],[HPU1_PRECISION_TEST]] TIMEOUT = 60 TIMEOUT_EXIT_CODE = -9 diff --git a/tests/accelerators/ops_bf16_mnist.txt 
b/tests/plugins/precision/hpu/ops_bf16.txt similarity index 100% rename from tests/accelerators/ops_bf16_mnist.txt rename to tests/plugins/precision/hpu/ops_bf16.txt diff --git a/tests/accelerators/ops_fp32_mnist.txt b/tests/plugins/precision/hpu/ops_fp32.txt similarity index 100% rename from tests/accelerators/ops_fp32_mnist.txt rename to tests/plugins/precision/hpu/ops_fp32.txt diff --git a/tests/plugins/precision/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py similarity index 100% rename from tests/plugins/precision/test_hpu.py rename to tests/plugins/precision/hpu/test_hpu.py From fd9488f654565ace2070dd6b29598c9533944e09 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Mar 2022 10:19:56 +0000 Subject: [PATCH 105/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure-pipelines/run_hpu_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 026277b5f655c..6403c1bfcac7c 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -46,7 +46,7 @@ HPU8_TEST = HPU_TESTS_DICTIONARY["hpu8_test"] HPU1_PRECISION_TEST = HPU_TESTS_DICTIONARY["hpu1_precision_test"] -PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST],[HPU1_PRECISION_TEST]] +PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST], [HPU1_PRECISION_TEST]] TIMEOUT = 60 TIMEOUT_EXIT_CODE = -9 From a4f79fb1b9a7bfb1028755dfdac4df020717b2c8 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 22 Mar 2022 19:32:37 +0530 Subject: [PATCH 106/167] Add comments --- pytorch_lightning/plugins/precision/hpu.py | 5 +++-- pytorch_lightning/strategies/hpu_parallel.py | 2 ++ pytorch_lightning/strategies/single_hpu.py | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index f947062763e4e..8ca5da921a177 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -27,6 +27,9 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HPU_AVAILABLE +if _HPU_AVAILABLE: + from habana_frameworks.torch.hpex import hmp + class HPUPrecisionPlugin(PrecisionPlugin): """Plugin that enables bfloats/floats on HPUs.""" @@ -39,8 +42,6 @@ def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) - if not hmp_params: return - from habana_frameworks.torch.hpex import hmp - hmp_opt_level = hmp_params["level"] # type: ignore hmp_bf16 = hmp_params["bf16_ops"] # type: ignore hmp_fp32 = hmp_params["fp32_ops"] # type: ignore diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 98db1576bea14..3310340af5fd2 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -68,6 +68,8 @@ def __init__( ) def setup_environment(self) -> None: + # This function is used to load Habana libraries required for PyTorch + # to register HPU as one of the available devices. 
load_habana_module() os.environ["ID"] = str(self.local_rank) diff --git a/pytorch_lightning/strategies/single_hpu.py b/pytorch_lightning/strategies/single_hpu.py index a5dac56dcf867..edafe63441906 100644 --- a/pytorch_lightning/strategies/single_hpu.py +++ b/pytorch_lightning/strategies/single_hpu.py @@ -43,7 +43,10 @@ def __init__( if not _HPU_AVAILABLE: raise MisconfigurationException("`SingleHPUStrategy` requires HPU devices to run") + # This function is used to load Habana libraries required for PyTorch + # to register HPU as one of the available devices. load_habana_module() + super().__init__( accelerator=accelerator, device=device, From a6a336df97bd5e4e124cae5112f8827c60a09b80 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 22 Mar 2022 21:18:48 +0530 Subject: [PATCH 107/167] Fix argparse --- pytorch_lightning/utilities/argparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/argparse.py b/pytorch_lightning/utilities/argparse.py index 57cb6b5315e88..8927ff0934373 100644 --- a/pytorch_lightning/utilities/argparse.py +++ b/pytorch_lightning/utilities/argparse.py @@ -250,7 +250,7 @@ def add_argparse_args( else: use_type = arg_types[0] - if arg in ("gpus", "tpu_cores", "hpus"): + if arg == "gpus" or arg == "tpu_cores": use_type = _gpus_allowed_type # hack for types in (int, float) From dca30ee3f51d7267d68bb043a86bbe64b6966085 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 22 Mar 2022 23:16:12 +0530 Subject: [PATCH 108/167] Remove unnecessary use of PL_TORCH_DISTRIBUTED_BACKEND env variable --- pytorch_lightning/strategies/hpu_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 3310340af5fd2..48029a3f80195 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -73,8 +73,6 @@ def setup_environment(self) -> None: load_habana_module() os.environ["ID"] = str(self.local_rank) - os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" - super().setup_environment() def determine_ddp_device_ids(self) -> None: From bb8984fad0c2f5a717ba00d716f401e5512db6d4 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Tue, 22 Mar 2022 23:27:38 +0530 Subject: [PATCH 109/167] Update pytorch_lightning/strategies/hpu_parallel.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/strategies/hpu_parallel.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 48029a3f80195..cddbdaba65901 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -40,10 +40,6 @@ class HPUParallelStrategy(DDPStrategy): """Strategy for distributed training on multiple HPU devices.""" - # The main process in each node spawns N-1 child processes via :func:`subprocess.Popen`, where N is the number of - # devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. 
- - # Multi-device per process is not supported with habana strategy_name = "hpu_parallel" From 4ab35dbb307ac3a59c026e7a55a9b92fe1d9b674 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Tue, 22 Mar 2022 23:28:00 +0530 Subject: [PATCH 110/167] Update pytorch_lightning/utilities/distributed.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/utilities/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 5b1735a44e178..10cf4ee6a73b1 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -137,7 +137,7 @@ def sync_ddp( is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) if is_hpu_backend: if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): - new_rank_zero_info("Long tensor unsupported, casting to float") + new_rank_zero_info("Long tensor unsupported on HPU, casting to float") result = result.float() # sync all processes before reduction From e65a3fbf991ee578d241971b6b3cbd128b3e88d5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Mar 2022 18:03:19 +0000 Subject: [PATCH 111/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/strategies/hpu_parallel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index cddbdaba65901..77562aac7b988 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -40,7 +40,6 @@ class HPUParallelStrategy(DDPStrategy): """Strategy for distributed training on multiple HPU devices.""" - strategy_name = "hpu_parallel" def __init__( From a51794288d4019d7e2ec820938f187d0149fcb02 Mon Sep 17 00:00:00 2001 From: Jerome Date: Wed, 23 Mar 2022 04:43:06 +0200 Subject: [PATCH 112/167] Address review Signed-off-by: Jerome --- pytorch_lightning/plugins/precision/hpu.py | 10 ---------- pytorch_lightning/strategies/hpu_parallel.py | 2 ++ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index 8ca5da921a177..0dd45915e5307 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -1,13 +1,3 @@ -# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# - # Copyright The PyTorch Lightning team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 77562aac7b988..b15455d04f0f5 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -68,6 +68,8 @@ def setup_environment(self) -> None: load_habana_module() os.environ["ID"] = str(self.local_rank) + # this env is used in overrides to check the backend initiated + os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" super().setup_environment() def determine_ddp_device_ids(self) -> None: From d89815d11d7452943a90f29ff66fcbe791b9cc04 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 23 Mar 2022 11:14:45 +0530 Subject: [PATCH 113/167] Address reviews --- docs/source/accelerators/hpu.rst | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index c9a170700a47b..12051874b149b 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -6,10 +6,10 @@ Habana Gaudi AI Processor (HPU) Habana® Gaudi® AI training processors have been architected from the ground up and optimized for deep learning training efficiency. Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. -You can use either `the Gaudi-based AWS EC2 DL1 instances `_ or `the Supermicro X12 Gaudi server `_. +You can either use `the Gaudi-based AWS EC2 DL1 instances `_ or `the Supermicro X12 Gaudi server `_. Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU). -With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. +With SynapseAI, the aim is to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. For more information, check out ``_ and ``_. @@ -18,17 +18,17 @@ For more information, check out ``_ and ` Date: Wed, 23 Mar 2022 08:23:08 +0200 Subject: [PATCH 114/167] Update document Signed-off-by: Jerome --- docs/source/accelerators/hpu.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 12051874b149b..420938303033d 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -28,7 +28,7 @@ Lightning supports training on a single HPU device or 8 HPU devices with the int HPU Accelerator --------------- -To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``Trainer(accelerator="hpu")`` parameter in the Trainer class. +To enable PyTorch Lightning to utilize the HPU Accelerator, simply provide ``Trainer(accelerator="hpu")`` parameter in the Trainer class. ---------------- @@ -73,7 +73,12 @@ The ``precision=16`` parameter in the Trainer class enables the Habana plugin fo You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. The default settings enable users to easily enable mixed precision training with minimal code. -In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists. 
+In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists using the ``plugins`` parameter of Trainer class. +HPU's precision plugin is realised using ``HPUPrecisionPlugin``. The ``hmp_params`` parameter with this plugin is used to override the default operator list. An example can be found in the subsequent section. + +.. code-block:: python + + trainer = Trainer(devices=1, accelerator="hpu", plugins=[HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params)]) For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `_. @@ -116,13 +121,8 @@ The below snippet shows an example model using MNIST with single Habana Gaudi de ... - num_hpus = 1 - - # enable HPU strategy for single device, with mixed precision using default HMP settings - hpu_strategy = SingleHPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16)) - # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpu_strategy) + trainer = pl.Trainer(accelerator="hpu", devices=1) # Train the model ⚡ trainer.fit(model, datamodule=dm) @@ -156,7 +156,7 @@ The below snippet shows an example model using MNIST with 8 Habana Gaudi devices ... # Initialize a trainer with HPU accelerator with 8 devices - trainer = pl.Trainer(accelerator="hpu", devices=8, plugins=[HPUPrecisionPlugin(precision=16)]) + trainer = pl.Trainer(accelerator="hpu", devices=8) # Train the model ⚡ trainer.fit(model, datamodule=dm) From 4f44ea9df24c2c62d4f6dba87954f6937694278a Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 23 Mar 2022 12:04:42 +0530 Subject: [PATCH 115/167] Improve Habana doc --- docs/source/accelerators/hpu.rst | 44 ++++++++------------------------ 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 12051874b149b..0ea1dd8f4df23 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -15,55 +15,44 @@ For more information, check out ``_ and ``__ with the integrations described in the following sections: -.. _hpu_accelerator: HPU Accelerator --------------- -To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``Trainer(accelerator="hpu")`` parameter in the Trainer class. +To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``accelerator="hpu"`` parameter to the Trainer class. +.. code-block:: python ----------------- + trainer = Trainer(accelerator="hpu") -.. _single_device_strategy: Training on Single HPU ---------------------- -The ``devices=1`` and ``accelerator="hpu"`` in the Trainer class enables the Habana backend for single Gaudi training. +The ``devices=1`` and ``accelerator="hpu"`` in the Trainer class enables the Habana accelerator for single Gaudi training. .. code-block:: python trainer = Trainer(devices=1, accelerator="hpu") ----------------- - -.. _parallel_device_strategy: - Distributed Training --------------------- +The ``devices=8`` and ``accelerator="hpu"`` parameters to the Trainer class enables the Habana accelerator for distributed training with 8 Gaudis. -The ``devices=8`` and ``accelerator="hpu"`` parameters in the Trainer class enables the Habana backend for distributed training with 8 Gaudis. 
- -The Habana parallel device strategy is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. +The :class:`~pytorch_lightning.strategies.HPUParallelStrategy` is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. +It is used when ``devices=8`` and ``accelerator="hpu"`` are provided. .. code-block:: python trainer = Trainer(devices=8, accelerator="hpu") ----------------- - -.. _mixed_precision_plugin: Mixed Precision Plugin ---------------------- @@ -78,19 +67,6 @@ In addition to the default settings in HMP, users also have the option of overri For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `_. ----------------- - -.. _pytorch_lightning_examples: - -Getting Started with Lightning on Gaudi ---------------------------------------- - -This section describes how to train models using PyTorch Lightning with Habana Gaudi. - -More Lightning HPU examples can be found in pl_examples (``_) - ----------------- - Enabling Lightning with Single Gaudi HPU ---------------------------------------- From 81202c6cea08c40edfb705609e48aaf44fca0850 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 23 Mar 2022 13:02:06 +0530 Subject: [PATCH 116/167] Improve Habana doc --- docs/source/accelerators/hpu.rst | 122 +++++++------------------------ 1 file changed, 28 insertions(+), 94 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 4a3dad6e2e6f6..191dcf2565883 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -3,26 +3,33 @@ Habana Gaudi AI Processor (HPU) =============================== -Habana® Gaudi® AI training processors have been architected from the ground up and optimized for deep learning training efficiency. -Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. +Lightning supports `Habana Gaudi AI Processor (HPU) `__, for accelerating Deep Learning training workloads. -You can either use `the Gaudi-based AWS EC2 DL1 instances `_ or `the Supermicro X12 Gaudi server `_. +HPU terminology +--------------- -Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU). -With SynapseAI, the aim is to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads. +Habana® Gaudi® AI training processors are built on a heterogeneous architecture with a cluster of fully programmable Tensor Processing Cores (TPC) along with its associated development tools and libraries, and a configurable Matrix Math engine. -For more information, check out ``_ and ``_. +The TPC core is a VLIW SIMD processor with instruction set and hardware tailored to serve training workloads efficiently. +The Gaudi memory architecture includes on-die SRAM and local memories in each TPC and, +Gaudi is the first DL training processor that has integrated RDMA over Converged Ethernet (RoCE v2) engines on-chip. ----------------- +On the software side, the PyTorch Habana bridge interfaces between the framework and SynapseAI software stack to enable the execution of deep learning models on the Habana Gaudi device. 
-Getting Started with Lightning on Gaudi ---------------------------------------- +Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. -Lightning supports `Habana Gaudi AI Processor (HPU) `__ with the integrations described in the following sections: +For more information, check out `Gaudi Architecture `__ and `Gaudi Developer Docs `__. +How to access HPUs +------------------ -HPU Accelerator ---------------- +To use HPUs, you must have access to a system with HPU devices. +You can either use `Gaudi-based AWS EC2 DL1 instances `__ or `Supermicro X12 Gaudi server `__ to get access to HPUs. + +Checkout the `Getting Started Guide with AWS and Habana `__. + +Training with HPUs +------------------ To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``accelerator="hpu"`` parameter to the Trainer class. @@ -30,29 +37,23 @@ To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``acc trainer = Trainer(accelerator="hpu") - -Training on Single HPU ----------------------- - -The ``devices=1`` and ``accelerator="hpu"`` in the Trainer class enables the Habana accelerator for single Gaudi training. +Passing ``devices=1`` and ``accelerator="hpu"`` to the Trainer class enables the Habana accelerator for single Gaudi training. .. code-block:: python trainer = Trainer(devices=1, accelerator="hpu") - -Distributed Training ---------------------- - The ``devices=8`` and ``accelerator="hpu"`` parameters to the Trainer class enables the Habana accelerator for distributed training with 8 Gaudis. - -The :class:`~pytorch_lightning.strategies.HPUParallelStrategy` is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. -It is used when ``devices=8`` and ``accelerator="hpu"`` are provided. +It uses :class:`~pytorch_lightning.strategies.HPUParallelStrategy` internally which is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. .. code-block:: python trainer = Trainer(devices=8, accelerator="hpu") +.. note:: + If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the :meth:`~pytorch_lightning.accelerators.HPUAccelerator.auto_device_count` + from :class:`~pytorch_lightning.accelerators.HPUAccelerator`. + Mixed Precision Plugin ---------------------- @@ -67,78 +68,10 @@ HPU's precision plugin is realised using ``HPUPrecisionPlugin``. The ``hmp_param .. code-block:: python - trainer = Trainer(devices=1, accelerator="hpu", plugins=[HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params)]) - -For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `_. - - -Enabling Lightning with Single Gaudi HPU ----------------------------------------- - -The below snippet shows an example model using MNIST with single Habana Gaudi device: - -.. code-block:: python - - import habana_frameworks.torch.core as htcore - - - class LitClassifier(pl.LightningModule): - def __init__(self): - super(LitClassifier, self).__init__() - - ... - - - # Init our model - model = LitClassifier() - - # Init DataLoader from MNIST Dataset - dm = MNISTDataModule(batch_size=batch_size) - - ... 
- - # Initialize a trainer with 1 HPU accelerator - trainer = pl.Trainer(accelerator="hpu", devices=1) - - # Train the model ⚡ - trainer.fit(model, datamodule=dm) - - ----------------- - -Enabling Lightning with 8 Gaudi HPUs (distributed) --------------------------------------------------- - -The below snippet shows an example model using MNIST with 8 Habana Gaudi devices: - -.. code-block:: python - - import habana_frameworks.torch.core as htcore - - - class LitClassifier(pl.LightningModule): - def __init__(self): - super(LitClassifier, self).__init__() - - ... - - - # Init our model - model = LitClassifier() - - # Init DataLoader from MNIST Dataset - dm = MNISTDataModule(batch_size=batch_size) + trainer = Trainer(devices=1, accelerator="hpu", precision="bf16") - ... +For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__. - # Initialize a trainer with HPU accelerator with 8 devices - trainer = pl.Trainer(accelerator="hpu", devices=8) - - # Train the model ⚡ - trainer.fit(model, datamodule=dm) - - ----------------- Enabling Mixed Precision Options -------------------------------- @@ -191,5 +124,6 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins Known limitations ----------------- +* Multiple optimizers are not supported. * Habana dataloader is not supported. * Device stats monitoring is not supported. From 503df4e0ae0a12b1971f0c13395c55b69e8d1148 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Wed, 23 Mar 2022 13:07:08 +0530 Subject: [PATCH 117/167] Update pytorch_lightning/trainer/connectors/accelerator_connector.py Co-authored-by: four4fish <88516121+four4fish@users.noreply.github.com> --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 86cf4d579b766..667fa06d5011a 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -559,7 +559,7 @@ def _choose_strategy(self) -> Union[Strategy, str]: return IPUStrategy.strategy_name if self._accelerator_flag == "hpu": if self._parallel_devices and len(self._parallel_devices) > 1: - return HPUParallelStrategy(parallel_devices=self.parallel_devices) # type: ignore + return HPUParallelStrategy.strategy_name else: return SingleHPUStrategy(device=torch.device("hpu")) if self._accelerator_flag == "tpu": From e6af41766d427e7f146d2b7165f6390c51576f28 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 23 Mar 2022 13:13:34 +0530 Subject: [PATCH 118/167] Update links --- docs/source/accelerators/hpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 191dcf2565883..92341ad8d1cfb 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -125,5 +125,5 @@ Known limitations ----------------- * Multiple optimizers are not supported. -* Habana dataloader is not supported. -* Device stats monitoring is not supported. +* `Habana dataloader `__ is not supported. +* :class:`~pytorch_lightning.callbacks.DeviceStatsMonitor` is not supported. 
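The accelerator-connector change in PATCH 117 boils down to a device-count branch. As a standalone restatement (the wrapper function name is invented for illustration, and constructing the single-device strategy assumes an HPU-enabled environment):

.. code-block:: python

    import torch

    from pytorch_lightning.strategies import HPUParallelStrategy, SingleHPUStrategy


    def choose_hpu_strategy(parallel_devices):
        # Several HPUs: return the registered parallel strategy name so the
        # connector instantiates HPUParallelStrategy with its own defaults.
        if parallel_devices and len(parallel_devices) > 1:
            return HPUParallelStrategy.strategy_name
        # A single HPU: return an explicit single-device strategy instance.
        return SingleHPUStrategy(device=torch.device("hpu"))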
From 67e710e2b823036bbc877b97e2031ecdd5c3d38e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 23 Mar 2022 14:16:19 +0530 Subject: [PATCH 119/167] Update precision sections --- docs/source/accelerators/hpu.rst | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 92341ad8d1cfb..3133871b4bc62 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -58,30 +58,35 @@ It uses :class:`~pytorch_lightning.strategies.HPUParallelStrategy` internally wh Mixed Precision Plugin ---------------------- -The ``precision=16`` parameter in the Trainer class enables the Habana plugin for mixed precision using the Habana Mixed Precision (HMP) package. - -You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. -The default settings enable users to easily enable mixed precision training with minimal code. +Lightning also allows mixed precision training with HPUs. +By default, HPU training will use 32-bit precision. To enable mixed precision, set the ``precision`` flag. In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists using the ``plugins`` parameter of Trainer class. HPU's precision plugin is realised using ``HPUPrecisionPlugin``. The ``hmp_params`` parameter with this plugin is used to override the default operator list. An example can be found in the subsequent section. .. code-block:: python - trainer = Trainer(devices=1, accelerator="hpu", precision="bf16") - -For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__. + trainer = Trainer(devices=1, accelerator="hpu", precision=16) Enabling Mixed Precision Options -------------------------------- +Internally, :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training. + +You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. +The default settings enable users to easily enable mixed precision training with minimal code. + +In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists by passing it +to the ``hmp_params`` parameter of :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin`. + The below snippet shows an example model using MNIST with single Habana Gaudi and making use of HMP by overriding the default parameters. This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults. .. code-block:: python - import habana_frameworks.torch.core as htcore + import pytorch_lightning as pl + from pytorch_lightning.plugins import HPUPrecisionPlugin class LitClassifier(pl.LightningModule): @@ -116,6 +121,7 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins # Train the model ⚡ trainer.fit(model, datamodule=dm) +For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__. 
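To make the operator-list override path described above concrete, here is a compact sketch of wiring custom lists through the plugin. The two text-file paths are placeholders for your own BF16/FP32 lists (one op name per line, as in the bundled ops_bf16_mnist.txt / ops_fp32_mnist.txt), and an HPU-enabled environment is assumed.

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import HPUPrecisionPlugin

    # Placeholder paths: point these at your own operator lists.
    hmp_params = {
        "level": "O1",
        "verbose": False,
        "bf16_ops": "./ops_bf16.txt",
        "fp32_ops": "./ops_fp32.txt",
    }

    trainer = Trainer(
        accelerator="hpu",
        devices=1,
        plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)],
    )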
---------------- From 1df801b1f3f68dda9aaf8df4bc73e4bf69b59764 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Wed, 23 Mar 2022 14:27:33 +0530 Subject: [PATCH 120/167] Update doc --- docs/source/accelerators/hpu.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 3133871b4bc62..c446dafa92532 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -104,8 +104,6 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins ... - num_hpus = 1 - # Optional Habana mixed precision params to be set hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] hmp_params = dict.fromkeys(hmp_keys) From 915211423177c20c0319b6c2aa65aedf04e739ab Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 00:24:53 +0530 Subject: [PATCH 121/167] Add defaults to hmp_params for Precision Plugin --- pytorch_lightning/plugins/precision/hpu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index 0dd45915e5307..f248a732285e9 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -32,8 +32,9 @@ def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) - if not hmp_params: return - hmp_opt_level = hmp_params["level"] # type: ignore - hmp_bf16 = hmp_params["bf16_ops"] # type: ignore - hmp_fp32 = hmp_params["fp32_ops"] # type: ignore - hmp_verbose = hmp_params["verbose"] # type: ignore + hmp_opt_level = hmp_params.get("level", "02") # type: ignore + hmp_bf16 = hmp_params.get("bf16_ops", None) # type: ignore + hmp_fp32 = hmp_params.get("fp32_ops", None) # type: ignore + hmp_verbose = hmp_params.get("verbose", False) # type: ignore + hmp.convert(opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose) From 9846b6a0c5a41ecd562c158d875d78a48279bef7 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:37:15 +0530 Subject: [PATCH 122/167] Update .azure-pipelines/run_hpu_tests.py Co-authored-by: Rohit Gupta --- .azure-pipelines/run_hpu_tests.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 6403c1bfcac7c..325f54341d820 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -56,12 +56,16 @@ def run_hpu_tests_parallel(timeout=TIMEOUT): We run the tests in sub process to utilize all the eight cards available in the DL1 instance Considering the max time taken to run the HPU tests as 60 seconds, we kill the process if the time taken exceeds. - Return of this function will be the list of exit status of the HPU tests that were run in the subprocess. - Here, the exit_status 0 means the test run is successful. exit_status 1 means the test run is failed. + Args: timeout: The threshold time to run the HPU tests in parallel. - Exception is logged if the threshold timeout gets expired. - TIMEOUT_EXIT_CODE will be returned as -9 in case of timeout, 0 in case of success and 4 in case of a failure. + An exception is logged if the threshold timeout gets expired. + TIMEOUT_EXIT_CODE will be returned as -9 in case of timeout, + 0 in case of success and 4 in case of failure. + + Return: + The list of exit status of the HPU tests that were run in the subprocess. 
+ Here, the exit_status 0 means the test run is successful. exit_status 1 means the test run is failed. """ exit_status = [] with open("stdout_log.txt", "w") as stdout_log, open("error_log.txt", "w") as error_log: From e86becf35e4498698e1b96346b3e5c7adf2f645f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Mar 2022 12:08:38 +0000 Subject: [PATCH 123/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure-pipelines/run_hpu_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 325f54341d820..63a983d83c77c 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -62,7 +62,7 @@ def run_hpu_tests_parallel(timeout=TIMEOUT): An exception is logged if the threshold timeout gets expired. TIMEOUT_EXIT_CODE will be returned as -9 in case of timeout, 0 in case of success and 4 in case of failure. - + Return: The list of exit status of the HPU tests that were run in the subprocess. Here, the exit_status 0 means the test run is successful. exit_status 1 means the test run is failed. From d165c442af649c364b750fefacf776e2ab62c419 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:44:13 +0530 Subject: [PATCH 124/167] Apply suggestions from code review Co-authored-by: Rohit Gupta --- .azure-pipelines/run_hpu_tests.py | 10 +++++++--- docs/source/accelerators/hpu.rst | 14 +++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 63a983d83c77c..93fbba726af34 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -90,9 +90,12 @@ def run_hpu_tests_parallel(timeout=TIMEOUT): def zip_cmd_exitcode(exit_status): """This function is called to zip the tests that were executed with the exit status of the test. - Return of this function will be list of hpu tests called and their exit status. + Args: exit_status: The returned exit_status after executing run_hpu_tests_parallel(). + + Return: + A list of hpu tests called and their exit status. """ status_list = [] hpu_tests_called = [] @@ -118,8 +121,9 @@ def print_subprocess_logs_and_return_status(exit_status): Args: exit_status: The returned exit_status after executing run_hpu_tests_parallel(). - Return of this function will be the return to main(). - Based on the exit status of the HPU tests, we return success or failure to the main method. + + Return: + Based on the exit status of the HPU tests, we return success or failure to the main method. """ if all(v == 0 for v in exit_status): print("All HPU tests passed") diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index c446dafa92532..941c5365d0b4a 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -5,18 +5,18 @@ Habana Gaudi AI Processor (HPU) Lightning supports `Habana Gaudi AI Processor (HPU) `__, for accelerating Deep Learning training workloads. -HPU terminology +HPU Terminology --------------- Habana® Gaudi® AI training processors are built on a heterogeneous architecture with a cluster of fully programmable Tensor Processing Cores (TPC) along with its associated development tools and libraries, and a configurable Matrix Math engine. 
-The TPC core is a VLIW SIMD processor with instruction set and hardware tailored to serve training workloads efficiently. +The TPC core is a VLIW SIMD processor with an instruction set and hardware tailored to serve training workloads efficiently. The Gaudi memory architecture includes on-die SRAM and local memories in each TPC and, Gaudi is the first DL training processor that has integrated RDMA over Converged Ethernet (RoCE v2) engines on-chip. On the software side, the PyTorch Habana bridge interfaces between the framework and SynapseAI software stack to enable the execution of deep learning models on the Habana Gaudi device. -Gaudi offers substantial price/performance advantage -- so you get to do more deep learning training while spending less. +Gaudi offers a substantial price/performance advantage -- so you get to do more deep learning training while spending less. For more information, check out `Gaudi Architecture `__ and `Gaudi Developer Docs `__. @@ -44,15 +44,15 @@ Passing ``devices=1`` and ``accelerator="hpu"`` to the Trainer class enables the trainer = Trainer(devices=1, accelerator="hpu") The ``devices=8`` and ``accelerator="hpu"`` parameters to the Trainer class enables the Habana accelerator for distributed training with 8 Gaudis. -It uses :class:`~pytorch_lightning.strategies.HPUParallelStrategy` internally which is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. +It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy` internally which is based on DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes. .. code-block:: python trainer = Trainer(devices=8, accelerator="hpu") .. note:: - If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the :meth:`~pytorch_lightning.accelerators.HPUAccelerator.auto_device_count` - from :class:`~pytorch_lightning.accelerators.HPUAccelerator`. + If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the :meth:`~pytorch_lightning.accelerators.hpu.HPUAccelerator.auto_device_count` + from :class:`~pytorch_lightning.accelerators.hpu.HPUAccelerator`. Mixed Precision Plugin @@ -62,7 +62,7 @@ Lightning also allows mixed precision training with HPUs. By default, HPU training will use 32-bit precision. To enable mixed precision, set the ``precision`` flag. In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists using the ``plugins`` parameter of Trainer class. -HPU's precision plugin is realised using ``HPUPrecisionPlugin``. The ``hmp_params`` parameter with this plugin is used to override the default operator list. An example can be found in the subsequent section. +HPU's precision plugin is realised using :class:`~pytorch_lightning.plugins.precision.hpu.HPUPrecisionPlugin`. The ``hmp_params`` parameter with this plugin is used to override the default operator list. An example can be found in the subsequent section. .. 
code-block:: python From c76b95fbb2d42b9ed7842371290780bf7a801c00 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:44:46 +0530 Subject: [PATCH 125/167] Update docs/source/accelerators/hpu.rst Co-authored-by: Rohit Gupta --- docs/source/accelerators/hpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 941c5365d0b4a..c30fcf809dfa4 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -80,7 +80,7 @@ The default settings enable users to easily enable mixed precision training with In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists by passing it to the ``hmp_params`` parameter of :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin`. -The below snippet shows an example model using MNIST with single Habana Gaudi and making use of HMP by overriding the default parameters. +The below snippet shows an example model using MNIST with a single Habana Gaudi device and making use of HMP by overriding the default parameters. This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults. .. code-block:: python From bafcb8de26fc27c7b2aab89a5e4d70b022009d76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Mar 2022 12:15:52 +0000 Subject: [PATCH 126/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure-pipelines/run_hpu_tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 93fbba726af34..9493350664fd1 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -90,7 +90,6 @@ def run_hpu_tests_parallel(timeout=TIMEOUT): def zip_cmd_exitcode(exit_status): """This function is called to zip the tests that were executed with the exit status of the test. - Args: exit_status: The returned exit_status after executing run_hpu_tests_parallel(). From 2d6c6ddb501bfee040e2513c195cecba41199edd Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:51:03 +0530 Subject: [PATCH 127/167] Apply suggestions from code review Co-authored-by: Rohit Gupta --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 2 +- tests/accelerators/test_hpu.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 667fa06d5011a..26ad0aa6009c6 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -792,7 +792,7 @@ def _lazy_init_strategy(self) -> None: ): raise ValueError( "The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy`," - f" found {self.strategy}." + f" found {self.strategy.__class__.__name__}." ) if isinstance(self.accelerator, HPUAccelerator) and not isinstance( @@ -800,7 +800,7 @@ def _lazy_init_strategy(self) -> None: ): raise ValueError( "The `HPUAccelerator` can only be used with a `SingleHPUStrategy` or `HPUParallelStrategy`," - f" found {self.strategy}." 
+ f" found {self.strategy.__class__.__name__}." ) """The following properties are here for backward-compatibility and will be deprecated and removed in favor diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ed9c1cb0f1444..2591e631f4b0b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -263,7 +263,7 @@ def __init__( deterministic: If ``True``, sets whether PyTorch operations must use deterministic algorithms. Default: ``False``. - devices: Will be mapped to either `gpus`, `tpu_cores`, `hpus`, `num_processes` or `ipus`, + devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`, based on the accelerator type. fast_dev_run: Runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es) diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index cc8c2c2277712..c4812449bfc93 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -151,7 +151,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da trainer.fit(model) trainer.test(model) trainer.validate(model) - trainer.predict(model, model.test_dataloader()) + trainer.predict(model) @RunIf(hpu=True) From 75728b6891d71ddcad8f292b15433d867f51ee89 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:53:46 +0530 Subject: [PATCH 128/167] Apply suggestions from code review Co-authored-by: Rohit Gupta --- pytorch_lightning/strategies/hpu_parallel.py | 28 +++++++++----------- tests/accelerators/test_hpu.py | 2 +- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index b15455d04f0f5..8cb0b0b32bf1f 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -91,17 +91,16 @@ def pre_configure_ddp(self): # type: ignore ) self._ddp_kwargs["find_unused_parameters"] = True - if self.root_device.type == "hpu": - self._static_graph = False - static_graph = self._ddp_kwargs.get("static_graph") - if static_graph: - # when _set_static_graph() is called find_unused_parameters does not have any significance. - # Resetting the value of find_unused_parameters to False which is the default value to DDP - self._ddp_kwargs["find_unused_parameters"] = False - self._static_graph = True - if static_graph is not None: - # DDP does not accept static_graph as a parameter, hence removing it from the list - del self._ddp_kwargs["static_graph"] + self._static_graph = False + static_graph = self._ddp_kwargs.get("static_graph") + if static_graph: + # when _set_static_graph() is called find_unused_parameters does not have any significance. 
+ # Resetting the value of find_unused_parameters to False which is the default value to DDP + self._ddp_kwargs["find_unused_parameters"] = False + self._static_graph = True + if static_graph is not None: + # DDP does not accept static_graph as a parameter, hence removing it from the list + del self._ddp_kwargs["static_graph"] def configure_ddp(self) -> None: # DDP does not accept static graph as param with torch < 1.11 @@ -124,12 +123,11 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore return obj[0] def teardown(self) -> None: - log.detail(f"{self.__class__.__name__}: tearing down `HPUParallel` Strategy") + log.detail(f"{self.__class__.__name__}: tearing down strategy.") super().teardown() - if self.root_device.type == "hpu": - log.detail(f"{self.__class__.__name__}: moving model to CPU") - self.lightning_module.cpu() # type: ignore + log.detail(f"{self.__class__.__name__}: moving model to CPU") + self.lightning_module.cpu() # type: ignore @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index c4812449bfc93..2120511e4be4e 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -56,7 +56,7 @@ def test_all_stages(tmpdir, hpus): fast_dev_run=True, accelerator="hpu", devices=hpus, - plugins=[HPUPrecisionPlugin(precision=16, hmp_params=None)], + precision=16, ) trainer.fit(model) trainer.validate(model) From 68c52817c9a5b7bead9a7fb0f29c4d68d3a3e673 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:56:26 +0530 Subject: [PATCH 129/167] Update docs/source/accelerators/hpu.rst Co-authored-by: Rohit Gupta --- docs/source/accelerators/hpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index c30fcf809dfa4..ee628810a515e 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -75,9 +75,9 @@ Enabling Mixed Precision Options Internally, :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training. You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. -The default settings enable users to easily enable mixed precision training with minimal code. +The default settings enable users to enable mixed precision training with minimal code easily. -In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists by passing it +In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their BF16 and FP32 operator lists by passing it to the ``hmp_params`` parameter of :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin`. The below snippet shows an example model using MNIST with a single Habana Gaudi device and making use of HMP by overriding the default parameters. 
From 600e1bd89b4a6f44bc2bcc309a25f97dfcc3516c Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 17:57:35 +0530 Subject: [PATCH 130/167] Address reviews --- docs/source/accelerators/hpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index ee628810a515e..e07577cdfaa24 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -130,4 +130,4 @@ Known limitations * Multiple optimizers are not supported. * `Habana dataloader `__ is not supported. -* :class:`~pytorch_lightning.callbacks.DeviceStatsMonitor` is not supported. +* :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is not supported. From b03d079591b116d74151ee8b28bffcc0359b5e06 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:58:34 +0530 Subject: [PATCH 131/167] Apply suggestions from code review Co-authored-by: Rohit Gupta --- pl_examples/hpu_examples/simple_mnist/mnist.py | 10 ++-------- pytorch_lightning/plugins/precision/hpu.py | 2 +- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 71a22f3ef41e3..0422ee75aba99 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -61,24 +61,18 @@ def validation_step(self, batch, batch_idx): x, y = batch probs = self(x) acc = self.accuracy(probs, y) - return acc + self.log("val_acc", acc) def test_step(self, batch, batch_idx): x, y = batch logits = self(x) acc = self.accuracy(logits, y) - return acc + self.log("test_acc", acc) def accuracy(self, logits, y): acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) return acc - def validation_epoch_end(self, outputs) -> None: - self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True) - - def test_epoch_end(self, outputs) -> None: - self.log("test_acc", torch.stack(outputs).mean()) - def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0.02) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index f248a732285e9..8bf6651f31a1c 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -26,7 +26,7 @@ class HPUPrecisionPlugin(PrecisionPlugin): def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: if not _HPU_AVAILABLE: - raise MisconfigurationException("HPU precision plugin requires HPU support.") + raise MisconfigurationException("HPU precision plugin requires HPU devices.") super().__init__() self.precision = precision if not hmp_params: From 6e4474e36853b44bd033696a0e8ec34b28dc1cdf Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 18:07:02 +0530 Subject: [PATCH 132/167] Update API references --- docs/source/api_references.rst | 5 +++++ docs/source/extensions/accelerator.rst | 4 +++- docs/source/extensions/plugins.rst | 13 +++++++------ docs/source/extensions/strategy.rst | 2 ++ 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/docs/source/api_references.rst b/docs/source/api_references.rst index e0c4eb173fc04..31fc822683ea3 100644 --- a/docs/source/api_references.rst +++ b/docs/source/api_references.rst @@ -16,6 +16,7 @@ Accelerator API Accelerator CPUAccelerator GPUAccelerator + HPUAccelerator IPUAccelerator TPUAccelerator @@ -59,9 +60,11 @@ 
Strategy API DataParallelStrategy DeepSpeedStrategy HorovodStrategy + HPUParallelStrategy IPUStrategy ParallelStrategy SingleDeviceStrategy + SingleHPUStrategy SingleTPUStrategy Strategy TPUSpawnStrategy @@ -198,6 +201,7 @@ Precision Plugins DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin NativeMixedPrecisionPlugin @@ -234,6 +238,7 @@ Checkpoint IO Plugins :template: classtemplate.rst CheckpointIO + HPUCheckpointIO TorchCheckpointIO XLACheckpointIO diff --git a/docs/source/extensions/accelerator.rst b/docs/source/extensions/accelerator.rst index 762f2b6e57a90..0d78371a0e7ed 100644 --- a/docs/source/extensions/accelerator.rst +++ b/docs/source/extensions/accelerator.rst @@ -15,6 +15,7 @@ Currently there are accelerators for: - GPU - TPU - IPU +- HPU Each Accelerator gets two plugins upon initialization: One to handle differences from the training routine and one to handle different precisions. @@ -58,5 +59,6 @@ Accelerator API Accelerator CPUAccelerator GPUAccelerator - TPUAccelerator + HPUAccelerator IPUAccelerator + TPUAccelerator diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index e6e25321bd739..c9b23c580d43a 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -61,17 +61,18 @@ Precision Plugins :nosignatures: :template: classtemplate.rst - PrecisionPlugin - MixedPrecisionPlugin - NativeMixedPrecisionPlugin - ShardedNativeMixedPrecisionPlugin ApexMixedPrecisionPlugin DeepSpeedPrecisionPlugin - TPUPrecisionPlugin - TPUBf16PrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + HPUPrecisionPlugin IPUPrecisionPlugin + MixedPrecisionPlugin + NativeMixedPrecisionPlugin + PrecisionPlugin + ShardedNativeMixedPrecisionPlugin + TPUBf16PrecisionPlugin + TPUPrecisionPlugin Cluster Environments diff --git a/docs/source/extensions/strategy.rst b/docs/source/extensions/strategy.rst index e85b719e8566c..7c5596c7362ea 100644 --- a/docs/source/extensions/strategy.rst +++ b/docs/source/extensions/strategy.rst @@ -108,9 +108,11 @@ Built-In Training Strategies DataParallelStrategy DeepSpeedStrategy HorovodStrategy + HPUParallelStrategy IPUStrategy ParallelStrategy SingleDeviceStrategy + SingleHPUStrategy SingleTPUStrategy Strategy TPUSpawnStrategy From efd9f6552cb72ddc166ac2710a7ccf4ce0feb51a Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 18:20:27 +0530 Subject: [PATCH 133/167] Address reviews regarding precision --- pytorch_lightning/plugins/io/hpu_plugin.py | 17 +++++++++++++++++ pytorch_lightning/plugins/precision/hpu.py | 10 ++++++++-- tests/plugins/precision/hpu/test_hpu.py | 6 ++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/io/hpu_plugin.py b/pytorch_lightning/plugins/io/hpu_plugin.py index 7ff2a4c1a63fa..c72d1d9fcd112 100644 --- a/pytorch_lightning/plugins/io/hpu_plugin.py +++ b/pytorch_lightning/plugins/io/hpu_plugin.py @@ -27,6 +27,23 @@ class HPUCheckpointIO(TorchCheckpointIO): """CheckpointIO to save checkpoints for HPU training strategies.""" def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. 
+ + Args: + checkpoint: dict containing model and trainer state + path: write-target path + storage_options: not used in ``XLACheckpointIO.save_checkpoint`` + + Raises: + TypeError: + If ``storage_options`` arg is passed in + """ + if storage_options is not None: + raise TypeError( + "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg" + f" is not supported for `{self.__class__.__name__}`. Please implement your custom `CheckpointIO`" + " to define how you'd like to use `storage_options`." + ) fs = get_filesystem(path) fs.makedirs(os.path.dirname(path), exist_ok=True) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index 8bf6651f31a1c..942ed3b30f9b6 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional, Sequence +from typing import Any, Optional, Sequence, Union from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -24,9 +24,15 @@ class HPUPrecisionPlugin(PrecisionPlugin): """Plugin that enables bfloats/floats on HPUs.""" - def __init__(self, precision: int, hmp_params: Optional[Sequence[Any]] = None) -> None: + def __init__(self, precision: Union[str, int], hmp_params: Optional[Sequence[Any]] = None) -> None: if not _HPU_AVAILABLE: raise MisconfigurationException("HPU precision plugin requires HPU devices.") + supported_precision_values = (16, 32, "bf16") + if precision not in supported_precision_values: + raise ValueError( + f"`Trainer(accelerator='hpu', precision={precision!r})` is not supported." + f" `precision` must be one of: {supported_precision_values}." + ) super().__init__() self.precision = precision if not hmp_params: diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index 74d2cd259f053..74a71c7cba7d7 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -79,3 +79,9 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: with pytest.raises(SystemExit): trainer.fit(model) + + +@RunIf(hpu=True) +def test_unsupported_precision_plugin(): + with pytest.raises(ValueError, match="`Trainer(accelerator='hpu', precision='mixed` is not supported."): + HPUPrecisionPlugin(precision="mixed") From 22827f03a91bf0fcf0e651a22838f6885cda17b6 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 19:53:17 +0530 Subject: [PATCH 134/167] Address reviews regarding docs and precision --- docs/source/accelerators/hpu.rst | 13 ------------- pytorch_lightning/plugins/precision/__init__.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index e07577cdfaa24..097af7cf38ce1 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -61,9 +61,6 @@ Mixed Precision Plugin Lightning also allows mixed precision training with HPUs. By default, HPU training will use 32-bit precision. To enable mixed precision, set the ``precision`` flag. 
-In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists using the ``plugins`` parameter of Trainer class. -HPU's precision plugin is realised using :class:`~pytorch_lightning.plugins.precision.hpu.HPUPrecisionPlugin`. The ``hmp_params`` parameter with this plugin is used to override the default operator list. An example can be found in the subsequent section. - .. code-block:: python trainer = Trainer(devices=1, accelerator="hpu", precision=16) @@ -88,22 +85,12 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins import pytorch_lightning as pl from pytorch_lightning.plugins import HPUPrecisionPlugin - - class LitClassifier(pl.LightningModule): - def __init__(self): - super(LitClassifier, self).__init__() - - ... - - # Init our model model = LitClassifier() # Init DataLoader from MNIST Dataset dm = MNISTDataModule(batch_size=batch_size) - ... - # Optional Habana mixed precision params to be set hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] hmp_params = dict.fromkeys(hmp_keys) diff --git a/pytorch_lightning/plugins/precision/__init__.py b/pytorch_lightning/plugins/precision/__init__.py index b407e47ca9337..4bc29c1be1864 100644 --- a/pytorch_lightning/plugins/precision/__init__.py +++ b/pytorch_lightning/plugins/precision/__init__.py @@ -1,9 +1,23 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.fully_sharded_native_amp import ( # noqa: F401 FullyShardedNativeMixedPrecisionPlugin, ) +from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin # noqa: F401 from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 From e82544c3295b2c08c91c003d8b57a3eb69a4f335 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:54:03 +0530 Subject: [PATCH 135/167] Update docs/source/accelerators/hpu.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- docs/source/accelerators/hpu.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 097af7cf38ce1..2a8f1b7635aaa 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -92,12 +92,12 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins dm = MNISTDataModule(batch_size=batch_size) # Optional Habana mixed precision params to be set - hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] - hmp_params = dict.fromkeys(hmp_keys) - hmp_params["level"] = "O1" - hmp_params["verbose"] = False - hmp_params["bf16_ops"] = "ops_bf16_mnist.txt" - hmp_params["fp32_ops"] = "ops_fp32_mnist.txt" + hmp_params = { + 'level': 'O1', + 'verbose': False, + 'bf16_ops': 'ops_bf16_mnist.txt', + 'fp32_ops': 'ops_fp32_mnist.txt' + } # Initialize a trainer with HPU accelerator for HPU strategy for single device, # with mixed precision using overidden HMP settings From 4500a7ed2a95387741cd7c75023c6110486ac6b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Mar 2022 14:25:46 +0000 Subject: [PATCH 136/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/accelerators/hpu.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 2a8f1b7635aaa..ef416c36682d0 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -92,12 +92,7 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins dm = MNISTDataModule(batch_size=batch_size) # Optional Habana mixed precision params to be set - hmp_params = { - 'level': 'O1', - 'verbose': False, - 'bf16_ops': 'ops_bf16_mnist.txt', - 'fp32_ops': 'ops_fp32_mnist.txt' - } + hmp_params = {"level": "O1", "verbose": False, "bf16_ops": "ops_bf16_mnist.txt", "fp32_ops": "ops_fp32_mnist.txt"} # Initialize a trainer with HPU accelerator for HPU strategy for single device, # with mixed precision using overidden HMP settings From 98ba21f78ef3827364438d4a332de872b2159a35 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 24 Mar 2022 20:09:00 +0530 Subject: [PATCH 137/167] Apply suggestions from code 
review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- docs/source/accelerators/hpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index ef416c36682d0..e8bdc62527edd 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -26,7 +26,7 @@ How to access HPUs To use HPUs, you must have access to a system with HPU devices. You can either use `Gaudi-based AWS EC2 DL1 instances `__ or `Supermicro X12 Gaudi server `__ to get access to HPUs. -Checkout the `Getting Started Guide with AWS and Habana `__. +Check out the `Getting Started Guide with AWS and Habana `__. Training with HPUs ------------------ @@ -71,7 +71,7 @@ Enabling Mixed Precision Options Internally, :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training. -You can execute the ops in FP32 or BF16 precision. The HMP package modifies the python operators to add the appropriate cast operations for the arguments before execution. +You can execute the ops in FP32 or BF16 precision. The HMP package modifies the Python operators to add the appropriate cast operations for the arguments before execution. The default settings enable users to enable mixed precision training with minimal code easily. In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their BF16 and FP32 operator lists by passing it From 3c1035974626c9f8a64c3074b770f18758910832 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 21:10:14 +0530 Subject: [PATCH 138/167] Address reviews & update tests --- .azure-pipelines/hpu-tests.yml | 4 ++++ docs/source/accelerators/hpu.rst | 3 +-- pytorch_lightning/strategies/hpu_parallel.py | 2 ++ tests/plugins/precision/hpu/test_hpu.py | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index 4c4b40e5f304f..600e9548f17d9 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -40,6 +40,10 @@ jobs: python ".azure-pipelines/run_hpu_tests.py" displayName: 'HPU Tests in parallel' + - script: | + python pl_examples/hpu_examples/simple_mnist/mnist.py + displayName: 'Testing: examples' + - task: PublishTestResults@2 inputs: testResultsFiles: 'hpu*_test-results.xml' diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 097af7cf38ce1..c30910547b025 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -51,8 +51,7 @@ It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy` trainer = Trainer(devices=8, accelerator="hpu") .. note:: - If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the :meth:`~pytorch_lightning.accelerators.hpu.HPUAccelerator.auto_device_count` - from :class:`~pytorch_lightning.accelerators.hpu.HPUAccelerator`. + If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and select 8 Gaudi devices for :class:`~pytorch_lightning.accelerators.hpu.HPUAccelerator`. 
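For example, assuming a machine with all eight Gaudi devices available, the two configurations below would be equivalent:

.. code-block:: python

    from pytorch_lightning import Trainer

    # explicitly request all eight Gaudi devices
    trainer = Trainer(accelerator="hpu", devices=8)

    # omitting the devices flag falls back to "auto", which also selects all eight devices
    trainer = Trainer(accelerator="hpu")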
Mixed Precision Plugin diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 8cb0b0b32bf1f..85fe698838ce6 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -128,6 +128,8 @@ def teardown(self) -> None: log.detail(f"{self.__class__.__name__}: moving model to CPU") self.lightning_module.cpu() # type: ignore + # Was set to local rank + os.environ.pop("ID", None) @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index 74a71c7cba7d7..f6e8d7706d00f 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -83,5 +83,5 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: @RunIf(hpu=True) def test_unsupported_precision_plugin(): - with pytest.raises(ValueError, match="`Trainer(accelerator='hpu', precision='mixed` is not supported."): + with pytest.raises(ValueError, match=r"accelerator='hpu', precision='mixed'\)` is not supported."): HPUPrecisionPlugin(precision="mixed") From e137f199c5f39a91caabde47df8013327d35f7af Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 21:18:51 +0530 Subject: [PATCH 139/167] Update testing pipeline & conftest --- .azure-pipelines/hpu-tests.yml | 4 ++-- tests/conftest.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index 600e9548f17d9..d98a25f7bc4d9 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -40,8 +40,8 @@ jobs: python ".azure-pipelines/run_hpu_tests.py" displayName: 'HPU Tests in parallel' - - script: | - python pl_examples/hpu_examples/simple_mnist/mnist.py + - bash: | + python "pl_examples/hpu_examples/simple_mnist/mnist.py" displayName: 'Testing: examples' - task: PublishTestResults@2 diff --git a/tests/conftest.py b/tests/conftest.py index f8c0e1d53e535..11e0d6d542209 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -81,7 +81,6 @@ def restore_env_variables(): "XRT_SHARD_ORDINAL", "XRT_SHARD_LOCAL_ORDINAL", "TF_CPP_MIN_LOG_LEVEL", - "ID", # used by HPU for acquiring the right gaudi device based on rank, } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" From a62cfa17cdb2dd15950ec592dd1acd0a0c1dbd4a Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 21:59:15 +0530 Subject: [PATCH 140/167] Fix ci --- .azure-pipelines/hpu-tests.yml | 6 +++--- tests/accelerators/test_hpu.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index d98a25f7bc4d9..bcfa567cf7835 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -40,9 +40,9 @@ jobs: python ".azure-pipelines/run_hpu_tests.py" displayName: 'HPU Tests in parallel' - - bash: | - python "pl_examples/hpu_examples/simple_mnist/mnist.py" - displayName: 'Testing: examples' + # - bash: | + # python "pl_examples/hpu_examples/simple_mnist/mnist.py" + # displayName: 'Testing: examples' - task: PublishTestResults@2 inputs: diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 2120511e4be4e..dc4f298b45e32 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -18,7 +18,6 @@ from pytorch_lightning import Callback, 
seed_everything, Trainer from pytorch_lightning.accelerators import HPUAccelerator -from pytorch_lightning.plugins import HPUPrecisionPlugin from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE From 1078a6994d2cb9a2f453d7d1d6a5446c01876856 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 22:17:34 +0530 Subject: [PATCH 141/167] Add device parsing logic for HPUs --- pytorch_lightning/accelerators/hpu.py | 6 ++--- pytorch_lightning/utilities/device_parser.py | 28 ++++++++++++++++++++ tests/accelerators/test_hpu.py | 25 +++++++---------- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 653925b696c16..12ac2f600a2cd 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -17,7 +17,7 @@ import torch from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.utilities import _HPU_AVAILABLE +from pytorch_lightning.utilities import _HPU_AVAILABLE, device_parser from pytorch_lightning.utilities.rank_zero import rank_zero_debug @@ -35,9 +35,9 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: return {} @staticmethod - def parse_devices(devices: int) -> int: + def parse_devices(devices: Union[int, str, List[int]]) -> int: """Accelerator device parsing logic.""" - return devices + return device_parser.parse_hpus(devices) @staticmethod def get_parallel_devices(devices: int) -> List[torch.device]: diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index d7b8a319ea4d2..d8a3b4c5023bf 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -243,3 +243,31 @@ def _parse_tpu_cores_str(tpu_cores: str) -> Union[int, List[int]]: if tpu_cores in ("1", "8"): return int(tpu_cores) return [int(x.strip()) for x in tpu_cores.split(",") if len(x) > 0] + + +def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: + """ + Parses the hpus given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer` for the `devices` flag. 
+ + Args: + devices: An int 1 or string '1' indicate that 1 Gaudi device should be used + An int 8 or string '8' indicate that all 8 Gaudi devices should be used + + Returns: + Either integer 1 or 8, or ``None`` if no devices were requested + + Raises: + MisconfigurationException: + If devices aren't of value 1 or 8, or either of type `int` or `str` + """ + if devices is not None and isinstance(devices, (int, str)): + raise MisconfigurationException("`devices` for `HPUAccelerator` must be int, string or None.") + + if isinstance(devices, str) and devices in ("1", "8"): + devices = int(devices) + + if devices not in (1, 8, None): + raise MisconfigurationException("`devices` can only be 1 or 8") + + return devices diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index dc4f298b45e32..18c4da5b1c643 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -158,12 +158,15 @@ def test_accelerator_hpu(): trainer = Trainer(accelerator="hpu", devices=1) assert isinstance(trainer.accelerator, HPUAccelerator) + assert trainer.num_devices == 1 trainer = Trainer(accelerator="hpu") assert isinstance(trainer.accelerator, HPUAccelerator) + assert trainer.num_devices == 8 trainer = Trainer(accelerator="auto", devices=8) assert isinstance(trainer.accelerator, HPUAccelerator) + assert trainer.num_devices == 8 @RunIf(hpu=True) @@ -192,13 +195,6 @@ def test_accelerator_auto_with_devices_hpu(): assert isinstance(trainer.strategy, HPUParallelStrategy) -@RunIf(hpu=True) -def test_set_devices_if_none_hpu(): - - trainer = Trainer(accelerator="hpu", devices=8) - assert trainer.num_devices == 8 - - @RunIf(hpu=True) def test_strategy_choice_hpu_plugin(): trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1) @@ -219,13 +215,6 @@ def test_strategy_choice_hpu_parallel_plugin(): assert isinstance(trainer.strategy, HPUParallelStrategy) -@RunIf(hpu=True) -def test_hpu_accelerator_type(): - - trainer = Trainer(accelerator="hpu", devices=1) - assert isinstance(trainer.accelerator, HPUAccelerator) - - @RunIf(hpu=True) def test_devices_auto_choice_hpu(): trainer = Trainer(accelerator="auto", devices="auto") @@ -233,7 +222,7 @@ def test_devices_auto_choice_hpu(): @RunIf(hpu=True) -@pytest.mark.parametrize("hpus", [1]) +@pytest.mark.parametrize("hpus", 1) def test_inference_only(tmpdir, hpus): model = BoringModel() @@ -245,3 +234,9 @@ def test_inference_only(tmpdir, hpus): def test_hpu_auto_device_count(): assert HPUAccelerator.auto_device_count() == 8 + + +@RunIf(hpu=True) +def test_hpu_unsupported_device_type(): + with pytest.raises(MisconfigurationException, match="`devices` for `HPUAccelerator` must be int, string or None."): + Trainer(accelerator="hpu", devices=[1]) From a9dfcf323439de8b830ce4ae58b226fa73668179 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Thu, 24 Mar 2022 22:24:58 +0530 Subject: [PATCH 142/167] Fix device parsing --- pytorch_lightning/utilities/device_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index d8a3b4c5023bf..7710e162779c2 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -261,7 +261,7 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: MisconfigurationException: If devices aren't of value 1 or 8, or either of type `int` or `str` """ - if devices is not None and isinstance(devices, 
(int, str)): + if devices is not None and not isinstance(devices, (int, str)): raise MisconfigurationException("`devices` for `HPUAccelerator` must be int, string or None.") if isinstance(devices, str) and devices in ("1", "8"): From 4665101f29cfc57650bb98e062f62160eec1aba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 24 Mar 2022 20:19:32 +0100 Subject: [PATCH 143/167] Use the CLI in the example --- docs/source/accelerators/hpu.rst | 25 ++++--- .../hpu_examples/simple_mnist/mnist.py | 68 +++++-------------- pytorch_lightning/plugins/precision/hpu.py | 33 +++++---- tests/conftest.py | 12 ---- tests/plugins/precision/hpu/test_hpu.py | 17 +++-- 5 files changed, 66 insertions(+), 89 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index a186b7731c66e..fa8c93165eecb 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -84,19 +84,26 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins import pytorch_lightning as pl from pytorch_lightning.plugins import HPUPrecisionPlugin + # Initialize a trainer with HPU accelerator for HPU strategy for single device, + # with mixed precision using overidden HMP settings + trainer = pl.Trainer( + accelerator="hpu", + devices=1, + # Optional Habana mixed precision params to be set + plugins=HPUPrecisionPlugin( + precision=16, + opt_level="O1", + verbose=False, + bf16_file_path="ops_bf16_mnist.txt", + fp32_file_path="ops_fp32_mnist.txt", + ), + ) + # Init our model model = LitClassifier() - - # Init DataLoader from MNIST Dataset + # Init the data dm = MNISTDataModule(batch_size=batch_size) - # Optional Habana mixed precision params to be set - hmp_params = {"level": "O1", "verbose": False, "bf16_ops": "ops_bf16_mnist.txt", "fp32_ops": "ops_fp32_mnist.txt"} - - # Initialize a trainer with HPU accelerator for HPU strategy for single device, - # with mixed precision using overidden HMP settings - trainer = pl.Trainer(accelerator="hpu", devices=1, plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)]) - # Train the model ⚡ trainer.fit(model, datamodule=dm) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 0422ee75aba99..5664ce06cf526 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -11,42 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import os - import torch +from jsonargparse import lazy_instance from torch.nn import functional as F import pytorch_lightning as pl from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule from pytorch_lightning.plugins import HPUPrecisionPlugin - - -def parse_args(): - import argparse - - parser = argparse.ArgumentParser(description="PyTorch Classification Training") - - parser.add_argument("-b", "--batch-size", default=32, type=int) - parser.add_argument("--epochs", default=1, type=int, metavar="N", help="number of total epochs to run") - parser.add_argument( - "--hpus", default=1, type=int, metavar="N", help="number of habana accelerator for training (default: 1)" - ) - parser.add_argument("--hmp", dest="is_hmp", action="store_true", help="enable habana mixed precision mode") - parser.add_argument("--hmp-bf16", default="", help="path to bf16 ops list in hmp O1 mode") - parser.add_argument("--hmp-fp32", default="", help="path to fp32 ops list in hmp O1 mode") - parser.add_argument("--hmp-opt-level", default="O1", help="choose optimization level for hmp") - parser.add_argument("--hmp-verbose", action="store_true", help="enable verbose mode for hmp") - - args = parser.parse_args() - - return args +from pytorch_lightning.utilities.cli import LightningCLI class LitClassifier(pl.LightningModule): def __init__(self): super().__init__() - self.l1 = torch.nn.Linear(28 * 28, 10) def forward(self, x): @@ -78,32 +55,19 @@ def configure_optimizers(self): if __name__ == "__main__": - args = parse_args() - - # Init our model - model = LitClassifier() - - # Init DataLoader from MNIST Dataset - dm = MNISTDataModule(batch_size=args.batch_size) - - # TBD: import these keys from hmp - hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] - hmp_params = dict.fromkeys(hmp_keys) - hmp_params["level"] = args.hmp_opt_level - hmp_params["verbose"] = args.hmp_verbose - hmp_params["bf16_ops"] = args.hmp_bf16 # "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt" - hmp_params["fp32_ops"] = args.hmp_fp32 # "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt" - - # Initialize a trainer - trainer = pl.Trainer( - default_root_dir=os.getcwd(), - accelerator="hpu", - devices=args.hpus, - plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)], - max_epochs=args.epochs, + cli = LightningCLI( + LitClassifier, + MNISTDataModule, + trainer_defaults={ + "accelerator": "hpu", + "devices": 1, + "max_epochs": 1, + "plugins": lazy_instance(HPUPrecisionPlugin, precision=16), + }, + run=False, ) - # Train the model ⚡ - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) - trainer.validate(model, datamodule=dm) + # Run the model ⚡ + cli.trainer.fit(cli.model, datamodule=cli.datamodule) + cli.trainer.validate(cli.model, datamodule=cli.datamodule) + cli.trainer.test(cli.model, datamodule=cli.datamodule) diff --git a/pytorch_lightning/plugins/precision/hpu.py b/pytorch_lightning/plugins/precision/hpu.py index 942ed3b30f9b6..3c02d82a2de10 100644 --- a/pytorch_lightning/plugins/precision/hpu.py +++ b/pytorch_lightning/plugins/precision/hpu.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional, Sequence, Union +from typing import Optional, Union from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -22,9 +22,24 @@ class HPUPrecisionPlugin(PrecisionPlugin): - """Plugin that enables bfloats/floats on HPUs.""" + """Plugin that enables bfloat/half support on HPUs. - def __init__(self, precision: Union[str, int], hmp_params: Optional[Sequence[Any]] = None) -> None: + Args: + precision: The precision to use. + opt_level: Choose optimization level for hmp. + bf16_file_path: Path to bf16 ops list in hmp O1 mode. + fp32_file_path: Path to fp32 ops list in hmp O1 mode. + verbose: Enable verbose mode for hmp. + """ + + def __init__( + self, + precision: Union[str, int], + opt_level: str = "O2", + bf16_file_path: Optional[str] = None, + fp32_file_path: Optional[str] = None, + verbose: bool = False, + ) -> None: if not _HPU_AVAILABLE: raise MisconfigurationException("HPU precision plugin requires HPU devices.") supported_precision_values = (16, 32, "bf16") @@ -35,12 +50,6 @@ def __init__(self, precision: Union[str, int], hmp_params: Optional[Sequence[Any ) super().__init__() self.precision = precision - if not hmp_params: - return - - hmp_opt_level = hmp_params.get("level", "02") # type: ignore - hmp_bf16 = hmp_params.get("bf16_ops", None) # type: ignore - hmp_fp32 = hmp_params.get("fp32_ops", None) # type: ignore - hmp_verbose = hmp_params.get("verbose", False) # type: ignore - - hmp.convert(opt_level=hmp_opt_level, bf16_file_path=hmp_bf16, fp32_file_path=hmp_fp32, isVerbose=hmp_verbose) + hmp.convert( + opt_level=opt_level, bf16_file_path=bf16_file_path, fp32_file_path=fp32_file_path, isVerbose=verbose + ) diff --git a/tests/conftest.py b/tests/conftest.py index 11e0d6d542209..73720f08ba2a5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -207,15 +207,3 @@ def pytest_addoption(parser): def hpus(request): hpus = request.config.getoption("--hpus") return hpus - - -@pytest.fixture -def hmp_params(request): - """Ensure precision plugin value is set correctly.""" - hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"] - hmp_params = dict.fromkeys(hmp_keys) - hmp_params["level"] = "O1" - hmp_params["verbose"] = False - hmp_params["bf16_ops"] = request.config.getoption("--hmp-bf16") - hmp_params["fp32_ops"] = request.config.getoption("--hmp-fp32") - return hmp_params diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index f6e8d7706d00f..d092cf1f243b6 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -23,10 +23,19 @@ from tests.helpers.runif import RunIf +@pytest.fixture +def hmp_params(request): + return { + "opt_level": "01", + "verbose": False, + "bf16_file_path": request.config.getoption("--hmp-bf16"), + "fp32_file_path": request.config.getoption("--hmp-fp32"), + } + + @RunIf(hpu=True) def test_precision_plugin(hmp_params): - - plugin = HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params) + plugin = HPUPrecisionPlugin(precision="bf16", **hmp_params) assert plugin.precision == "bf16" @@ -43,7 +52,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st fast_dev_run=True, accelerator="hpu", devices=1, - plugins=[HPUPrecisionPlugin(precision="bf16", hmp_params=hmp_params)], + plugins=[HPUPrecisionPlugin(precision="bf16", **hmp_params)], callbacks=TestCallback(), ) assert isinstance(trainer.strategy, 
SingleHPUStrategy) @@ -69,7 +78,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: fast_dev_run=True, accelerator="hpu", devices=1, - plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)], + plugins=[HPUPrecisionPlugin(precision=16, **hmp_params)], callbacks=TestCallback(), ) From 2ee4bbfc9a8cbf850b3738a7cbee3ec7dd45520b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 24 Mar 2022 20:22:35 +0100 Subject: [PATCH 144/167] Docs --- docs/source/accelerators/hpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index fa8c93165eecb..f6159728e2196 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -73,8 +73,8 @@ Internally, :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin` use You can execute the ops in FP32 or BF16 precision. The HMP package modifies the Python operators to add the appropriate cast operations for the arguments before execution. The default settings enable users to enable mixed precision training with minimal code easily. -In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their BF16 and FP32 operator lists by passing it -to the ``hmp_params`` parameter of :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin`. +In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their +BF16 and FP32 operator lists by passing them as parameter to :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin`. The below snippet shows an example model using MNIST with a single Habana Gaudi device and making use of HMP by overriding the default parameters. This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults. 
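A condensed sketch of such an override, assuming the operator-list files contain one op name per line (the file names below are placeholders):

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import HPUPrecisionPlugin

    # ops_bf16.txt and ops_fp32.txt list the ops to run in BF16 and FP32 respectively
    plugin = HPUPrecisionPlugin(
        precision=16,
        opt_level="O1",
        bf16_file_path="ops_bf16.txt",
        fp32_file_path="ops_fp32.txt",
        verbose=False,
    )
    trainer = Trainer(accelerator="hpu", devices=1, plugins=[plugin])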
From dc3eca72cd9ffb1c1adbe3b47237b99d0c375770 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Fri, 25 Mar 2022 01:01:25 +0530 Subject: [PATCH 145/167] Update docs/source/accelerators/hpu.rst Co-authored-by: ananthsub --- docs/source/accelerators/hpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index f6159728e2196..8595ebbf07a6c 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -90,13 +90,13 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins accelerator="hpu", devices=1, # Optional Habana mixed precision params to be set - plugins=HPUPrecisionPlugin( + plugins=[HPUPrecisionPlugin( precision=16, opt_level="O1", verbose=False, bf16_file_path="ops_bf16_mnist.txt", fp32_file_path="ops_fp32_mnist.txt", - ), + )], ) # Init our model From 695212578f057f38b6e9d94af9574585f94d2f3f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Mar 2022 19:33:20 +0000 Subject: [PATCH 146/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/accelerators/hpu.rst | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 8595ebbf07a6c..0dc59ad674aec 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -90,13 +90,15 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins accelerator="hpu", devices=1, # Optional Habana mixed precision params to be set - plugins=[HPUPrecisionPlugin( - precision=16, - opt_level="O1", - verbose=False, - bf16_file_path="ops_bf16_mnist.txt", - fp32_file_path="ops_fp32_mnist.txt", - )], + plugins=[ + HPUPrecisionPlugin( + precision=16, + opt_level="O1", + verbose=False, + bf16_file_path="ops_bf16_mnist.txt", + fp32_file_path="ops_fp32_mnist.txt", + ) + ], ) # Init our model From 91cced39298ac6f2a6ab93f57c35604b738fae9b Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 01:10:38 +0530 Subject: [PATCH 147/167] Update hmp_params --- tests/plugins/precision/hpu/test_hpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index d092cf1f243b6..2e9806be5305c 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -27,7 +27,7 @@ def hmp_params(request): return { "opt_level": "01", - "verbose": False, + "isVerbose": False, "bf16_file_path": request.config.getoption("--hmp-bf16"), "fp32_file_path": request.config.getoption("--hmp-fp32"), } From 0671d2ce4a884db08a2e6c3556abc11dc9afebdb Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 01:13:46 +0530 Subject: [PATCH 148/167] Support passing amp_level to HPUPrecision --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 0f1281a7374d7..7ac8a658202c3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -655,7 +655,7 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if 
isinstance(self.accelerator, IPUAccelerator): return IPUPrecisionPlugin(self._precision_flag) # type: ignore if isinstance(self.accelerator, HPUAccelerator): - return HPUPrecisionPlugin(self._precision_flag) # type: ignore + return HPUPrecisionPlugin(self._precision_flag, self._amp_level_flag) # type: ignore if isinstance(self.accelerator, TPUAccelerator): if self._precision_flag == 32: return TPUPrecisionPlugin() From 522106e11990eab15f14457159a64e18aa7aebd8 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 01:29:39 +0530 Subject: [PATCH 149/167] Update HPUAccelerator --- pytorch_lightning/accelerators/hpu.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index 12ac2f600a2cd..b5cbe3d7ce27b 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -18,16 +18,22 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities import _HPU_AVAILABLE, device_parser +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_debug class HPUAccelerator(Accelerator): """Accelerator for HPU devices.""" - @staticmethod - def name() -> str: - """Name of the Accelerator.""" - return "hpu" + def setup_environment(self, root_device: torch.device) -> None: + """ + Raises: + MisconfigurationException: + If the selected device is not HPU. + """ + super().setup_environment(root_device) + if root_device.type != "hpu": + raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.") def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """HPU device stats aren't supported yet.""" @@ -53,3 +59,11 @@ def auto_device_count() -> int: @staticmethod def is_available() -> bool: return _HPU_AVAILABLE + + @classmethod + def register_accelerators(cls, accelerator_registry: Dict) -> None: + accelerator_registry.register( + "hpu", + cls, + description=f"{cls.__class__.__name__}", + ) From c8b89ea622e19b06188d5f259384ef02d82d13e5 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 08:53:40 +0530 Subject: [PATCH 150/167] Update tests --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- tests/accelerators/test_hpu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7ac8a658202c3..0f1281a7374d7 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -655,7 +655,7 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if isinstance(self.accelerator, IPUAccelerator): return IPUPrecisionPlugin(self._precision_flag) # type: ignore if isinstance(self.accelerator, HPUAccelerator): - return HPUPrecisionPlugin(self._precision_flag, self._amp_level_flag) # type: ignore + return HPUPrecisionPlugin(self._precision_flag) # type: ignore if isinstance(self.accelerator, TPUAccelerator): if self._precision_flag == 32: return TPUPrecisionPlugin() diff --git a/tests/accelerators/test_hpu.py b/tests/accelerators/test_hpu.py index 18c4da5b1c643..64415eeb5a0c8 100644 --- a/tests/accelerators/test_hpu.py +++ b/tests/accelerators/test_hpu.py @@ -222,7 +222,7 @@ def test_devices_auto_choice_hpu(): @RunIf(hpu=True) 
-@pytest.mark.parametrize("hpus", 1) +@pytest.mark.parametrize("hpus", [1]) def test_inference_only(tmpdir, hpus): model = BoringModel() From 7d028b18677a54610ebacdefc13225eda09f9677 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 09:03:14 +0530 Subject: [PATCH 151/167] Fix precision tests --- tests/plugins/precision/hpu/test_hpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index 2e9806be5305c..d092cf1f243b6 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -27,7 +27,7 @@ def hmp_params(request): return { "opt_level": "01", - "isVerbose": False, + "verbose": False, "bf16_file_path": request.config.getoption("--hmp-bf16"), "fp32_file_path": request.config.getoption("--hmp-fp32"), } From 3c86aff1d45659e1a8d9d74ddca23f5495c0b2f9 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 09:24:43 +0530 Subject: [PATCH 152/167] Update device parsing logic --- pytorch_lightning/utilities/device_parser.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 7710e162779c2..5f886a4e36598 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -251,23 +251,16 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: :class:`~pytorch_lightning.trainer.Trainer` for the `devices` flag. Args: - devices: An int 1 or string '1' indicate that 1 Gaudi device should be used - An int 8 or string '8' indicate that all 8 Gaudi devices should be used + devices: An integer that indicates the number of Gaudi devices to be used Returns: - Either integer 1 or 8, or ``None`` if no devices were requested + Either an integer or ``None`` if no devices were requested Raises: MisconfigurationException: - If devices aren't of value 1 or 8, or either of type `int` or `str` + If devices aren't of type `int` or `str` """ if devices is not None and not isinstance(devices, (int, str)): raise MisconfigurationException("`devices` for `HPUAccelerator` must be int, string or None.") - if isinstance(devices, str) and devices in ("1", "8"): - devices = int(devices) - - if devices not in (1, 8, None): - raise MisconfigurationException("`devices` can only be 1 or 8") - - return devices + return int(devices) if isinstance(devices, str) else devices From 3c8e3210c20f0c53a4aa355604fcfdccfbaa5cdc Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 09:37:40 +0530 Subject: [PATCH 153/167] Fix tests & address reviews --- docs/source/accelerators/hpu.rst | 5 +++-- pytorch_lightning/overrides/torch_distributed.py | 7 +++---- tests/plugins/precision/hpu/test_hpu.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/accelerators/hpu.rst b/docs/source/accelerators/hpu.rst index 0dc59ad674aec..fd7bd310ffc43 100644 --- a/docs/source/accelerators/hpu.rst +++ b/docs/source/accelerators/hpu.rst @@ -68,13 +68,13 @@ By default, HPU training will use 32-bit precision. To enable mixed precision, s Enabling Mixed Precision Options -------------------------------- -Internally, :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training. 
+Internally, :class:`~pytorch_lightning.plugins.precision.hpu.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training. You can execute the ops in FP32 or BF16 precision. The HMP package modifies the Python operators to add the appropriate cast operations for the arguments before execution. The default settings enable users to enable mixed precision training with minimal code easily. In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their -BF16 and FP32 operator lists by passing them as parameter to :class:`~pytorch_lightning.plugins.precision.HPUPrecisionPlugin`. +BF16 and FP32 operator lists by passing them as parameter to :class:`~pytorch_lightning.plugins.precision.hpu.HPUPrecisionPlugin`. The below snippet shows an example model using MNIST with a single Habana Gaudi device and making use of HMP by overriding the default parameters. This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults. @@ -90,6 +90,7 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins accelerator="hpu", devices=1, # Optional Habana mixed precision params to be set + # Checkout `pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt` for the format plugins=[ HPUPrecisionPlugin( precision=16, diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index c2e0924abb9bf..261208b0ec41e 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -24,7 +24,7 @@ # the distributed backend and tensor type updates for habana backend is done here before broadcast -# https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L256 +# Taken from https://github.com/pytorch/pytorch/blob/3466c1b6901f06a563b8cbfa3c942fa50bda835b/torch/distributed/distributed_c10d.py#L267 # noqa: E501 def _rank_not_in_group(group: ProcessGroup): """Helper that checks if the current process's rank is not in a given group.""" if group is None: @@ -32,7 +32,7 @@ def _rank_not_in_group(group: ProcessGroup): return group == GroupMember.NON_GROUP_MEMBER -# Taken from https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L1518 +# Taken from https://github.com/pytorch/pytorch/blob/3466c1b6901f06a563b8cbfa3c942fa50bda835b/torch/distributed/distributed_c10d.py#L1551 # noqa: E501 def _object_to_tensor(obj): f = io.BytesIO() _pickler(f).dump(obj) @@ -45,13 +45,12 @@ def _object_to_tensor(obj): return byte_tensor, local_size -# Taken from https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L1530 +# Taken from https://github.com/pytorch/pytorch/blob/3466c1b6901f06a563b8cbfa3c942fa50bda835b/torch/distributed/distributed_c10d.py#L1563 # noqa: E501 def _tensor_to_object(tensor, tensor_size): buf = tensor.numpy().tobytes()[:tensor_size] return _unpickler(io.BytesIO(buf)).load() -# Taken from https://github.com/pytorch/pytorch/blob/master/torch/distributed/distributed_c10d.py#L1729 def _broadcast_object_list(object_list, src=0, group=None, device=None): """Broadcasts picklable objects in ``object_list`` to the whole group. Similar to :func:`broadcast`, but Python objects can be passed in. 
Note that all objects in ``object_list`` must be picklable in order to be diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index d092cf1f243b6..dcb2a99b8aa49 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -26,7 +26,7 @@ @pytest.fixture def hmp_params(request): return { - "opt_level": "01", + "opt_level": "O1", "verbose": False, "bf16_file_path": request.config.getoption("--hmp-bf16"), "fp32_file_path": request.config.getoption("--hmp-fp32"), From dcda0ac12a27003c2beb07a5b1686076e8a71e33 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 09:43:36 +0530 Subject: [PATCH 154/167] Update run_hpu_tests --- .azure-pipelines/run_hpu_tests.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.azure-pipelines/run_hpu_tests.py b/.azure-pipelines/run_hpu_tests.py index 9493350664fd1..590c5d9c42251 100644 --- a/.azure-pipelines/run_hpu_tests.py +++ b/.azure-pipelines/run_hpu_tests.py @@ -47,7 +47,7 @@ HPU1_PRECISION_TEST = HPU_TESTS_DICTIONARY["hpu1_precision_test"] PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST], [HPU1_PRECISION_TEST]] -TIMEOUT = 60 +TIMEOUT = 60 # seconds TIMEOUT_EXIT_CODE = -9 @@ -97,10 +97,7 @@ def zip_cmd_exitcode(exit_status): A list of hpu tests called and their exit status. """ status_list = [] - hpu_tests_called = [] - for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION: - hpu_tests_called.append(hpu_tests) - status_list = list(zip(list(itertools.chain(*hpu_tests_called)), exit_status)) + status_list = list(zip(list(itertools.chain(*PARALLEL_HPU_TESTS_EXECUTION)), exit_status)) return status_list From e254cd0baece42a16a8ac48f0a2ed51f9b7367c6 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 25 Mar 2022 07:31:46 +0300 Subject: [PATCH 155/167] Update CLI test Signed-off-by: Jerome --- pl_examples/hpu_examples/simple_mnist/mnist.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index 5664ce06cf526..b042efdd99379 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -65,6 +65,7 @@ def configure_optimizers(self): "plugins": lazy_instance(HPUPrecisionPlugin, precision=16), }, run=False, + save_config_overwrite=True, ) # Run the model ⚡ From c452bd29cc6e1bcc0c982b57572305811b0200ff Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 10:02:35 +0530 Subject: [PATCH 156/167] Fix typing --- pytorch_lightning/accelerators/hpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/hpu.py b/pytorch_lightning/accelerators/hpu.py index b5cbe3d7ce27b..76fdb02b307b8 100644 --- a/pytorch_lightning/accelerators/hpu.py +++ b/pytorch_lightning/accelerators/hpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import torch @@ -41,7 +41,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: return {} @staticmethod - def parse_devices(devices: Union[int, str, List[int]]) -> int: + def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]: """Accelerator device parsing logic.""" return device_parser.parse_hpus(devices) From dca6b0ff0ac2d79e4f226bd8e51523d1b49d2f6f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Mar 2022 05:33:40 +0000 Subject: [PATCH 157/167] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6beaf3cf6f054..23968c3f22caa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -214,4 +214,4 @@ def pytest_addoption(parser): @pytest.fixture def hpus(request): hpus = request.config.getoption("--hpus") - return hpus \ No newline at end of file + return hpus From 98e901de152e76a6e3fdea26fd8440d430dc0417 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 25 Mar 2022 08:38:46 +0300 Subject: [PATCH 158/167] Enable example test in pipeline Signed-off-by: Jerome --- .azure-pipelines/hpu-tests.yml | 6 +++--- requirements/test.txt | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index bcfa567cf7835..f2ec8891c5feb 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -40,9 +40,9 @@ jobs: python ".azure-pipelines/run_hpu_tests.py" displayName: 'HPU Tests in parallel' - # - bash: | - # python "pl_examples/hpu_examples/simple_mnist/mnist.py" - # displayName: 'Testing: examples' + - bash: | + python "pl_examples/hpu_examples/simple_mnist/mnist.py" + displayName: 'Testing: HPU examples' - task: PublishTestResults@2 inputs: diff --git a/requirements/test.txt b/requirements/test.txt index 25e0f0f55a1c7..51d9ecf71db44 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -8,6 +8,7 @@ flake8>=3.9.2 pre-commit>=1.0 pytest-forked sklearn +jsonargparse # needed in tests cloudpickle>=1.3 From 2860a4e99cf769c66942f2464c4f89d337e088f1 Mon Sep 17 00:00:00 2001 From: Jerome Date: Fri, 25 Mar 2022 08:44:33 +0300 Subject: [PATCH 159/167] export path of modules Signed-off-by: Jerome --- .azure-pipelines/hpu-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/hpu-tests.yml b/.azure-pipelines/hpu-tests.yml index f2ec8891c5feb..d846994175f40 100644 --- a/.azure-pipelines/hpu-tests.yml +++ b/.azure-pipelines/hpu-tests.yml @@ -41,6 +41,7 @@ jobs: displayName: 'HPU Tests in parallel' - bash: | + export PYTHONPATH="${PYTHONPATH}:$(pwd)" python "pl_examples/hpu_examples/simple_mnist/mnist.py" displayName: 'Testing: HPU examples' From a297593b764009c61014fe1c47856b178bb7f9dd Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 11:56:12 +0530 Subject: [PATCH 160/167] Fix test --- tests/accelerators/test_accelerator_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_registry.py b/tests/accelerators/test_accelerator_registry.py index b21cd95e33cbd..2e2f4c4e47020 100644 --- a/tests/accelerators/test_accelerator_registry.py +++ b/tests/accelerators/test_accelerator_registry.py @@ -63,4 +63,4 @@ def is_available(): def 
test_available_accelerators_in_registry(): - assert AcceleratorRegistry.available_accelerators() == ["cpu", "gpu", "ipu", "tpu"] + assert AcceleratorRegistry.available_accelerators() == ["cpu", "gpu", "hpu", "ipu", "tpu"] From 65f1fb99cf9b61622e1a82397f5d481c07984f89 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 12:02:28 +0530 Subject: [PATCH 161/167] Update torch distributed --- pytorch_lightning/overrides/torch_distributed.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 261208b0ec41e..e7410cda93f2b 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -8,9 +8,6 @@ import torch from torch._C._distributed_c10d import ProcessGroup -from pytorch_lightning.utilities import _HPU_AVAILABLE -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 - _pickler = pickle.Pickler _unpickler = pickle.Unpickler @@ -168,7 +165,5 @@ def _broadcast_noop(obj, *_, **__): return obj broadcast_object_list = _broadcast_noop -elif _TORCH_GREATER_EQUAL_1_8 and not _HPU_AVAILABLE: - from torch.distributed.distributed_c10d import broadcast_object_list else: broadcast_object_list = _broadcast_object_list From 23808872c1f2feca3cc1c45c98307df692ce081f Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 12:22:31 +0530 Subject: [PATCH 162/167] Update strategy --- pytorch_lightning/strategies/hpu_parallel.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 85fe698838ce6..3545d31cdd7b5 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -28,7 +28,6 @@ from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HPU_AVAILABLE, _TORCH_LESSER_EQUAL_1_10_2 -from pytorch_lightning.utilities.rank_zero import rank_zero_warn if _HPU_AVAILABLE: import habana_frameworks.torch.core.hccl # noqa: F401 @@ -81,15 +80,6 @@ def pre_configure_ddp(self): # type: ignore # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( - "find_unused_parameters", False - ): - # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization - rank_zero_warn( - "From PyTorch 1.7.0, Lightning `manual_optimization` needs to set `find_unused_parameters=True` to" - " properly work with DDP. Using `find_unused_parameters=True`." 
- ) - self._ddp_kwargs["find_unused_parameters"] = True self._static_graph = False static_graph = self._ddp_kwargs.get("static_graph") From 59ef6fdd1a96d2dfb350337734d290cdf8dea6b4 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 13:07:53 +0530 Subject: [PATCH 163/167] Update example --- pl_examples/hpu_examples/simple_mnist/mnist.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pl_examples/hpu_examples/simple_mnist/mnist.py b/pl_examples/hpu_examples/simple_mnist/mnist.py index b042efdd99379..a5d4b47d6b829 100644 --- a/pl_examples/hpu_examples/simple_mnist/mnist.py +++ b/pl_examples/hpu_examples/simple_mnist/mnist.py @@ -46,7 +46,8 @@ def test_step(self, batch, batch_idx): acc = self.accuracy(logits, y) self.log("test_acc", acc) - def accuracy(self, logits, y): + @staticmethod + def accuracy(logits, y): acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) return acc From c02c1ed5a5fbe26949d8d48ae1a8ab82a1684e33 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Fri, 25 Mar 2022 13:10:32 +0530 Subject: [PATCH 164/167] Apply suggestions from code review Co-authored-by: Jirka Borovec --- tests/plugins/precision/hpu/test_hpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/plugins/precision/hpu/test_hpu.py b/tests/plugins/precision/hpu/test_hpu.py index dcb2a99b8aa49..5701bf2dc2caa 100644 --- a/tests/plugins/precision/hpu/test_hpu.py +++ b/tests/plugins/precision/hpu/test_hpu.py @@ -40,7 +40,7 @@ def test_precision_plugin(hmp_params): @RunIf(hpu=True) -def test_mixed_precision(tmpdir, hmp_params): +def test_mixed_precision(tmpdir, hmp_params: dict): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert trainer.strategy.model.precision == "bf16" @@ -63,7 +63,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st @RunIf(hpu=True) -def test_pure_half_precision(tmpdir, hmp_params): +def test_pure_half_precision(tmpdir, hmp_params: dict): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert trainer.strategy.model.precision == 16 From beda30cdac43fb02a1efd4cb25848751eccfa9d6 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 13:15:53 +0530 Subject: [PATCH 165/167] Address reviews --- pytorch_lightning/overrides/torch_distributed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index e7410cda93f2b..57fd34c312228 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -54,13 +54,13 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): broadcasted. Args: - object_list (List[Any]): List of input objects to broadcast. + object_list: List of input objects to broadcast. Each object must be picklable. Only objects on the ``src`` rank will be broadcast, but each rank must provide lists of equal sizes. - src (int): Source rank from which to broadcast ``object_list``. - group: (ProcessGroup, optional): The process group to work on. If None, + src: Source rank from which to broadcast ``object_list``. + group: The process group to work on. If None, the default process group will be used. Default is ``None``. 
- device (``torch.device``, optional): If not None, the objects are + device: If not None, the objects are serialized and converted to tensors which are moved to the ``device`` before broadcasting. Default is ``None``. From c465a06b1990e705e35e31c9e0f623670212b05e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 14:13:05 +0530 Subject: [PATCH 166/167] Update backend env variable for strategy --- pytorch_lightning/overrides/torch_distributed.py | 3 +-- pytorch_lightning/strategies/hpu_parallel.py | 6 ++++-- tests/conftest.py | 1 - 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index 57fd34c312228..9c70a2867b429 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -104,8 +104,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): # broadcasted to this device. group_backend = get_backend(group) is_nccl_backend = group_backend == Backend.NCCL - dist_backend = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND") - is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) + is_hpu_backend = os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1" current_device = None if device is not None: if is_nccl_backend and device.type != "cuda": diff --git a/pytorch_lightning/strategies/hpu_parallel.py b/pytorch_lightning/strategies/hpu_parallel.py index 3545d31cdd7b5..562a841b89510 100644 --- a/pytorch_lightning/strategies/hpu_parallel.py +++ b/pytorch_lightning/strategies/hpu_parallel.py @@ -67,8 +67,9 @@ def setup_environment(self) -> None: load_habana_module() os.environ["ID"] = str(self.local_rank) - # this env is used in overrides to check the backend initiated - os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "hccl" + if self._process_group_backend == "hccl": + # this env is used in overrides to check the backend initiated + os.environ["HCCL_DISTRIBUTED_BACKEND"] = str(1) super().setup_environment() def determine_ddp_device_ids(self) -> None: @@ -120,6 +121,7 @@ def teardown(self) -> None: self.lightning_module.cpu() # type: ignore # Was set to local rank os.environ.pop("ID", None) + os.environ.pop("HCCL_DISTRIBUTED_BACKEND", None) @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: diff --git a/tests/conftest.py b/tests/conftest.py index 23968c3f22caa..b24f1dad8c61b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -63,7 +63,6 @@ def restore_env_variables(): "MASTER_PORT", "PL_GLOBAL_SEED", "PL_SEED_WORKERS", - "PL_TORCH_DISTRIBUTED_BACKEND", "WANDB_MODE", "WANDB_REQUIRE_SERVICE", "WANDB_SERVICE", From 60f2da4aef665b08339b080f3242b1a728f8234e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 25 Mar 2022 14:21:34 +0530 Subject: [PATCH 167/167] Update backend env variable for strategy --- pytorch_lightning/utilities/distributed.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 10cf4ee6a73b1..b94351b22a335 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -18,7 +18,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -from torch.distributed import get_backend from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl @@ -132,9 +131,7 @@ def sync_ddp( # WA for HPU. 
HPU doesn't support Long types, forcefully set it to float if _HPU_AVAILABLE: - group_backend = get_backend(group) - dist_backend = os.environ.get("PL_TORCH_DISTRIBUTED_BACKEND") - is_hpu_backend = group_backend == torch.distributed.Backend(str(dist_backend)) + is_hpu_backend = os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1" if is_hpu_backend: if (result.type() == "torch.LongTensor") or (result.type() == "torch.hpu.LongTensor"): new_rank_zero_info("Long tensor unsupported on HPU, casting to float")
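
Editor's note on the pattern the last two patches converge on: `HPUParallelStrategy.setup_environment()` exports `HCCL_DISTRIBUTED_BACKEND=1` when the process-group backend is `hccl`, and the override/utility code reads that flag back instead of calling `torch.distributed.get_backend()`. The sketch below is a minimal, standalone illustration of that flag plus the Long-to-float workaround from `sync_ddp`. It is not part of the patched modules: the helper names `_using_hccl_backend` and `maybe_cast_for_hpu` are invented for the example, and the `dtype` comparison is a simplification of the `result.type()` string check used in the patch.

    import os

    import torch


    def _using_hccl_backend() -> bool:
        # The HPU strategy sets HCCL_DISTRIBUTED_BACKEND="1" in setup_environment()
        # when the process-group backend is "hccl"; consumers only read the flag.
        return os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1"


    def maybe_cast_for_hpu(result: torch.Tensor) -> torch.Tensor:
        # HPU collectives do not support Long tensors, so cast them to float
        # before reducing/broadcasting (mirrors the workaround in sync_ddp).
        if _using_hccl_backend() and result.dtype == torch.long:
            return result.float()
        return result


    if __name__ == "__main__":
        # Simulate what HPUParallelStrategy.setup_environment() exports.
        os.environ["HCCL_DISTRIBUTED_BACKEND"] = "1"
        t = torch.ones(4, dtype=torch.long)
        print(maybe_cast_for_hpu(t).dtype)  # torch.float32

In the patches the flag is popped again in `teardown()`, so it does not leak into later runs and the check stays purely environment-driven.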