Add support for Habana accelerator (HPU) #11808

Merged: 183 commits, Mar 25, 2022

Changes from 17 commits

Commits (183)
f7175c4
Add hpu accelerator support
jerome-habana Feb 8, 2022
7fb871b
Update strategy for optimizer usage
jerome-habana Feb 8, 2022
a1a1ca9
Add checkpointing support
jerome-habana Feb 8, 2022
9a6da43
Fix distributed support with hpu
jerome-habana Feb 8, 2022
3e76db9
Enable usage of static_graph with hpu
jerome-habana Feb 8, 2022
b43d226
Add HPU tests
jerome-habana Feb 8, 2022
992093d
Add basic hpu_stats monitor
jerome-habana Feb 8, 2022
943be49
Code cleanup
jerome-habana Feb 8, 2022
3015972
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 8, 2022
257d644
Update tests
jerome-habana Feb 9, 2022
f1867cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2022
c61d68b
Add configurable params for tests
jerome-habana Feb 10, 2022
f74a898
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2022
963cd1e
Enable inference test
jerome-habana Feb 11, 2022
53a5416
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 11, 2022
2de04e8
Resolve issue with hmp params type and load hpu
jerome-habana Feb 15, 2022
0197b9c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 15, 2022
b412638
Move hmp_params to HPUPrecision plugin
jerome-habana Feb 17, 2022
e549434
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 17, 2022
1cc0a37
Update habana distributed with ddp subclass
jerome-habana Feb 18, 2022
aeda681
Add hpu backend, datatype checks
jerome-habana Feb 18, 2022
fe32865
Merge branch 'master' into hpu_accelerator
jerome-habana Feb 23, 2022
f9b0c5f
Merge branch 'master' into hpu_accelerator
jerome-habana Feb 23, 2022
123112d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 23, 2022
ede68eb
Remove unused param for 'on_train_batch_end' in hpu test
jerome-habana Feb 23, 2022
262343a
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 3, 2022
3a029c1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2022
0a959f0
Address review comments
jerome-habana Mar 3, 2022
1434299
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2022
400ea77
Address review comments
jerome-habana Mar 4, 2022
4146bab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2022
f5cb696
remove deprecated logging
jerome-habana Mar 4, 2022
d3cd6b1
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 7, 2022
448ed77
Fix imports for failing CI
kaushikb11 Mar 9, 2022
10b190f
fix str to_device section in converting.rst (#12243)
awaelchli Mar 7, 2022
c17c62b
Disable tuner with distributed strategies (#12179)
rohitgr7 Mar 7, 2022
28bc4f0
Add callout items to the Docs landing page (#12196)
kaushikb11 Mar 7, 2022
97e1d28
Integrate global step with progress tracking (#11805)
carmocca Mar 7, 2022
5aecf65
Deprecate `LightningDataModule.on_save/load_checkpoint` (#11893)
jjenniferdai Mar 8, 2022
0949599
add Azure HPU agent (#12258)
Borda Mar 8, 2022
4bd5034
Add `LightningCLI(auto_registry)` (#12108)
carmocca Mar 8, 2022
bd76456
Drop PyTorch 1.7 testing from the CI (#12191)
krshrimali Mar 8, 2022
80b8d01
Have the outputs match the loops format (#12182)
carmocca Mar 8, 2022
c168db5
Address review comments
jerome-habana Mar 9, 2022
831a672
Review comment: Make use of Boring model
jerome-habana Mar 9, 2022
328329e
Update stats example trainer params
jerome-habana Mar 9, 2022
c8e331e
Correct flake8 errors
jerome-habana Mar 9, 2022
9a71bdc
Remove docstring examples
jerome-habana Mar 9, 2022
8efed0b
Update hpu-tests.yml
raoakarsha Mar 3, 2022
90409a2
prune
Borda Mar 7, 2022
5bbc6dc
Update hpu-tests.yml
Borda Mar 8, 2022
85f535b
Apply suggestions from code review
Borda Mar 9, 2022
75227d9
hwinfo
Borda Mar 9, 2022
711bbf3
Override mypy warnings
jerome-habana Mar 10, 2022
bc174f6
Update test and requirements file
jerome-habana Mar 10, 2022
b28c0ce
Remove hpu stats monitor and deprecated APIs
jerome-habana Mar 10, 2022
3c08bf5
Update non-hpu tests
jerome-habana Mar 10, 2022
f857721
Add hpu-tests.yml and run_hpu_tests.py to support HPU Testing
Borda Mar 10, 2022
a2b2cb1
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 10, 2022
7cb34bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
f6baf69
Add exception for non-hpu tests
jerome-habana Mar 10, 2022
21fc9a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
3665ffc
Throw exception when accelerator is not present
jerome-habana Mar 10, 2022
e0b4611
Resolve mypy and error message
jerome-habana Mar 10, 2022
545ab6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
96ed1cd
Disable hpu pl examples on CPU
jerome-habana Mar 10, 2022
c44b017
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
410875c
Address review comments
jerome-habana Mar 14, 2022
8efe56f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 14, 2022
073b170
Add documentation for habana gaudi accelerator (HPU)
jerome-habana Mar 15, 2022
7bdcaf6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 15, 2022
da1037a
Update test code syntax
jerome-habana Mar 15, 2022
5e7af01
Mitigate duplicate label error
jerome-habana Mar 15, 2022
70d6993
Add hpu to toctree
jerome-habana Mar 16, 2022
5061d71
Update pytorch_lightning/plugins/precision/hpu_precision.py
kaushikb11 Mar 16, 2022
f6c36ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2022
798f137
Update _broadcast_object_list
kaushikb11 Mar 16, 2022
5e098cb
Update broadcast for HPUParallelStrategy
kaushikb11 Mar 16, 2022
093056c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2022
0563310
Update reference links
kaushikb11 Mar 17, 2022
65886ba
Update Strategies
kaushikb11 Mar 17, 2022
d837ef3
Address reviews
kaushikb11 Mar 17, 2022
37e0000
Address reviews
kaushikb11 Mar 17, 2022
07c60b4
Address reviews
jerome-habana Mar 18, 2022
394d9e2
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 18, 2022
12dc3ca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 18, 2022
3064544
Remove too many sections from sidebar
akihironitta Mar 19, 2022
7c7721d
Fix invalid formatting and links
akihironitta Mar 19, 2022
cc71c7a
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 21, 2022
e6eaa9f
Address reviews for HPUCheckpointIO
kaushikb11 Mar 21, 2022
33beabd
Address reviews for HPU + AcceleratorConnector
kaushikb11 Mar 21, 2022
759804e
Fix tests
kaushikb11 Mar 21, 2022
bda7e36
Address reviews
kaushikb11 Mar 21, 2022
bdc19be
Remove setting hpu accelerator by just strategy
kaushikb11 Mar 21, 2022
2d34cc5
Remove unnecessary properties for HPU
kaushikb11 Mar 21, 2022
c32601a
Fix HPU tests
kaushikb11 Mar 21, 2022
f43750e
Move tests
kaushikb11 Mar 21, 2022
4e09286
Improve docs
kaushikb11 Mar 21, 2022
ab2f595
Improve tests
kaushikb11 Mar 21, 2022
549d784
Update Changelog
kaushikb11 Mar 21, 2022
ec929df
Fix test for the right device type
kaushikb11 Mar 21, 2022
c55a82f
Fix tests
kaushikb11 Mar 21, 2022
05dcc1c
Fix tests
kaushikb11 Mar 21, 2022
150e667
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 21, 2022
f5a333b
Address reviews
kaushikb11 Mar 21, 2022
57b9c24
Update plugins
kaushikb11 Mar 21, 2022
3dd763c
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 22, 2022
773a7a0
Update HPU mnist example
kaushikb11 Mar 22, 2022
9378c87
Update strategy
kaushikb11 Mar 22, 2022
9aefcd2
Address reviews
jerome-habana Mar 22, 2022
1f0b187
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
1d30ef9
Add precision tests to azure pipeline
jerome-habana Mar 22, 2022
fd9488f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
a4f79fb
Add comments
kaushikb11 Mar 22, 2022
a6a336d
Fix argparse
kaushikb11 Mar 22, 2022
dca30ee
Remove unnecessary use of PL_TORCH_DISTRIBUTED_BACKEND env variable
kaushikb11 Mar 22, 2022
bb8984f
Update pytorch_lightning/strategies/hpu_parallel.py
kaushikb11 Mar 22, 2022
4ab35db
Update pytorch_lightning/utilities/distributed.py
kaushikb11 Mar 22, 2022
e65a3fb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
a517942
Address review
jerome-habana Mar 23, 2022
d89815d
Address reviews
kaushikb11 Mar 23, 2022
0238b45
Update document
jerome-habana Mar 23, 2022
4f44ea9
Improve Habana doc
kaushikb11 Mar 23, 2022
f332e1c
Improve Habana doc
kaushikb11 Mar 23, 2022
81202c6
Improve Habana doc
kaushikb11 Mar 23, 2022
503df4e
Update pytorch_lightning/trainer/connectors/accelerator_connector.py
kaushikb11 Mar 23, 2022
e6af417
Update links
kaushikb11 Mar 23, 2022
2bd4a66
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 23, 2022
67e710e
Update precision sections
kaushikb11 Mar 23, 2022
1df801b
Update doc
kaushikb11 Mar 23, 2022
9152114
Add defaults to hmp_params for Precision Plugin
kaushikb11 Mar 23, 2022
9846b6a
Update .azure-pipelines/run_hpu_tests.py
kaushikb11 Mar 24, 2022
e86becf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
d165c44
Apply suggestions from code review
kaushikb11 Mar 24, 2022
c76b95f
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
bafcb8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
2d6c6dd
Apply suggestions from code review
kaushikb11 Mar 24, 2022
75728b6
Apply suggestions from code review
kaushikb11 Mar 24, 2022
68c5281
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
600e1bd
Address reviews
kaushikb11 Mar 24, 2022
b03d079
Apply suggestions from code review
kaushikb11 Mar 24, 2022
6e4474e
Update API references
kaushikb11 Mar 24, 2022
efd9f65
Address reviews regarding precision
kaushikb11 Mar 24, 2022
22827f0
Address reviews regarding docs and precision
kaushikb11 Mar 24, 2022
e82544c
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
4500a7e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
98ba21f
Apply suggestions from code review
kaushikb11 Mar 24, 2022
3c10359
Address reviews & update tests
kaushikb11 Mar 24, 2022
6c0dd88
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 24, 2022
e137f19
Update testing pipeline & conftest
kaushikb11 Mar 24, 2022
a62cfa1
Fix ci
kaushikb11 Mar 24, 2022
1078a69
Add device parsing logic for HPUs
kaushikb11 Mar 24, 2022
a9dfcf3
Fix device parsing
kaushikb11 Mar 24, 2022
4665101
Use the CLI in the example
Mar 24, 2022
2ee4bbf
Docs
Mar 24, 2022
e9ae312
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 24, 2022
dc3eca7
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
6952125
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
91cced3
Update hmp_params
kaushikb11 Mar 24, 2022
0671d2c
Support passing amp_level to HPUPrecision
kaushikb11 Mar 24, 2022
522106e
Update HPUAccelerator
kaushikb11 Mar 24, 2022
c8b89ea
Update tests
kaushikb11 Mar 25, 2022
7d028b1
Fix precision tests
kaushikb11 Mar 25, 2022
3c86aff
Update device parsing logic
kaushikb11 Mar 25, 2022
3c8e321
Fix tests & address reviews
kaushikb11 Mar 25, 2022
dcda0ac
Update run_hpu_tests
kaushikb11 Mar 25, 2022
e254cd0
Update CLI test
jerome-habana Mar 25, 2022
c452bd2
Fix typing
kaushikb11 Mar 25, 2022
4c51b33
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
b66c867
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 25, 2022
dca6b0f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2022
98e901d
Enable example test in pipeline
jerome-habana Mar 25, 2022
2860a4e
export path of modules
jerome-habana Mar 25, 2022
a297593
Fix test
kaushikb11 Mar 25, 2022
9c1fff7
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
65f1fb9
Update torch distributed
kaushikb11 Mar 25, 2022
2380887
Update strategy
kaushikb11 Mar 25, 2022
59ef6fd
Update example
kaushikb11 Mar 25, 2022
c02c1ed
Apply suggestions from code review
kaushikb11 Mar 25, 2022
beda30c
Address reviews
kaushikb11 Mar 25, 2022
eb99e52
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
c465a06
Update backend env variable for strategy
kaushikb11 Mar 25, 2022
60f2da4
Update backend env variable for strategy
kaushikb11 Mar 25, 2022
62 changes: 62 additions & 0 deletions pl_examples/hpu_examples/simple_mnist/mnist.py
@@ -0,0 +1,62 @@
import os
import sys

import habana_frameworks.torch.core as htcore
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST

import pytorch_lightning as pl
from pytorch_lightning.callbacks import HPUStatsMonitor


class MNISTModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)


# Init our model
mnist_model = MNISTModel()

# Init DataLoader from MNIST Dataset
train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(train_ds, batch_size=32)

# TBD: import these keys from hmp
hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"]
hmp_params = dict.fromkeys(hmp_keys)
hmp_params["level"] = "O1"
hmp_params["verbose"] = False
hmp_params["bf16_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt"
hmp_params["fp32_ops"] = "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt"

hpu_stats = HPUStatsMonitor(log_save_dir="habana_ptl_log", exp_name="mnist")

# Initialize a trainer
trainer = pl.Trainer(
    devices=1,
    callbacks=[hpu_stats],
    max_epochs=1,
    precision=32,
    hmp_params=hmp_params,
    default_root_dir="/tmp/",
    accelerator="hpu",
)

# Train the model ⚡
trainer.fit(mnist_model, train_loader)
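
For scale-out, later commits in this PR add an HPUParallelStrategy; below is a minimal sketch of running the same example on a full Gaudi server, assuming the "hpu_parallel" registration name and the 8-card count from those commits (both are assumptions, not part of this 17-commit diff):

# Sketch only: "hpu_parallel" and devices=8 are taken from the HPUParallelStrategy
# commits listed above, not from this file.
trainer = pl.Trainer(
    accelerator="hpu",
    devices=8,                 # one process per Gaudi card
    strategy="hpu_parallel",   # DDP-style strategy added later in this PR
    max_epochs=1,
    default_root_dir="/tmp/",
)
trainer.fit(mnist_model, train_loader)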
2 changes: 2 additions & 0 deletions pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt
@@ -0,0 +1,2 @@
linear
relu
1 change: 1 addition & 0 deletions pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt
@@ -0,0 +1 @@
cross_entropy
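
The two single-column files above are the op lists consumed by Habana Mixed Precision (HMP): linear and relu are cast to bf16, while cross_entropy stays in fp32. Later commits move hmp_params out of the Trainer and into an HPU precision plugin; a rough sketch of that style of configuration follows, with the class name and keyword names assumed from the "Move hmp_params to HPUPrecision plugin" commit rather than taken from this diff:

# Assumed API, for illustration only; the exact plugin signature lands in
# later commits of this PR, not in the 17-commit state shown here.
import pytorch_lightning as pl
from pytorch_lightning.plugins import HPUPrecisionPlugin

precision_plugin = HPUPrecisionPlugin(
    precision=16,
    opt_level="O1",
    bf16_file_path="./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt",
    fp32_file_path="./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt",
)
trainer = pl.Trainer(accelerator="hpu", devices=1, plugins=[precision_plugin])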
1 change: 1 addition & 0 deletions pytorch_lightning/accelerators/__init__.py
@@ -13,5 +13,6 @@
from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401
from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa: F401
1 change: 1 addition & 0 deletions pytorch_lightning/accelerators/accelerator.py
@@ -28,6 +28,7 @@ class Accelerator(ABC):
    - GPU
    - TPU
    - IPU
    - HPU
    """

    def setup_environment(self, root_device: torch.device) -> None:
33 changes: 33 additions & 0 deletions pytorch_lightning/accelerators/hpu.py
@@ -0,0 +1,33 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, Union

import torch

from pytorch_lightning.accelerators.accelerator import Accelerator


class HPUAccelerator(Accelerator):
"""Accelerator for HPU devices."""

    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
        """HPU device stats aren't supported yet."""
        return {}

    @staticmethod
    def auto_device_count() -> int:
        """Get the devices when set to auto."""
        # TBD: make this configurable
        return 8
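
Since auto_device_count() reports a fixed count of 8, HPU selection can follow the same pattern as the other accelerators. A minimal sketch, assuming the "hpu" accelerator string registered by this PR:

import pytorch_lightning as pl

# devices="auto" resolves through HPUAccelerator.auto_device_count(), i.e. 8
# cards (a full Gaudi server) until the TBD above makes the count configurable.
trainer = pl.Trainer(accelerator="hpu", devices="auto")

# A single card can also be requested explicitly.
trainer_single = pl.Trainer(accelerator="hpu", devices=1)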
2 changes: 2 additions & 0 deletions pytorch_lightning/callbacks/__init__.py
@@ -17,6 +17,7 @@
from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning
from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor
from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler
from pytorch_lightning.callbacks.hpu_stats_monitor import HPUStatsMonitor
from pytorch_lightning.callbacks.lambda_function import LambdaCallback
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
@@ -37,6 +38,7 @@
    "DeviceStatsMonitor",
    "EarlyStopping",
    "GPUStatsMonitor",
    "HPUStatsMonitor",
    "XLAStatsMonitor",
    "GradientAccumulationScheduler",
    "LambdaCallback",
80 changes: 80 additions & 0 deletions pytorch_lightning/callbacks/hpu_stats_monitor.py
@@ -0,0 +1,80 @@
# Copyright (C) 2021 Habana Labs, Ltd. an Intel Company
# All Rights Reserved.
#
# Unauthorized copying of this file or any element(s) within it, via any medium
# is strictly prohibited.
# This file contains Habana Labs, Ltd. proprietary and confidential information
# and is subject to the confidentiality and license agreements under which it
# was provided.
#

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
HPU Stats Monitor
=================

Monitors and logs HPU stats during training.

"""
from typing import Any, Dict, List, Optional, Tuple

import torch

import pytorch_lightning as pl
from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.utilities import rank_zero_only


class HPUStatsMonitor(Callback):
"""Automatically monitors and logs hpu stats during training stage.

Args:
save_dir: directory to save the logs.
exp_name: name of the experiment.

Example::

>>> from pytorch_lightning import Trainer
>>> from pytorch_lightning.callbacks import HPUStatsMonitor
>>> hpu_stats = HPUStatsMonitor()
>>> trainer = Trainer(hpus=1, callbacks=[hpu_stats])

you can also optionally provide save_dir and exp_name in HPUStatsMonitor.
No need to provide logger in Trainer.
"""

def __init__(self, log_save_dir: str = "habana_ptl_logs", exp_name: str = "default"):
super().__init__()
self.log_save_dir = log_save_dir
self.exp_name = exp_name

def on_init_end(self, trainer: "pl.Trainer") -> None:
from pytorch_lightning import loggers as pl_logger

self.tb_logger = pl_logger.TensorBoardLogger(save_dir=self.log_save_dir, name=self.exp_name)
trainer.logger = self.tb_logger

def on_before_backward(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", loss: torch.Tensor) -> None:
pl_module.log("Model_Loss", loss, on_step=True, on_epoch=True, enable_graph=False, logger=True)

def on_train_epoch_end(
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", unused: Optional = None
) -> None:
tensor_board = trainer.logger.experiment
dict = vars(pl_module)
modules = dict["_modules"]
for module_name in modules:
tensor_board.add_histogram(module_name + ".weight", modules[module_name].weight, pl_module.current_epoch)
tensor_board.add_histogram(module_name + ".bias", modules[module_name].bias, pl_module.current_epoch)
8 changes: 8 additions & 0 deletions pytorch_lightning/core/lightning.py
@@ -223,6 +223,14 @@ def on_gpu(self):
        """
        return self.device.type == "cuda"

    @property
    def on_hpu(self):
        """True if your model is currently running on HPUs.

        Useful to set flags around the LightningModule for different CPU vs GPU vs HPU behavior.
        """
        return self.device.type == "hpu"

    @property
    def automatic_optimization(self) -> bool:
        """If set to ``False`` you are responsible for calling ``.backward()``, ``.step()``, ``.zero_grad()``."""
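
The new on_hpu property mirrors on_gpu, so a LightningModule can branch on the device type without inspecting self.device directly. A small illustrative sketch (the module and the hook body are hypothetical, not part of this PR):

import torch
from torch.nn import functional as F

import pytorch_lightning as pl


class DeviceAwareModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return self.l1(x.view(x.size(0), -1))

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        if self.on_hpu:  # True when self.device.type == "hpu"
            # Device-specific behaviour (logging, op selection, ...) goes here.
            self.log("running_on_hpu", 1.0)
        return loss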
2 changes: 2 additions & 0 deletions pytorch_lightning/lite/lite.py
@@ -86,6 +86,7 @@ def __init__(
            devices=devices,
            tpu_cores=tpu_cores,
            ipus=None,
            hpus=None,
            accelerator=accelerator,
            strategy=strategy,
            gpus=gpus,
@@ -98,6 +99,7 @@
            precision=precision,
            amp_type="native",
            amp_level=None,
            hmp_params=None,
            plugins=plugins,
        )
        self._strategy = self._accelerator_connector.strategy