Commit 812c2dc

jerome-habana, carmocca, awaelchli, akihironitta, and tchaton authored
Add support for Habana accelerator (HPU) (#11808)
Co-authored-by: Carlos Mocholí <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: Carlos Mocholi <[email protected]>
Co-authored-by: Aki Nitta <[email protected]>
Co-authored-by: thomas chaton <[email protected]>
Co-authored-by: Justus Schock <[email protected]>
Co-authored-by: four4fish <[email protected]>
Co-authored-by: Rohit Gupta <[email protected]>
Co-authored-by: ananthsub <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kaushik B <[email protected]>
Co-authored-by: Kaushik B <[email protected]>
Co-authored-by: jjenniferdai <[email protected]>
Co-authored-by: Kushashwa Ravi Shrimali <[email protected]>
Co-authored-by: Akarsha Rao <[email protected]>
Co-authored-by: Jirka <[email protected]>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent 089fcb9 commit 812c2dc

40 files changed: +1433 -16 lines

.azure-pipelines/hpu-tests.yml

Lines changed: 20 additions & 0 deletions

@@ -31,3 +31,23 @@ jobs:
       apt-get install -y hwinfo
       hwinfo --short
     displayName: 'Instance HW info'
+
+  - bash: |
+      pip install . --requirement requirements/test.txt
+    displayName: 'Install dependencies'
+
+  - bash: |
+      python ".azure-pipelines/run_hpu_tests.py"
+    displayName: 'HPU Tests in parallel'
+
+  - bash: |
+      export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+      python "pl_examples/hpu_examples/simple_mnist/mnist.py"
+    displayName: 'Testing: HPU examples'
+
+  - task: PublishTestResults@2
+    inputs:
+      testResultsFiles: 'hpu*_test-results.xml'
+      testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
+    condition: succeededOrFailed()
+    displayName: 'Publish test results'
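
The two test steps above can be reproduced outside Azure Pipelines. The snippet below is a minimal sketch that mirrors them with plain subprocess calls; it assumes the repository root is the current working directory and a Gaudi host with the SynapseAI PyTorch bridge is available, and it is not part of the pipeline itself.

    # Hedged local reproduction of the "HPU Tests in parallel" and "Testing: HPU examples" steps.
    import os
    import subprocess
    import sys

    repo_root = os.getcwd()  # assumption: this is run from the repository root
    env = dict(os.environ, PYTHONPATH=f"{os.environ.get('PYTHONPATH', '')}:{repo_root}")

    # Run the parallel HPU test driver added in this commit.
    subprocess.run([sys.executable, ".azure-pipelines/run_hpu_tests.py"], check=True)

    # Run the MNIST example on a Gaudi device, mirroring the PYTHONPATH export above.
    subprocess.run(
        [sys.executable, "pl_examples/hpu_examples/simple_mnist/mnist.py"],
        check=True,
        env=env,
    )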

.azure-pipelines/run_hpu_tests.py

Lines changed: 148 additions & 0 deletions

@@ -0,0 +1,148 @@
"""This file is called from the hpu-tests.yml pipeline.

The following script runs the HPU tests in parallel.
Tests run are:
1. test_inference_only is run on four cards
2. test_all_stages on two cards
3. complete hpu tests using one card
4. complete hpu tests using eight cards.
"""
import itertools
import subprocess
import sys

HPU_TESTS_DICTIONARY = {
    "hpu1_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        --forked \
        --junitxml=hpu1_test-results.xml",
    "hpu2_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        -k test_all_stages \
        --hpus 2 \
        --verbose \
        --capture=no \
        --forked \
        --junitxml=hpu2_test-results.xml",
    "hpu4_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        -k test_inference_only \
        --hpus 4 \
        --capture=no \
        --verbose \
        --forked \
        --junitxml=hpu4_test-results.xml",
    "hpu8_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        --forked \
        --hpus 8 \
        --junitxml=hpu8_test-results.xml",
    "hpu1_precision_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/plugins/precision/hpu/test_hpu.py \
        --hmp-bf16 'tests/plugins/precision/hpu/ops_bf16.txt' \
        --hmp-fp32 'tests/plugins/precision/hpu/ops_fp32.txt' \
        --forked \
        --junitxml=hpu1_precision_test-results.xml",
}

HPU1_TEST = HPU_TESTS_DICTIONARY["hpu1_test"]
HPU2_TEST = HPU_TESTS_DICTIONARY["hpu2_test"]
HPU4_TEST = HPU_TESTS_DICTIONARY["hpu4_test"]
HPU8_TEST = HPU_TESTS_DICTIONARY["hpu8_test"]
HPU1_PRECISION_TEST = HPU_TESTS_DICTIONARY["hpu1_precision_test"]

PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST], [HPU1_PRECISION_TEST]]
TIMEOUT = 60  # seconds
TIMEOUT_EXIT_CODE = -9


def run_hpu_tests_parallel(timeout=TIMEOUT):
    """This function is called to run the HPU tests in parallel.

    We run the tests in subprocesses to utilize all eight cards available in the DL1 instance.
    Considering the maximum time taken to run the HPU tests as 60 seconds, we kill a process if that time is exceeded.

    Args:
        timeout: The threshold time to run the HPU tests in parallel.
            If the timeout expires, the exception is logged and TIMEOUT_EXIT_CODE (-9) is recorded
            instead of the process exit code.

    Return:
        The list of exit statuses of the HPU tests that were run in the subprocesses.
        Here, exit_status 0 means the test run succeeded and exit_status 1 means the test run failed.
    """
    exit_status = []
    with open("stdout_log.txt", "w") as stdout_log, open("error_log.txt", "w") as error_log:
        for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION:
            process_list = [
                subprocess.Popen(
                    each_hpu_test, shell=True, stdout=stdout_log, stderr=error_log, universal_newlines=True
                )
                for each_hpu_test in hpu_tests
            ]
            for process in process_list:
                try:
                    exit_status.append(process.wait(timeout=timeout))
                except subprocess.TimeoutExpired as e:
                    print(e)
                    print("Killing the process....")
                    process.kill()
                    exit_status.append(TIMEOUT_EXIT_CODE)
    return exit_status


def zip_cmd_exitcode(exit_status):
    """This function is called to zip the tests that were executed with the exit status of each test.

    Args:
        exit_status: The returned exit_status after executing run_hpu_tests_parallel().

    Return:
        A list of the HPU test commands that were run and their exit statuses.
    """
    status_list = list(zip(list(itertools.chain(*PARALLEL_HPU_TESTS_EXECUTION)), exit_status))
    return status_list


def print_logs(filename):
    """This function is called to read the file and print the logs.

    Args:
        filename: The log filename that needs to be printed on the console.
    """
    with open(filename) as f:
        print(f.read())


def print_subprocess_logs_and_return_status(exit_status):
    """This function is called to print the logs of subprocess stdout and stderr and return the status of the test
    execution.

    Args:
        exit_status: The returned exit_status after executing run_hpu_tests_parallel().

    Return:
        Based on the exit statuses of the HPU tests, we return success (0) or failure (1) to the main method.
    """
    if all(v == 0 for v in exit_status):
        print("All HPU tests passed")
        file_name = "stdout_log.txt"
        print_logs(file_name)
        return 0
    else:
        print("HPU tests are failing")
        print("Printing stdout_log.txt...")
        file_name = "stdout_log.txt"
        print_logs(file_name)
        print("Printing error_log.txt...")
        file_name = "error_log.txt"
        print_logs(file_name)
        return 1


def main():
    exit_status = run_hpu_tests_parallel(timeout=TIMEOUT)
    status_list = zip_cmd_exitcode(exit_status)
    print("HPU Tests executed and their exit status:", status_list)
    return print_subprocess_logs_and_return_status(exit_status)


if __name__ == "__main__":
    sys.exit(main())
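
To make the driver's reporting concrete, the sketch below shows the shape of the data it prints: zip_cmd_exitcode() pairs each command from PARALLEL_HPU_TESTS_EXECUTION with its exit status, where 0 means success, a non-zero pytest code means failure, and -9 marks a process killed after the timeout. The commands and codes here are illustrative values only, not real output.

    # Illustrative only: the kind of status list the driver prints for one hypothetical run.
    example_status_list = [
        ("python -m coverage run ... test_hpu.py -k test_inference_only --hpus 4 ...", 0),   # passed
        ("python -m coverage run ... test_hpu.py ... hpu1_test ...", 0),                     # passed
        ("python -m coverage run ... test_hpu.py -k test_all_stages --hpus 2 ...", 1),       # a test failed
        ("python -m coverage run ... test_hpu.py --hpus 8 ...", -9),                         # killed after the timeout
    ]
    all_passed = all(code == 0 for _, code in example_status_list)
    print("CI would report success" if all_passed else "CI would report failure")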

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -167,6 +167,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `AcceleratorRegistry` ([#12180](https://github.com/PyTorchLightning/pytorch-lightning/pull/12180))
 
 
+- Added support for Habana Accelerator (HPU) ([#11808](https://github.com/PyTorchLightning/pytorch-lightning/pull/11808))
+
+
 ### Changed
 
 - Drop PyTorch 1.7 support ([#12191](https://github.com/PyTorchLightning/pytorch-lightning/pull/12191))

docs/source/accelerators/hpu.rst

Lines changed: 124 additions & 0 deletions

@@ -0,0 +1,124 @@
.. _hpu:

Habana Gaudi AI Processor (HPU)
===============================

Lightning supports the `Habana Gaudi AI Processor (HPU) <https://habana.ai/>`__ for accelerating deep learning training workloads.

HPU Terminology
---------------

Habana® Gaudi® AI training processors are built on a heterogeneous architecture with a cluster of fully programmable Tensor Processing Cores (TPC), a configurable Matrix Math engine, and the associated development tools and libraries.

The TPC core is a VLIW SIMD processor with an instruction set and hardware tailored to serve training workloads efficiently.
The Gaudi memory architecture includes on-die SRAM and local memories in each TPC, and
Gaudi is the first DL training processor with integrated RDMA over Converged Ethernet (RoCE v2) engines on-chip.

On the software side, the PyTorch Habana bridge interfaces between the framework and the SynapseAI software stack to enable the execution of deep learning models on the Habana Gaudi device.

Gaudi offers a substantial price/performance advantage -- so you get to do more deep learning training while spending less.

For more information, check out `Gaudi Architecture <https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Overview.html#gaudi-architecture>`__ and `Gaudi Developer Docs <https://developer.habana.ai>`__.

How to access HPUs
------------------

To use HPUs, you must have access to a system with HPU devices.
You can use either `Gaudi-based AWS EC2 DL1 instances <https://aws.amazon.com/ec2/instance-types/dl1/>`__ or a `Supermicro X12 Gaudi server <https://www.supermicro.com/en/solutions/habana-gaudi>`__ to get access to HPUs.

Check out the `Getting Started Guide with AWS and Habana <https://docs.habana.ai/en/latest/AWS_EC2_Getting_Started/AWS_EC2_Getting_Started.html>`__.

Training with HPUs
------------------

To enable PyTorch Lightning to utilize the HPU accelerator, simply pass ``accelerator="hpu"`` to the Trainer class.

.. code-block:: python

    trainer = Trainer(accelerator="hpu")

Passing ``devices=1`` and ``accelerator="hpu"`` to the Trainer class enables the Habana accelerator for single Gaudi training.

.. code-block:: python

    trainer = Trainer(devices=1, accelerator="hpu")

Passing ``devices=8`` and ``accelerator="hpu"`` to the Trainer class enables the Habana accelerator for distributed training with 8 Gaudis.
It uses :class:`~pytorch_lightning.strategies.hpu_parallel.HPUParallelStrategy` internally, which is based on the DDP strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes.

.. code-block:: python

    trainer = Trainer(devices=8, accelerator="hpu")

.. note::
    If the ``devices`` flag is not defined, it defaults to ``"auto"``, which selects 8 Gaudi devices for :class:`~pytorch_lightning.accelerators.hpu.HPUAccelerator`.


Mixed Precision Plugin
----------------------

Lightning also allows mixed precision training with HPUs.
By default, HPU training uses 32-bit precision. To enable mixed precision, set the ``precision`` flag.

.. code-block:: python

    trainer = Trainer(devices=1, accelerator="hpu", precision=16)


Enabling Mixed Precision Options
--------------------------------

Internally, :class:`~pytorch_lightning.plugins.precision.hpu.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training.

You can execute the ops in FP32 or BF16 precision. The HMP package modifies the Python operators to add the appropriate cast operations for the arguments before execution.
The default settings let users enable mixed precision training with minimal code changes.

In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own
BF16 and FP32 operator lists by passing them as parameters to :class:`~pytorch_lightning.plugins.precision.hpu.HPUPrecisionPlugin`.

The snippet below shows an example model using MNIST with a single Habana Gaudi device, making use of HMP by overriding the default parameters.
This enables advanced users to provide their own BF16 and FP32 operator lists instead of using the HMP defaults.

.. code-block:: python

    import pytorch_lightning as pl
    from pytorch_lightning.plugins import HPUPrecisionPlugin

    # Initialize a trainer with the HPU accelerator for a single device,
    # with mixed precision using overridden HMP settings
    trainer = pl.Trainer(
        accelerator="hpu",
        devices=1,
        # Optional Habana mixed precision params to be set
        # Check out `pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt` for the format
        plugins=[
            HPUPrecisionPlugin(
                precision=16,
                opt_level="O1",
                verbose=False,
                bf16_file_path="ops_bf16_mnist.txt",
                fp32_file_path="ops_fp32_mnist.txt",
            )
        ],
    )

    # Init our model
    model = LitClassifier()
    # Init the data
    dm = MNISTDataModule(batch_size=batch_size)

    # Train the model ⚡
    trainer.fit(model, datamodule=dm)

For more details, please refer to `PyTorch Mixed Precision Training on Gaudi <https://docs.habana.ai/en/latest/PyTorch_User_Guide/PyTorch_User_Guide.html#pytorch-mixed-precision-training-on-gaudi>`__.

----------------

.. _known-limitations_hpu:

Known limitations
-----------------

* Multiple optimizers are not supported.
* `Habana dataloader <https://docs.habana.ai/en/latest/PyTorch_User_Guide/PyTorch_User_Guide.html#habana-data-loader>`__ is not supported.
* :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is not supported.
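
One pattern the new document does not spell out is guarding the accelerator choice so the same script also runs on machines without Gaudi hardware. A minimal sketch, assuming the HPUAccelerator added in this PR exposes the standard accelerator is_available() hook:

    import pytorch_lightning as pl
    from pytorch_lightning.accelerators import HPUAccelerator

    # Fall back to CPU when no Gaudi device is present (assumes is_available() is exposed).
    accelerator = "hpu" if HPUAccelerator.is_available() else "cpu"
    trainer = pl.Trainer(accelerator=accelerator, devices=1)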

docs/source/api_references.rst

Lines changed: 5 additions & 0 deletions

@@ -16,6 +16,7 @@ Accelerator API
     Accelerator
     CPUAccelerator
     GPUAccelerator
+    HPUAccelerator
     IPUAccelerator
     TPUAccelerator
 
@@ -59,9 +60,11 @@ Strategy API
     DataParallelStrategy
     DeepSpeedStrategy
     HorovodStrategy
+    HPUParallelStrategy
     IPUStrategy
     ParallelStrategy
     SingleDeviceStrategy
+    SingleHPUStrategy
     SingleTPUStrategy
     Strategy
     TPUSpawnStrategy
@@ -198,6 +201,7 @@ Precision Plugins
     DeepSpeedPrecisionPlugin
     DoublePrecisionPlugin
     FullyShardedNativeMixedPrecisionPlugin
+    HPUPrecisionPlugin
     IPUPrecisionPlugin
     MixedPrecisionPlugin
     NativeMixedPrecisionPlugin
@@ -234,6 +238,7 @@ Checkpoint IO Plugins
     :template: classtemplate.rst
 
     CheckpointIO
+    HPUCheckpointIO
     TorchCheckpointIO
     XLACheckpointIO

docs/source/extensions/accelerator.rst

Lines changed: 3 additions & 1 deletion

@@ -15,6 +15,7 @@ Currently there are accelerators for:
 - GPU
 - TPU
 - IPU
+- HPU
 
 Each Accelerator gets two plugins upon initialization:
 One to handle differences from the training routine and one to handle different precisions.
@@ -58,5 +59,6 @@ Accelerator API
     Accelerator
     CPUAccelerator
     GPUAccelerator
-    TPUAccelerator
+    HPUAccelerator
     IPUAccelerator
+    TPUAccelerator

docs/source/extensions/plugins.rst

Lines changed: 7 additions & 6 deletions

@@ -61,17 +61,18 @@ Precision Plugins
     :nosignatures:
     :template: classtemplate.rst
 
-    PrecisionPlugin
-    MixedPrecisionPlugin
-    NativeMixedPrecisionPlugin
-    ShardedNativeMixedPrecisionPlugin
     ApexMixedPrecisionPlugin
     DeepSpeedPrecisionPlugin
-    TPUPrecisionPlugin
-    TPUBf16PrecisionPlugin
     DoublePrecisionPlugin
     FullyShardedNativeMixedPrecisionPlugin
+    HPUPrecisionPlugin
     IPUPrecisionPlugin
+    MixedPrecisionPlugin
+    NativeMixedPrecisionPlugin
+    PrecisionPlugin
+    ShardedNativeMixedPrecisionPlugin
+    TPUBf16PrecisionPlugin
+    TPUPrecisionPlugin
 
 
 Cluster Environments

docs/source/extensions/strategy.rst

Lines changed: 2 additions & 0 deletions

@@ -108,9 +108,11 @@ Built-In Training Strategies
     DataParallelStrategy
     DeepSpeedStrategy
     HorovodStrategy
+    HPUParallelStrategy
     IPUStrategy
     ParallelStrategy
     SingleDeviceStrategy
+    SingleHPUStrategy
     SingleTPUStrategy
     Strategy
     TPUSpawnStrategy
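
Taken together, the classes registered across these docs (HPUAccelerator, SingleHPUStrategy, HPUParallelStrategy, HPUPrecisionPlugin, HPUCheckpointIO) can also be passed to the Trainer explicitly instead of relying on the accelerator="hpu" shorthand, which picks the strategy automatically. The snippet below is a hedged sketch of that wiring; the constructor arguments shown are assumptions, not taken from this diff.

    import pytorch_lightning as pl
    from pytorch_lightning.accelerators import HPUAccelerator
    from pytorch_lightning.plugins import HPUCheckpointIO, HPUPrecisionPlugin

    # Explicit single-device HPU setup; roughly equivalent to
    # Trainer(accelerator="hpu", devices=1, precision=16). The single- vs
    # multi-device strategy is still selected by the Trainer from `devices`.
    trainer = pl.Trainer(
        accelerator=HPUAccelerator(),
        devices=1,
        plugins=[HPUPrecisionPlugin(precision=16), HPUCheckpointIO()],
    )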
