
Add support for Habana accelerator (HPU) #11808


Merged
merged 183 commits on Mar 25, 2022
Changes from 109 commits
Commits
183 commits
f7175c4
Add hpu accelerator support
jerome-habana Feb 8, 2022
7fb871b
Update strategy for optimizer usage
jerome-habana Feb 8, 2022
a1a1ca9
Add checkpointing support
jerome-habana Feb 8, 2022
9a6da43
Fix distributed support with hpu
jerome-habana Feb 8, 2022
3e76db9
Enable usage of static_graph with hpu
jerome-habana Feb 8, 2022
b43d226
Add HPU tests
jerome-habana Feb 8, 2022
992093d
Add basic hpu_stats monitor
jerome-habana Feb 8, 2022
943be49
Code cleanup
jerome-habana Feb 8, 2022
3015972
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 8, 2022
257d644
Update tests
jerome-habana Feb 9, 2022
f1867cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2022
c61d68b
Add configurable params for tests
jerome-habana Feb 10, 2022
f74a898
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2022
963cd1e
Enable inference test
jerome-habana Feb 11, 2022
53a5416
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 11, 2022
2de04e8
Resolve issue with hmp params type and load hpu
jerome-habana Feb 15, 2022
0197b9c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 15, 2022
b412638
Move hmp_params to HPUPrecision plugin
jerome-habana Feb 17, 2022
e549434
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 17, 2022
1cc0a37
Update habana distributed with ddp subclass
jerome-habana Feb 18, 2022
aeda681
Add hpu backend, datatype checks
jerome-habana Feb 18, 2022
fe32865
Merge branch 'master' into hpu_accelerator
jerome-habana Feb 23, 2022
f9b0c5f
Merge branch 'master' into hpu_accelerator
jerome-habana Feb 23, 2022
123112d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 23, 2022
ede68eb
Remove unused param for 'on_train_batch_end' in hpu test
jerome-habana Feb 23, 2022
262343a
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 3, 2022
3a029c1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2022
0a959f0
Addres review comments
jerome-habana Mar 3, 2022
1434299
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2022
400ea77
Address review comments
jerome-habana Mar 4, 2022
4146bab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2022
f5cb696
remove deprecated logging
jerome-habana Mar 4, 2022
d3cd6b1
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 7, 2022
448ed77
Fix imports for failing CI
kaushikb11 Mar 9, 2022
10b190f
fix str to_device section in converting.rst (#12243)
awaelchli Mar 7, 2022
c17c62b
Disable tuner with distributed strategies (#12179)
rohitgr7 Mar 7, 2022
28bc4f0
Add callout items to the Docs landing page (#12196)
kaushikb11 Mar 7, 2022
97e1d28
Integrate global step with progress tracking (#11805)
carmocca Mar 7, 2022
5aecf65
Deprecate `LightningDataModule.on_save/load_checkpoint` (#11893)
jjenniferdai Mar 8, 2022
0949599
add Azure HPU agent (#12258)
Borda Mar 8, 2022
4bd5034
Add `LightningCLI(auto_registry)` (#12108)
carmocca Mar 8, 2022
bd76456
Drop PyTorch 1.7 testing from the CI (#12191)
krshrimali Mar 8, 2022
80b8d01
Have the outputs match the loops format (#12182)
carmocca Mar 8, 2022
c168db5
Address review comments
jerome-habana Mar 9, 2022
831a672
Review comment :Make use of Boring model
jerome-habana Mar 9, 2022
328329e
Update stats example trainer params
jerome-habana Mar 9, 2022
c8e331e
Correct flake8 errors
jerome-habana Mar 9, 2022
9a71bdc
Remove docstring examples
jerome-habana Mar 9, 2022
8efed0b
Update hpu-tests.yml
raoakarsha Mar 3, 2022
90409a2
prune
Borda Mar 7, 2022
5bbc6dc
Update hpu-tests.yml
Borda Mar 8, 2022
85f535b
Apply suggestions from code review
Borda Mar 9, 2022
75227d9
hwinfo
Borda Mar 9, 2022
711bbf3
Override mypy warnings
jerome-habana Mar 10, 2022
bc174f6
Update test and requirements file
jerome-habana Mar 10, 2022
b28c0ce
Remove hpu stats monitor and deprecated API's
jerome-habana Mar 10, 2022
3c08bf5
Update non-hpu tests
jerome-habana Mar 10, 2022
f857721
Add hpu-tests.yml and run_hpu_tests.py to support HPU Testing
Borda Mar 10, 2022
a2b2cb1
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 10, 2022
7cb34bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
f6baf69
Add exception for non-hpu tests
jerome-habana Mar 10, 2022
21fc9a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
3665ffc
Throw exception when accelerator is not present
jerome-habana Mar 10, 2022
e0b4611
Resolve mypy and error message
jerome-habana Mar 10, 2022
545ab6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
96ed1cd
Disable hpu pl examples on CPU
jerome-habana Mar 10, 2022
c44b017
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
410875c
Address review comments
jerome-habana Mar 14, 2022
8efe56f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 14, 2022
073b170
Add documentation for habana gaudi accelerator (HPU)
jerome-habana Mar 15, 2022
7bdcaf6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 15, 2022
da1037a
Update test code syntax
jerome-habana Mar 15, 2022
5e7af01
Mitigate duplicate label error
jerome-habana Mar 15, 2022
70d6993
Add hpu to toctree
jerome-habana Mar 16, 2022
5061d71
Update pytorch_lightning/plugins/precision/hpu_precision.py
kaushikb11 Mar 16, 2022
f6c36ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2022
798f137
Update _broadvast_object_list
kaushikb11 Mar 16, 2022
5e098cb
Update broadcast for HPUParallelStrategy
kaushikb11 Mar 16, 2022
093056c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2022
0563310
Update reference links
kaushikb11 Mar 17, 2022
65886ba
Update Strategies
kaushikb11 Mar 17, 2022
d837ef3
Address reviews
kaushikb11 Mar 17, 2022
37e0000
Address reviews
kaushikb11 Mar 17, 2022
07c60b4
Address reviews
jerome-habana Mar 18, 2022
394d9e2
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 18, 2022
12dc3ca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 18, 2022
3064544
Remove too many sections from sidebar
akihironitta Mar 19, 2022
7c7721d
Fix invalid formatting and links
akihironitta Mar 19, 2022
cc71c7a
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 21, 2022
e6eaa9f
Address reviews for HPUCHeckpointIO
kaushikb11 Mar 21, 2022
33beabd
Address reviews for HPU + AcceleratorConnector
kaushikb11 Mar 21, 2022
759804e
Fix tests
kaushikb11 Mar 21, 2022
bda7e36
Address reviews
kaushikb11 Mar 21, 2022
bdc19be
Remove setting hpu accelerator by just strategy
kaushikb11 Mar 21, 2022
2d34cc5
Remove unnecessary properties for HPU
kaushikb11 Mar 21, 2022
c32601a
Fix HPU tests
kaushikb11 Mar 21, 2022
f43750e
Move tests
kaushikb11 Mar 21, 2022
4e09286
Improve docs
kaushikb11 Mar 21, 2022
ab2f595
Improve tests
kaushikb11 Mar 21, 2022
549d784
Update Changelog
kaushikb11 Mar 21, 2022
ec929df
Fix test for the rigth device type
kaushikb11 Mar 21, 2022
c55a82f
Fix tests
kaushikb11 Mar 21, 2022
05dcc1c
Fix tests
kaushikb11 Mar 21, 2022
150e667
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 21, 2022
f5a333b
Address reviews
kaushikb11 Mar 21, 2022
57b9c24
Update plugins
kaushikb11 Mar 21, 2022
3dd763c
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 22, 2022
773a7a0
Update HPU mnist example
kaushikb11 Mar 22, 2022
9378c87
Update strategy
kaushikb11 Mar 22, 2022
9aefcd2
Address reviews
jerome-habana Mar 22, 2022
1f0b187
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
1d30ef9
Add precision tests to azure pipeline
jerome-habana Mar 22, 2022
fd9488f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
a4f79fb
Add comments
kaushikb11 Mar 22, 2022
a6a336d
Fix argparse
kaushikb11 Mar 22, 2022
dca30ee
Remove unnecessary use of PL_TORCH_DISTRIBUTED_BACKEND env variable
kaushikb11 Mar 22, 2022
bb8984f
Update pytorch_lightning/strategies/hpu_parallel.py
kaushikb11 Mar 22, 2022
4ab35db
Update pytorch_lightning/utilities/distributed.py
kaushikb11 Mar 22, 2022
e65a3fb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
a517942
Address review
jerome-habana Mar 23, 2022
d89815d
Address reviews
kaushikb11 Mar 23, 2022
0238b45
Update document
jerome-habana Mar 23, 2022
4f44ea9
Improve Habana doc
kaushikb11 Mar 23, 2022
f332e1c
Improve Habana doc
kaushikb11 Mar 23, 2022
81202c6
Improve Habana doc
kaushikb11 Mar 23, 2022
503df4e
Update pytorch_lightning/trainer/connectors/accelerator_connector.py
kaushikb11 Mar 23, 2022
e6af417
Update links
kaushikb11 Mar 23, 2022
2bd4a66
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 23, 2022
67e710e
Update precision sections
kaushikb11 Mar 23, 2022
1df801b
Update doc
kaushikb11 Mar 23, 2022
9152114
Add defaults to hmp_params for Precision Plugin
kaushikb11 Mar 23, 2022
9846b6a
Update .azure-pipelines/run_hpu_tests.py
kaushikb11 Mar 24, 2022
e86becf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
d165c44
Apply suggestions from code review
kaushikb11 Mar 24, 2022
c76b95f
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
bafcb8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
2d6c6dd
Apply suggestions from code review
kaushikb11 Mar 24, 2022
75728b6
Apply suggestions from code review
kaushikb11 Mar 24, 2022
68c5281
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
600e1bd
Address reviews
kaushikb11 Mar 24, 2022
b03d079
Apply suggestions from code review
kaushikb11 Mar 24, 2022
6e4474e
Update API references
kaushikb11 Mar 24, 2022
efd9f65
Address reviews regarding precision
kaushikb11 Mar 24, 2022
22827f0
Address reviews regarding docs and precision
kaushikb11 Mar 24, 2022
e82544c
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
4500a7e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
98ba21f
Apply suggestions from code review
kaushikb11 Mar 24, 2022
3c10359
Address reviews & update tests
kaushikb11 Mar 24, 2022
6c0dd88
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 24, 2022
e137f19
Update testing pipeline & conftest
kaushikb11 Mar 24, 2022
a62cfa1
Fix ci
kaushikb11 Mar 24, 2022
1078a69
Add device parsing logic for HPUs
kaushikb11 Mar 24, 2022
a9dfcf3
Fix device parsing
kaushikb11 Mar 24, 2022
4665101
Use the CLI in the example
Mar 24, 2022
2ee4bbf
Docs
Mar 24, 2022
e9ae312
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 24, 2022
dc3eca7
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
6952125
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
91cced3
Update hmp_params
kaushikb11 Mar 24, 2022
0671d2c
Support passing amp_level to HPUPrecision
kaushikb11 Mar 24, 2022
522106e
Update HPUAccelerator
kaushikb11 Mar 24, 2022
c8b89ea
Update tests
kaushikb11 Mar 25, 2022
7d028b1
Fix precision tests
kaushikb11 Mar 25, 2022
3c86aff
Update device parsing logic
kaushikb11 Mar 25, 2022
3c8e321
Fix tests & address reviews
kaushikb11 Mar 25, 2022
dcda0ac
Update run_hpu_tests
kaushikb11 Mar 25, 2022
e254cd0
Update CLI test
jerome-habana Mar 25, 2022
c452bd2
Fix typing
kaushikb11 Mar 25, 2022
4c51b33
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
b66c867
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 25, 2022
dca6b0f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2022
98e901d
Enable example test in pipeline
jerome-habana Mar 25, 2022
2860a4e
export path of modules
jerome-habana Mar 25, 2022
a297593
Fix test
kaushikb11 Mar 25, 2022
9c1fff7
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
65f1fb9
Update torch distributed
kaushikb11 Mar 25, 2022
2380887
Update strategy
kaushikb11 Mar 25, 2022
59ef6fd
Update example
kaushikb11 Mar 25, 2022
c02c1ed
Apply suggestions from code review
kaushikb11 Mar 25, 2022
beda30c
Address reviews
kaushikb11 Mar 25, 2022
eb99e52
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
c465a06
Update backend env variable for strategy
kaushikb11 Mar 25, 2022
60f2da4
Update backend env variable for strategy
kaushikb11 Mar 25, 2022
15 changes: 15 additions & 0 deletions .azure-pipelines/hpu-tests.yml
@@ -31,3 +31,18 @@ jobs:
      apt-get install -y hwinfo
      hwinfo --short
    displayName: 'Instance HW info'

  - bash: |
      pip install . --requirement requirements/test.txt
    displayName: 'Install dependencies'

  - bash: |
      python ".azure-pipelines/run_hpu_tests.py"
    displayName: 'HPU Tests in parallel'

  - task: PublishTestResults@2
    inputs:
      testResultsFiles: 'hpu*_test-results.xml'
      testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
    condition: succeededOrFailed()
    displayName: 'Publish test results'
142 changes: 142 additions & 0 deletions .azure-pipelines/run_hpu_tests.py
@@ -0,0 +1,142 @@
"""This file is called from the hpu-tests.yml pipeline.

The following script run the hpu tests in parallel.
Tests run are:
1. test_inference_only is run on four cards
2. test_all_stages on two cards
3. complete hpu tests using one card
4. complete hpu tests using eight cards.
"""
import itertools
import subprocess
import sys

HPU_TESTS_DICTIONARY = {
    "hpu1_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \
        --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \
        --forked \
        --junitxml=hpu1_test-results.xml",
    "hpu2_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        -k test_all_stages \
        --hpus 2 \
        --verbose \
        --capture=no \
        --forked \
        --junitxml=hpu2_test-results.xml",
    "hpu4_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        -k test_inference_only \
        --hpus 4 \
        --capture=no \
        --verbose \
        --forked \
        --junitxml=hpu4_test-results.xml",
    "hpu8_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \
        --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \
        --forked \
        --hpus 8 \
        --junitxml=hpu8_test-results.xml",
}

HPU1_TEST = HPU_TESTS_DICTIONARY["hpu1_test"]
HPU2_TEST = HPU_TESTS_DICTIONARY["hpu2_test"]
HPU4_TEST = HPU_TESTS_DICTIONARY["hpu4_test"]
HPU8_TEST = HPU_TESTS_DICTIONARY["hpu8_test"]

PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST]]
TIMEOUT = 60
TIMEOUT_EXIT_CODE = -9


def run_hpu_tests_parallel(timeout=TIMEOUT):
    """This function is called to run the HPU tests in parallel.

    We run the tests in subprocesses to utilize all eight cards available in the DL1 instance.
    Since the maximum time expected for an HPU test run is 60 seconds, we kill a process if its run time exceeds the threshold.
    The return value of this function is the list of exit statuses of the HPU tests that were run in the subprocesses.
    Here, an exit status of 0 means the test run succeeded and a non-zero exit status means it failed.

    Args:
        timeout: The threshold time to run the HPU tests in parallel.
            An exception is logged if the threshold timeout expires.
            TIMEOUT_EXIT_CODE (-9) is recorded in case of a timeout, 0 in case of success and a non-zero value in case of a failure.
    """
    exit_status = []
    with open("stdout_log.txt", "w") as stdout_log, open("error_log.txt", "w") as error_log:
        for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION:
            process_list = [
                subprocess.Popen(
                    each_hpu_test, shell=True, stdout=stdout_log, stderr=error_log, universal_newlines=True
                )
                for each_hpu_test in hpu_tests
            ]
            for process in process_list:
                try:
                    # wait up to `timeout` seconds for each test command to finish
                    exit_status.append(process.wait(timeout=timeout))
                except subprocess.TimeoutExpired as e:
                    print(e)
                    print("Killing the process....")
                    process.kill()
                    exit_status.append(TIMEOUT_EXIT_CODE)
    return exit_status


def zip_cmd_exitcode(exit_status):
    """This function is called to zip the tests that were executed with the exit status of each test.

    The return value of this function is the list of HPU tests called and their exit statuses.

    Args:
        exit_status: The exit statuses returned by run_hpu_tests_parallel().
    """
    status_list = []
    hpu_tests_called = []
    for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION:
        hpu_tests_called.append(hpu_tests)
    status_list = list(zip(list(itertools.chain(*hpu_tests_called)), exit_status))
    return status_list


def print_logs(filename):
    """This function is called to read a log file and print its contents to the console.

    Args:
        filename: The log file that needs to be printed.
    """
    with open(filename) as f:
        print(f.read())


def print_subprocess_logs_and_return_status(exit_status):
    """This function is called to print the subprocess stdout and stderr logs and return the status of the test
    execution.

    Args:
        exit_status: The exit statuses returned by run_hpu_tests_parallel().

    Based on the exit statuses of the HPU tests, success or failure is returned to the main method.
    """
    if all(v == 0 for v in exit_status):
        print("All HPU tests passed")
        file_name = "stdout_log.txt"
        print_logs(file_name)
        return 0
    else:
        print("HPU tests are failing")
        print("Printing stdout_log.txt...")
        file_name = "stdout_log.txt"
        print_logs(file_name)
        print("Printing error_log.txt...")
        file_name = "error_log.txt"
        print_logs(file_name)
        return 1


def main():
    exit_status = run_hpu_tests_parallel(timeout=TIMEOUT)
    status_list = zip_cmd_exitcode(exit_status)
    print("HPU Tests executed and their exit status:", status_list)
    return print_subprocess_logs_and_return_status(exit_status)


if __name__ == "__main__":
    sys.exit(main())
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -158,6 +158,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `Callback.state_dict()` and `Callback.load_state_dict()` methods ([#12232](https://github.com/PyTorchLightning/pytorch-lightning/pull/12232))


- Added support for Habana Accelerator (HPU) ([#11808](https://github.com/PyTorchLightning/pytorch-lightning/pull/11808))


### Changed

- Drop PyTorch 1.7 support ([#12191](https://github.com/PyTorchLightning/pytorch-lightning/pull/12191))
212 changes: 212 additions & 0 deletions docs/source/accelerators/hpu.rst
@@ -0,0 +1,212 @@
.. _hpu:

Habana Gaudi AI Processor (HPU)
===============================

Habana® Gaudi® AI training processors have been architected from the ground up and optimized for deep learning training efficiency.
Gaudi offers a substantial price/performance advantage -- so you get to do more deep learning training while spending less.

You can use either `the Gaudi-based AWS EC2 DL1 instances <https://aws.amazon.com/ec2/instance-types/dl1/>`_ or `the Supermicro X12 Gaudi server <https://www.supermicro.com/en/solutions/habana-gaudi>`_.

Habana’s SynapseAI® software suite is optimized for building and training deep learning models using TensorFlow and PyTorch frameworks. Gaudi is referred to as the Habana Processing Unit (HPU).
With SynapseAI, we aim to make training workloads on Gaudi easy, whether you're developing from scratch or migrating existing workloads.

For more information, check out `<https://developer.habana.ai>`_ and `<https://habana.ai/>`_.

----------------

PyTorch Lightning With Gaudi HPU
--------------------------------

Lightning supports training on a single HPU device or 8 HPU devices with the plugins described in the following sections.


----------------

.. _hpu_accelerator:

HPU accelerator
---------------

To enable PyTorch Lightning to use the HPU accelerator, simply pass ``accelerator="hpu"`` to the ``Trainer``.
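
For illustration, a minimal sketch of this option is shown below; ``MyLightningModule`` is a placeholder for any user-defined ``LightningModule``.

.. code-block:: python

    import pytorch_lightning as pl

    # MyLightningModule is assumed to be a user-defined LightningModule
    model = MyLightningModule()

    # select the HPU accelerator; devices=1 is just an illustrative choice
    trainer = pl.Trainer(accelerator="hpu", devices=1)
    trainer.fit(model)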


----------------

.. _single_device_strategy:

Training on Single HPU
----------------------

Passing ``devices=1`` and ``accelerator="hpu"`` together with ``strategy=SingleHPUStrategy(device=torch.device("hpu"))`` to the ``Trainer`` enables the Habana backend for single-Gaudi training.
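
For illustration, a minimal sketch of this configuration follows; it assumes ``SingleHPUStrategy`` is importable from ``pytorch_lightning.strategies`` and uses a placeholder ``LightningModule``.

.. code-block:: python

    import torch

    import pytorch_lightning as pl
    from pytorch_lightning.strategies import SingleHPUStrategy  # assumed import path

    # single-Gaudi training with the Habana backend
    trainer = pl.Trainer(
        accelerator="hpu",
        devices=1,
        strategy=SingleHPUStrategy(device=torch.device("hpu")),
    )
    trainer.fit(MyLightningModule())  # placeholder LightningModule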


----------------

.. _parallel_device_strategy:

Distributed Training
---------------------


Passing ``devices=8`` and ``accelerator="hpu"`` together with ``strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")]*devices)`` to the ``Trainer`` enables the Habana backend for distributed training with 8 Gaudis.

The Habana parallel device strategy is based on the DDP strategy, with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and scale-out across multiple nodes.
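
For illustration, a minimal sketch of the 8-card configuration follows; it assumes ``HPUParallelStrategy`` is importable from ``pytorch_lightning.strategies`` and uses a placeholder ``LightningModule``.

.. code-block:: python

    import torch

    import pytorch_lightning as pl
    from pytorch_lightning.strategies import HPUParallelStrategy  # assumed import path

    devices = 8

    # DDP-style training across 8 Gaudi cards, using HCCL for the collectives
    trainer = pl.Trainer(
        accelerator="hpu",
        devices=devices,
        strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * devices),
    )
    trainer.fit(MyLightningModule())  # placeholder LightningModule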


----------------

.. _mixed_precision_plugin:

Mixed Precision Plugin
----------------------

Passing ``precision=16`` together with an ``hmp_params`` argument to the ``HPUPrecisionPlugin`` enables mixed precision using the Habana Mixed Precision (HMP) package.

You can execute the ops in FP32 or BF16 precision. The HMP package modifies the Python operators to add the appropriate cast operations for the arguments before execution.
The default settings let users enable mixed precision training with minimal code.

In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their own BF16 and FP32 operator lists.

For more details, please refer to `PyTorch Mixed Precision Training on Gaudi <https://docs.habana.ai/en/latest/PyTorch_User_Guide/PyTorch_User_Guide.html#pytorch-mixed-precision-training-on-gaudi>`_.
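
As a minimal sketch, assuming ``HPUPrecisionPlugin`` is importable from ``pytorch_lightning.plugins`` and using placeholder operator-list file names, overriding the defaults looks like this (a complete MNIST example appears in the Enabling Mixed Precision Options section below):

.. code-block:: python

    import pytorch_lightning as pl
    from pytorch_lightning.plugins import HPUPrecisionPlugin  # assumed import path

    # optional HMP overrides; the .txt file names are placeholders
    hmp_params = {
        "level": "O1",
        "verbose": False,
        "bf16_ops": "ops_bf16.txt",
        "fp32_ops": "ops_fp32.txt",
    }

    trainer = pl.Trainer(
        accelerator="hpu",
        devices=1,
        plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)],
    )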


----------------

.. _pytorch_lightning_examples:

Getting Started with Lightning on Gaudi
---------------------------------------

This section describes how to train models using PyTorch Lightning with Habana Gaudi.

More Lightning HPU examples can be found in pl_examples (`<https://github.com/PyTorchLightning/pytorch-lightning/tree/master/pl_examples/hpu_examples>`_).

----------------

Enabling Lightning with Single Gaudi HPU
----------------------------------------

The below snippet shows an example model using MNIST with a single Habana Gaudi device:

.. code-block:: python

    import habana_frameworks.torch.core as htcore


    class LitClassifier(pl.LightningModule):
        def __init__(self):
            super(LitClassifier, self).__init__()

            ...


    # Init our model
    model = LitClassifier()

    # Init DataLoader from MNIST Dataset
    dm = MNISTDataModule(batch_size=batch_size)

    ...

    num_hpus = 1

    # enable HPU strategy for single device, with mixed precision using default HMP settings
    hpu_strategy = SingleHPUStrategy(device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16))

    # Initialize a trainer with 1 HPU accelerator
    trainer = pl.Trainer(accelerator="hpu", devices=num_hpus, strategy=hpu_strategy)

    # Train the model ⚡
    trainer.fit(model, datamodule=dm)


----------------

Enabling Lightning with 8 Gaudi HPUs (distributed)
--------------------------------------------------

The below snippet shows an example model using MNIST with 8 Habana Gaudi devices:

.. code-block:: python

    import habana_frameworks.torch.core as htcore


    class LitClassifier(pl.LightningModule):
        def __init__(self):
            super(LitClassifier, self).__init__()

            ...


    # Init our model
    model = LitClassifier()

    # Init DataLoader from MNIST Dataset
    dm = MNISTDataModule(batch_size=batch_size)

    ...

    # Initialize a trainer with HPU accelerator with 8 devices
    trainer = pl.Trainer(accelerator="hpu", devices=8, plugins=[HPUPrecisionPlugin(precision=16)])

    # Train the model ⚡
    trainer.fit(model, datamodule=dm)


----------------

Enabling Mixed Precision Options
--------------------------------

The below snippet shows an example model using MNIST with a single Habana Gaudi device, making use of HMP by overriding the default parameters.
This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults.

.. code-block:: python

    import habana_frameworks.torch.core as htcore


    class LitClassifier(pl.LightningModule):
        def __init__(self):
            super(LitClassifier, self).__init__()

            ...


    # Init our model
    model = LitClassifier()

    # Init DataLoader from MNIST Dataset
    dm = MNISTDataModule(batch_size=batch_size)

    ...

    num_hpus = 1

    # Optional Habana mixed precision params to be set
    hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"]
    hmp_params = dict.fromkeys(hmp_keys)
    hmp_params["level"] = "O1"
    hmp_params["verbose"] = False
    hmp_params["bf16_ops"] = "ops_bf16_mnist.txt"
    hmp_params["fp32_ops"] = "ops_fp32_mnist.txt"

    # Initialize a trainer with the HPU accelerator for single-device training,
    # with mixed precision using overridden HMP settings
    trainer = pl.Trainer(accelerator="hpu", devices=1, plugins=[HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)])

    # Train the model ⚡
    trainer.fit(model, datamodule=dm)


----------------

.. _known-limitations_hpu:

Known limitations
-----------------

* Habana dataloader is not supported.
* Device stats monitoring is not supported.
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -88,6 +88,7 @@ Welcome to PyTorch Lightning
   accelerators/gpu
   accelerators/tpu
   accelerators/ipu
   accelerators/hpu

.. toctree::
   :maxdepth: 1