
Add support for Habana accelerator (HPU) #11808


Merged: 183 commits from hpu_accelerator into master on Mar 25, 2022.

Commits (183)
f7175c4
Add hpu accelerator support
jerome-habana Feb 8, 2022
7fb871b
Update strategy for optimizer usage
jerome-habana Feb 8, 2022
a1a1ca9
Add checkpointing support
jerome-habana Feb 8, 2022
9a6da43
Fix distributed support with hpu
jerome-habana Feb 8, 2022
3e76db9
Enable usage of static_graph with hpu
jerome-habana Feb 8, 2022
b43d226
Add HPU tests
jerome-habana Feb 8, 2022
992093d
Add basic hpu_stats monitor
jerome-habana Feb 8, 2022
943be49
Code cleanup
jerome-habana Feb 8, 2022
3015972
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 8, 2022
257d644
Update tests
jerome-habana Feb 9, 2022
f1867cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2022
c61d68b
Add configurable params for tests
jerome-habana Feb 10, 2022
f74a898
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2022
963cd1e
Enable inference test
jerome-habana Feb 11, 2022
53a5416
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 11, 2022
2de04e8
Resolve issue with hmp params type and load hpu
jerome-habana Feb 15, 2022
0197b9c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 15, 2022
b412638
Move hmp_params to HPUPrecision plugin
jerome-habana Feb 17, 2022
e549434
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 17, 2022
1cc0a37
Update habana distributed with ddp subclass
jerome-habana Feb 18, 2022
aeda681
Add hpu backend, datatype checks
jerome-habana Feb 18, 2022
fe32865
Merge branch 'master' into hpu_accelerator
jerome-habana Feb 23, 2022
f9b0c5f
Merge branch 'master' into hpu_accelerator
jerome-habana Feb 23, 2022
123112d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 23, 2022
ede68eb
Remove unused param for 'on_train_batch_end' in hpu test
jerome-habana Feb 23, 2022
262343a
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 3, 2022
3a029c1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2022
0a959f0
Address review comments
jerome-habana Mar 3, 2022
1434299
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2022
400ea77
Address review comments
jerome-habana Mar 4, 2022
4146bab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2022
f5cb696
remove deprecated logging
jerome-habana Mar 4, 2022
d3cd6b1
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 7, 2022
448ed77
Fix imports for failing CI
kaushikb11 Mar 9, 2022
10b190f
fix str to_device section in converting.rst (#12243)
awaelchli Mar 7, 2022
c17c62b
Disable tuner with distributed strategies (#12179)
rohitgr7 Mar 7, 2022
28bc4f0
Add callout items to the Docs landing page (#12196)
kaushikb11 Mar 7, 2022
97e1d28
Integrate global step with progress tracking (#11805)
carmocca Mar 7, 2022
5aecf65
Deprecate `LightningDataModule.on_save/load_checkpoint` (#11893)
jjenniferdai Mar 8, 2022
0949599
add Azure HPU agent (#12258)
Borda Mar 8, 2022
4bd5034
Add `LightningCLI(auto_registry)` (#12108)
carmocca Mar 8, 2022
bd76456
Drop PyTorch 1.7 testing from the CI (#12191)
krshrimali Mar 8, 2022
80b8d01
Have the outputs match the loops format (#12182)
carmocca Mar 8, 2022
c168db5
Address review comments
jerome-habana Mar 9, 2022
831a672
Review comment: Make use of BoringModel
jerome-habana Mar 9, 2022
328329e
Update stats example trainer params
jerome-habana Mar 9, 2022
c8e331e
Correct flake8 errors
jerome-habana Mar 9, 2022
9a71bdc
Remove docstring examples
jerome-habana Mar 9, 2022
8efed0b
Update hpu-tests.yml
raoakarsha Mar 3, 2022
90409a2
prune
Borda Mar 7, 2022
5bbc6dc
Update hpu-tests.yml
Borda Mar 8, 2022
85f535b
Apply suggestions from code review
Borda Mar 9, 2022
75227d9
hwinfo
Borda Mar 9, 2022
711bbf3
Override mypy warnings
jerome-habana Mar 10, 2022
bc174f6
Update test and requirements file
jerome-habana Mar 10, 2022
b28c0ce
Remove hpu stats monitor and deprecated APIs
jerome-habana Mar 10, 2022
3c08bf5
Update non-hpu tests
jerome-habana Mar 10, 2022
f857721
Add hpu-tests.yml and run_hpu_tests.py to support HPU Testing
Borda Mar 10, 2022
a2b2cb1
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 10, 2022
7cb34bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
f6baf69
Add exception for non-hpu tests
jerome-habana Mar 10, 2022
21fc9a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
3665ffc
Throw exception when accelerator is not present
jerome-habana Mar 10, 2022
e0b4611
Resolve mypy and error message
jerome-habana Mar 10, 2022
545ab6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
96ed1cd
Disable hpu pl examples on CPU
jerome-habana Mar 10, 2022
c44b017
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
410875c
Address review comments
jerome-habana Mar 14, 2022
8efe56f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 14, 2022
073b170
Add documentation for habana gaudi accelerator (HPU)
jerome-habana Mar 15, 2022
7bdcaf6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 15, 2022
da1037a
Update test code syntax
jerome-habana Mar 15, 2022
5e7af01
Mitigate duplicate label error
jerome-habana Mar 15, 2022
70d6993
Add hpu to toctree
jerome-habana Mar 16, 2022
5061d71
Update pytorch_lightning/plugins/precision/hpu_precision.py
kaushikb11 Mar 16, 2022
f6c36ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2022
798f137
Update _broadcast_object_list
kaushikb11 Mar 16, 2022
5e098cb
Update broadcast for HPUParallelStrategy
kaushikb11 Mar 16, 2022
093056c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2022
0563310
Update reference links
kaushikb11 Mar 17, 2022
65886ba
Update Strategies
kaushikb11 Mar 17, 2022
d837ef3
Address reviews
kaushikb11 Mar 17, 2022
37e0000
Address reviews
kaushikb11 Mar 17, 2022
07c60b4
Address reviews
jerome-habana Mar 18, 2022
394d9e2
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 18, 2022
12dc3ca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 18, 2022
3064544
Remove too many sections from sidebar
akihironitta Mar 19, 2022
7c7721d
Fix invalid formatting and links
akihironitta Mar 19, 2022
cc71c7a
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 21, 2022
e6eaa9f
Address reviews for HPUCheckpointIO
kaushikb11 Mar 21, 2022
33beabd
Address reviews for HPU + AcceleratorConnector
kaushikb11 Mar 21, 2022
759804e
Fix tests
kaushikb11 Mar 21, 2022
bda7e36
Address reviews
kaushikb11 Mar 21, 2022
bdc19be
Remove setting hpu accelerator by just strategy
kaushikb11 Mar 21, 2022
2d34cc5
Remove unnecessary properties for HPU
kaushikb11 Mar 21, 2022
c32601a
Fix HPU tests
kaushikb11 Mar 21, 2022
f43750e
Move tests
kaushikb11 Mar 21, 2022
4e09286
Improve docs
kaushikb11 Mar 21, 2022
ab2f595
Improve tests
kaushikb11 Mar 21, 2022
549d784
Update Changelog
kaushikb11 Mar 21, 2022
ec929df
Fix test for the right device type
kaushikb11 Mar 21, 2022
c55a82f
Fix tests
kaushikb11 Mar 21, 2022
05dcc1c
Fix tests
kaushikb11 Mar 21, 2022
150e667
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 21, 2022
f5a333b
Address reviews
kaushikb11 Mar 21, 2022
57b9c24
Update plugins
kaushikb11 Mar 21, 2022
3dd763c
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 22, 2022
773a7a0
Update HPU mnist example
kaushikb11 Mar 22, 2022
9378c87
Update strategy
kaushikb11 Mar 22, 2022
9aefcd2
Address reviews
jerome-habana Mar 22, 2022
1f0b187
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
1d30ef9
Add precision tests to azure pipeline
jerome-habana Mar 22, 2022
fd9488f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
a4f79fb
Add comments
kaushikb11 Mar 22, 2022
a6a336d
Fix argparse
kaushikb11 Mar 22, 2022
dca30ee
Remove unnecessary use of PL_TORCH_DISTRIBUTED_BACKEND env variable
kaushikb11 Mar 22, 2022
bb8984f
Update pytorch_lightning/strategies/hpu_parallel.py
kaushikb11 Mar 22, 2022
4ab35db
Update pytorch_lightning/utilities/distributed.py
kaushikb11 Mar 22, 2022
e65a3fb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2022
a517942
Address review
jerome-habana Mar 23, 2022
d89815d
Address reviews
kaushikb11 Mar 23, 2022
0238b45
Update document
jerome-habana Mar 23, 2022
4f44ea9
Improve Habana doc
kaushikb11 Mar 23, 2022
f332e1c
Improve Habana doc
kaushikb11 Mar 23, 2022
81202c6
Improve Habana doc
kaushikb11 Mar 23, 2022
503df4e
Update pytorch_lightning/trainer/connectors/accelerator_connector.py
kaushikb11 Mar 23, 2022
e6af417
Update links
kaushikb11 Mar 23, 2022
2bd4a66
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 23, 2022
67e710e
Update precision sections
kaushikb11 Mar 23, 2022
1df801b
Update doc
kaushikb11 Mar 23, 2022
9152114
Add defaults to hmp_params for Precision Plugin
kaushikb11 Mar 23, 2022
9846b6a
Update .azure-pipelines/run_hpu_tests.py
kaushikb11 Mar 24, 2022
e86becf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
d165c44
Apply suggestions from code review
kaushikb11 Mar 24, 2022
c76b95f
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
bafcb8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
2d6c6dd
Apply suggestions from code review
kaushikb11 Mar 24, 2022
75728b6
Apply suggestions from code review
kaushikb11 Mar 24, 2022
68c5281
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
600e1bd
Address reviews
kaushikb11 Mar 24, 2022
b03d079
Apply suggestions from code review
kaushikb11 Mar 24, 2022
6e4474e
Update API references
kaushikb11 Mar 24, 2022
efd9f65
Address reviews regarding precision
kaushikb11 Mar 24, 2022
22827f0
Address reviews regarding docs and precision
kaushikb11 Mar 24, 2022
e82544c
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
4500a7e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
98ba21f
Apply suggestions from code review
kaushikb11 Mar 24, 2022
3c10359
Address reviews & update tests
kaushikb11 Mar 24, 2022
6c0dd88
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 24, 2022
e137f19
Update testing pipeline & conftest
kaushikb11 Mar 24, 2022
a62cfa1
Fix ci
kaushikb11 Mar 24, 2022
1078a69
Add device parsing logic for HPUs
kaushikb11 Mar 24, 2022
a9dfcf3
Fix device parsing
kaushikb11 Mar 24, 2022
4665101
Use the CLI in the example
Mar 24, 2022
2ee4bbf
Docs
Mar 24, 2022
e9ae312
Merge branch 'master' into hpu_accelerator
kaushikb11 Mar 24, 2022
dc3eca7
Update docs/source/accelerators/hpu.rst
kaushikb11 Mar 24, 2022
6952125
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
91cced3
Update hmp_params
kaushikb11 Mar 24, 2022
0671d2c
Support passing amp_level to HPUPrecision
kaushikb11 Mar 24, 2022
522106e
Update HPUAccelerator
kaushikb11 Mar 24, 2022
c8b89ea
Update tests
kaushikb11 Mar 25, 2022
7d028b1
Fix precision tests
kaushikb11 Mar 25, 2022
3c86aff
Update device parsing logic
kaushikb11 Mar 25, 2022
3c8e321
Fix tests & address reviews
kaushikb11 Mar 25, 2022
dcda0ac
Update run_hpu_tests
kaushikb11 Mar 25, 2022
e254cd0
Update CLI test
jerome-habana Mar 25, 2022
c452bd2
Fix typing
kaushikb11 Mar 25, 2022
4c51b33
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
b66c867
Merge branch 'master' into hpu_accelerator
jerome-habana Mar 25, 2022
dca6b0f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2022
98e901d
Enable example test in pipeline
jerome-habana Mar 25, 2022
2860a4e
export path of modules
jerome-habana Mar 25, 2022
a297593
Fix test
kaushikb11 Mar 25, 2022
9c1fff7
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
65f1fb9
Update torch distributed
kaushikb11 Mar 25, 2022
2380887
Update strategy
kaushikb11 Mar 25, 2022
59ef6fd
Update example
kaushikb11 Mar 25, 2022
c02c1ed
Apply suggestions from code review
kaushikb11 Mar 25, 2022
beda30c
Address reviews
kaushikb11 Mar 25, 2022
eb99e52
Merge branch 'hpu_accelerator' of https://github.com/jerome-habana/py…
kaushikb11 Mar 25, 2022
c465a06
Update backend env variable for strategy
kaushikb11 Mar 25, 2022
60f2da4
Update backend env variable for strategy
kaushikb11 Mar 25, 2022

Files changed

16 changes: 16 additions & 0 deletions .azure-pipelines/hpu-tests.yml
@@ -28,5 +28,21 @@ jobs:

steps:
- bash: |
    apt-get install hwinfo
    hwinfo --short
  displayName: 'Instance HW info'

- bash: |
    pip install . --requirement requirements/test.txt
  displayName: 'Install dependencies'

- bash: |
    python ".azure-pipelines/run_hpu_tests.py"
  displayName: 'HPU Tests in parallel'

- task: PublishTestResults@2
  inputs:
    testResultsFiles: 'hpu*_test-results.xml'
    testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
  condition: succeededOrFailed()
  displayName: 'Publish test results'
142 changes: 142 additions & 0 deletions .azure-pipelines/run_hpu_tests.py
@@ -0,0 +1,142 @@
"""This file is called from the hpu-tests.yml pipeline.

The script runs the HPU tests in parallel. The tests run are:
1. test_inference_only on four cards
2. test_all_stages on two cards
3. the complete HPU test suite on one card
4. the complete HPU test suite on eight cards.
"""
import itertools
import subprocess
import sys

HPU_TESTS_DICTIONARY = {
    "hpu1_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \
        --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \
        --forked \
        --junitxml=hpu1_test-results.xml",
    "hpu2_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        -k test_all_stages \
        --hpus 2 \
        --verbose \
        --capture=no \
        --forked \
        --junitxml=hpu2_test-results.xml",
    "hpu4_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        -k test_inference_only \
        --hpus 4 \
        --capture=no \
        --verbose \
        --forked \
        --junitxml=hpu4_test-results.xml",
    "hpu8_test": "python -m coverage run --source pytorch_lightning -m pytest -sv tests/accelerators/test_hpu.py \
        --hmp-bf16 'tests/accelerators/ops_bf16_mnist.txt' \
        --hmp-fp32 'tests/accelerators/ops_fp32_mnist.txt' \
        --forked \
        --hpus 8 \
        --junitxml=hpu8_test-results.xml",
}

HPU1_TEST = HPU_TESTS_DICTIONARY["hpu1_test"]
HPU2_TEST = HPU_TESTS_DICTIONARY["hpu2_test"]
HPU4_TEST = HPU_TESTS_DICTIONARY["hpu4_test"]
HPU8_TEST = HPU_TESTS_DICTIONARY["hpu8_test"]

PARALLEL_HPU_TESTS_EXECUTION = [[HPU4_TEST, HPU1_TEST], [HPU2_TEST, HPU1_TEST], [HPU8_TEST]]
TIMEOUT = 60
TIMEOUT_EXIT_CODE = -9


def run_hpu_tests_parallel(timeout=TIMEOUT):
    """Run the HPU tests in parallel.

    The tests are run in subprocesses to utilize all eight cards available in the DL1 instance.
    Since the HPU tests are expected to take at most 60 seconds, a process is killed if it exceeds the timeout.
    Returns the list of exit statuses of the HPU tests that were run in the subprocesses:
    0 means a test run succeeded, a non-zero status means it failed,
    and TIMEOUT_EXIT_CODE (-9) is recorded when a run timed out.

    Args:
        timeout: The threshold time, in seconds, allowed for each HPU test subprocess.
            The exception is logged if the timeout expires.
    """
    exit_status = []
    with open("stdout_log.txt", "w") as stdout_log, open("error_log.txt", "w") as error_log:
        for hpu_tests in PARALLEL_HPU_TESTS_EXECUTION:
            process_list = [
                subprocess.Popen(
                    each_hpu_test, shell=True, stdout=stdout_log, stderr=error_log, universal_newlines=True
                )
                for each_hpu_test in hpu_tests
            ]
            for process in process_list:
                try:
                    exit_status.append(process.wait(timeout=timeout))
                except subprocess.TimeoutExpired as e:
                    print(e)
                    print("Killing the process....")
                    process.kill()
                    exit_status.append(TIMEOUT_EXIT_CODE)
    return exit_status


def zip_cmd_exitcode(exit_status):
    """Pair each executed HPU test command with its exit status.

    Returns a list of (test command, exit status) tuples.

    Args:
        exit_status: The exit statuses returned by run_hpu_tests_parallel().
    """
    # Flatten the parallel batches into a single ordered list of commands.
    hpu_tests_called = list(itertools.chain(*PARALLEL_HPU_TESTS_EXECUTION))
    return list(zip(hpu_tests_called, exit_status))


def print_logs(filename):
    """Read the given file and print its contents.

    Args:
        filename: The log filename to print to the console.
    """
    with open(filename) as f:
        print(f.read())


def print_subprocess_logs_and_return_status(exit_status):
    """Print the subprocess stdout and stderr logs and return the overall test status.

    Based on the exit statuses of the HPU tests, returns 0 (success) or 1 (failure) to main().

    Args:
        exit_status: The exit statuses returned by run_hpu_tests_parallel().
    """
    if all(v == 0 for v in exit_status):
        print("All HPU tests passed")
        print_logs("stdout_log.txt")
        return 0
    print("HPU tests are failing")
    print("Printing stdout_log.txt...")
    print_logs("stdout_log.txt")
    print("Printing error_log.txt...")
    print_logs("error_log.txt")
    return 1


def main():
    exit_status = run_hpu_tests_parallel(timeout=TIMEOUT)
    status_list = zip_cmd_exitcode(exit_status)
    print("HPU Tests executed and their exit status:", status_list)
    return print_subprocess_logs_and_return_status(exit_status)


if __name__ == "__main__":
    sys.exit(main())
133 changes: 133 additions & 0 deletions pl_examples/hpu_examples/simple_mnist/mnist.py
@@ -0,0 +1,133 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule
from pytorch_lightning.plugins import HPUPrecisionPlugin
from pytorch_lightning.strategies.hpu import HPUStrategy
from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy
from pytorch_lightning.utilities.imports import _HPU_AVAILABLE


def parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="PyTorch Classification Training")

    parser.add_argument("-b", "--batch-size", default=32, type=int)
    parser.add_argument("--epochs", default=1, type=int, metavar="N", help="number of total epochs to run")
    parser.add_argument(
        "--hpus", default=1, type=int, metavar="N", help="number of Habana accelerators for training (default: 1)"
    )
    parser.add_argument("--hmp", dest="is_hmp", action="store_true", help="enable Habana mixed precision mode")
    parser.add_argument("--hmp-bf16", default="", help="path to bf16 ops list in hmp O1 mode")
    parser.add_argument("--hmp-fp32", default="", help="path to fp32 ops list in hmp O1 mode")
    parser.add_argument("--hmp-opt-level", default="O1", help="choose optimization level for hmp")
    parser.add_argument("--hmp-verbose", action="store_true", help="enable verbose mode for hmp")

    return parser.parse_args()


class LitClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        probs = self(x)
        acc = self.accuracy(probs, y)
        return acc

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = self.accuracy(logits, y)
        return acc

    def accuracy(self, logits, y):
        acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
        return acc

    def validation_epoch_end(self, outputs) -> None:
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)


if __name__ == "__main__":

    if _HPU_AVAILABLE:

        args = parse_args()

        # Init our model
        model = LitClassifier()

        # Init DataLoader from MNIST Dataset
        dm = MNISTDataModule(batch_size=args.batch_size)

        # TBD: import these keys from hmp
        hmp_keys = ["level", "verbose", "bf16_ops", "fp32_ops"]
        hmp_params = dict.fromkeys(hmp_keys)
        hmp_params["level"] = args.hmp_opt_level
        hmp_params["verbose"] = args.hmp_verbose
        hmp_params["bf16_ops"] = args.hmp_bf16  # e.g. "./pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt"
        hmp_params["fp32_ops"] = args.hmp_fp32  # e.g. "./pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt"

        parallel_devices = args.hpus
        hpustrat_1 = HPUStrategy(
            device=torch.device("hpu"), precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params)
        )
        hpustrat_8 = HPUParallelStrategy(
            parallel_devices=[torch.device("hpu")] * parallel_devices,
            precision_plugin=HPUPrecisionPlugin(precision=16, hmp_params=hmp_params),
        )

        # Initialize a trainer
        trainer = pl.Trainer(
            strategy=hpustrat_8 if (parallel_devices == 8) else hpustrat_1,
            devices=parallel_devices,
            max_epochs=args.epochs,
            default_root_dir=os.getcwd(),
            accelerator="hpu",
        )

        # Train the model ⚡
        trainer.fit(model, datamodule=dm)
        trainer.test(model, datamodule=dm)
        trainer.validate(model, datamodule=dm)

    else:
        print("This example is supported only on HPU!")
2 changes: 2 additions & 0 deletions pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt
@@ -0,0 +1,2 @@
linear
relu
1 change: 1 addition & 0 deletions pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt
@@ -0,0 +1 @@
cross_entropy
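
These two op lists drive Habana mixed precision (hmp): ops named in ops_bf16_mnist.txt are cast to bfloat16 while ops named in ops_fp32_mnist.txt are kept in fp32. A minimal sketch of wiring such lists into the precision plugin, mirroring the example above (the paths assume the pl_examples layout shown in this diff):

from pytorch_lightning.plugins import HPUPrecisionPlugin

precision_plugin = HPUPrecisionPlugin(
    precision=16,
    hmp_params={
        "level": "O1",  # hmp optimization level, as with --hmp-opt-level
        "verbose": False,
        "bf16_ops": "pl_examples/hpu_examples/simple_mnist/ops_bf16_mnist.txt",
        "fp32_ops": "pl_examples/hpu_examples/simple_mnist/ops_fp32_mnist.txt",
    },
)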
1 change: 1 addition & 0 deletions pytorch_lightning/accelerators/__init__.py
@@ -13,5 +13,6 @@
from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401
from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401
from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa: F401
1 change: 1 addition & 0 deletions pytorch_lightning/accelerators/accelerator.py
@@ -28,6 +28,7 @@ class Accelerator(ABC):
    - GPU
    - TPU
    - IPU
    - HPU
    """

    def setup_environment(self, root_device: torch.device) -> None:
53 changes: 53 additions & 0 deletions pytorch_lightning/accelerators/hpu.py
@@ -0,0 +1,53 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, List, Union

import torch

from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities import _HPU_AVAILABLE


class HPUAccelerator(Accelerator):
    """Accelerator for HPU devices."""

    @staticmethod
    def name() -> str:
        """Name of the Accelerator."""
        return "hpu"

    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
        """HPU device stats aren't supported yet."""
        return {}

    @staticmethod
    def parse_devices(devices: int) -> int:
        """Accelerator device parsing logic."""
        return devices

    @staticmethod
    def get_parallel_devices(devices: int) -> List[int]:
        """Gets parallel devices for the Accelerator."""
        return list(range(devices))

    @staticmethod
    def auto_device_count() -> int:
        """Get the number of devices when set to auto."""
        # TODO: Update this when the API is exposed by the Habana team
        return 8

    @staticmethod
    def is_available() -> bool:
        """Return whether HPUs are available."""
        return _HPU_AVAILABLE
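
A quick sketch of how these static hooks behave, following the implementation above (is_available depends on the habana_frameworks import check behind _HPU_AVAILABLE):

from pytorch_lightning.accelerators.hpu import HPUAccelerator

HPUAccelerator.auto_device_count()      # 8, fixed until Habana exposes a query API
HPUAccelerator.parse_devices(4)         # 4, passed through unchanged
HPUAccelerator.get_parallel_devices(4)  # [0, 1, 2, 3]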