From c11eb87b4e062345957bc833177229a42a397226 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Fri, 17 Sep 2021 22:15:25 +0000
Subject: [PATCH 01/23] Add interface to accelerator to get_device_stats

---
 pytorch_lightning/accelerators/accelerator.py |  5 ++
 pytorch_lightning/accelerators/gpu.py         | 63 +++++++++++++++++++
 pytorch_lightning/accelerators/tpu.py         | 17 ++++-
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 3036fd83ebf22..63522cab095ef 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -41,6 +41,7 @@ class Accelerator:
     - CPU
     - GPU
     - TPU
+    - IPU
 
     Each Accelerator gets two plugins upon initialization:
     One to handle differences from the training routine and one to handle different precisions.
@@ -436,6 +437,10 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
         """
         return self.training_type_plugin.restore_checkpoint_after_pre_dispatch
 
+    def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
+        """Gets stats for a given device."""
+        pass
+
     def on_train_start(self) -> None:
         """Called when train begins."""
         return self.training_type_plugin.on_train_start()
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 6a38cd2cf50e9..2951258195e5d 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -13,12 +13,16 @@
 # limitations under the License.
 import logging
 import os
+import shutil
+import subprocess
+from typing import Any, Dict, List, Optional
 
 import torch
 
 import pytorch_lightning as pl
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
 
 _log = logging.getLogger(__name__)
 
@@ -39,6 +43,13 @@ def setup(self, trainer: "pl.Trainer") -> None:
                 If the selected device is not GPU.
         """
         self.set_nvidia_flags(trainer.local_rank)
+
+        # The logical device IDs for selected devices
+        self._device_ids: List[int] = sorted(set(trainer.data_parallel_device_ids))
+
+        # The unmasked real GPU IDs
+        self._gpu_ids: List[int] = self._get_gpu_ids(self._device_ids)
+
         return super().setup(trainer)
 
     def on_train_start(self) -> None:
@@ -53,6 +64,58 @@ def set_nvidia_flags(local_rank: int) -> None:
         devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
 
+    def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
+        """Gets stats for the given GPU device."""
+        if _TORCH_GREATER_EQUAL_1_8:
+            return torch.cuda.memory_stats(device=device)
+        else:
+            gpu_stat_keys = [
+                ("utilization.gpu", "%"),
+                ("memory.used", "MB"),
+                ("memory.free", "MB"),
+                ("utilization.memory", "%"),
+                ("fan.speed", "%"),
+                ("temperature.gpu", "°C"),
+                ("temperature.memory", "°C"),
+            ]
+            gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys])
+            device_stats = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys)
+            return device_stats
+
+    def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]:
+        if not queries:
+            return []
+
+        """Run nvidia-smi to get the gpu stats"""
+        gpu_query = ",".join(queries)
+        format = "csv,nounits,noheader"
+        gpu_ids = ",".join(self._gpu_ids)
+        result = subprocess.run(
+            [shutil.which("nvidia-smi"), f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"],
+            encoding="utf-8",
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
+            check=True,
+        )
+
+        def _to_float(x: str) -> float:
+            try:
+                return float(x)
+            except ValueError:
+                return 0.0
+
+        stats = result.stdout.strip().split(os.linesep)
+        stats = [[_to_float(x) for x in s.split(", ")] for s in stats]
+        return stats
+
+    @staticmethod
+    def _get_gpu_ids(device_ids: List[int]) -> List[str]:
+        """Get the unmasked real GPU IDs."""
+        # All devices if `CUDA_VISIBLE_DEVICES` unset
+        default = ",".join(str(i) for i in range(torch.cuda.device_count()))
+        cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
+        return [cuda_visible_devices[device_id].strip() for device_id in device_ids]
+
     def teardown(self) -> None:
         super().teardown()
         self._move_optimizer_state(torch.device("cpu"))
diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
index 954bed3dbc58a..302b09d9a21fb 100644
--- a/pytorch_lightning/accelerators/tpu.py
+++ b/pytorch_lightning/accelerators/tpu.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Dict, Optional
 
 import torch
 from torch.optim import Optimizer
@@ -59,3 +59,18 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
         for opt in self.optimizers:
             for p, v in opt.state.items():
                 opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
+
+    def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
+        """Gets stats for the given TPU device."""
+        device_stats = {}
+        memory_info = xm.get_memory_info(device)
+
+        free_memory = memory_info["kb_free"]
+        peak_memory = memory_info["kb_total"] - free_memory
+
+        free_memory = self.training_type_plugin.reduce(free_memory) * 0.001
+        peak_memory = self.training_type_plugin.reduce(peak_memory) * 0.001
+
+        device_stats["avg. free memory (MB)"] = free_memory
+        device_stats["avg. peak memory (MB)"] = peak_memory
+        return device_stats

From cba491627940b876f23a19f7f229f9ea5ffe6e41 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Fri, 17 Sep 2021 22:20:45 +0000
Subject: [PATCH 02/23] Update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b6884971bf4e..7b0e2c10dcb54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -133,6 +133,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389))
 
 
+- Added `get_device_stats` to Accelerator interface and implement it for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586))
+
+
 ### Changed
 
 - `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with new [neptune-client](https://github.com/neptune-ai/neptune-client) API ([#6867](https://github.com/PyTorchLightning/pytorch-lightning/pull/6867)).

From d0e1233a3896faa85d54fb9b2dda623735251d92 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Fri, 17 Sep 2021 22:57:29 +0000
Subject: [PATCH 03/23] Address comments: make get_device_stats abstract with a required device arg

---
 pytorch_lightning/accelerators/accelerator.py | 6 ++++--
 pytorch_lightning/accelerators/gpu.py         | 4 ++--
 pytorch_lightning/accelerators/tpu.py         | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 63522cab095ef..55d0194ec23e4 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import contextlib
 from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
+from abc import ABC, abstractmethod
 
 import torch
 from torch import Tensor
@@ -33,7 +34,7 @@
     from torch.cuda.amp import GradScaler
 
 
-class Accelerator:
+class Accelerator(ABC):
     """The Accelerator Base Class. An Accelerator is meant to deal with one type of Hardware.
 
     Currently there are accelerators for:
@@ -437,7 +438,8 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
         """
         return self.training_type_plugin.restore_checkpoint_after_pre_dispatch
 
-    def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
+    @abstractmethod
+    def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]:
         """Gets stats for a given device."""
         pass
 
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 2951258195e5d..53c7d8ab01184 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -15,7 +15,7 @@
 import os
 import shutil
 import subprocess
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 
@@ -64,7 +64,7 @@ def set_nvidia_flags(local_rank: int) -> None:
         devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
 
-    def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
+    def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]:
         """Gets stats for the given GPU device."""
         if _TORCH_GREATER_EQUAL_1_8:
             return torch.cuda.memory_stats(device=device)
diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
index 302b09d9a21fb..e1e0386e53ba4 100644
--- a/pytorch_lightning/accelerators/tpu.py
+++ b/pytorch_lightning/accelerators/tpu.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union
 
 import torch
 from torch.optim import Optimizer
@@ -60,7 +60,7 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
             for p, v in opt.state.items():
                 opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
 
-    def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
+    def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]:
         """Gets stats for the given TPU device."""
         device_stats = {}
         memory_info = xm.get_memory_info(device)

From 269f3fff8591824b1b2664152c906aa89fd3a186 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 17 Sep 2021 22:58:41 +0000
Subject: [PATCH 04/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/accelerators/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 55d0194ec23e4..43167417c248c 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
 from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
 
 import torch
 from torch import Tensor

From 4d8cc759c86e26b1a701425e57196965591f9c63 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 00:21:35 +0000
Subject: [PATCH 05/23] Address comments: fix device annotation, query nvidia-smi per device

---
 pytorch_lightning/accelerators/accelerator.py |  2 +-
 pytorch_lightning/accelerators/gpu.py         | 76 +++++++++----------
 pytorch_lightning/accelerators/tpu.py         |  2 +-
 3 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 55d0194ec23e4..01eebfc362644 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -439,7 +439,7 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
         return self.training_type_plugin.restore_checkpoint_after_pre_dispatch
 
     @abstractmethod
-    def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]:
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for a given device."""
         pass
 
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 53c7d8ab01184..0df8dffdd9a3f 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -15,7 +15,7 @@
 import os
 import shutil
 import subprocess
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Union
 
 import torch
 
@@ -31,25 +31,18 @@ class GPUAccelerator(Accelerator):
     """Accelerator for GPU devices."""
 
     def setup_environment(self) -> None:
+        """
+        Raises:
+            MisconfigurationException:
+                If the selected device is not GPU.
+        """
         super().setup_environment()
         if "cuda" not in str(self.root_device):
             raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
         torch.cuda.set_device(self.root_device)
 
     def setup(self, trainer: "pl.Trainer") -> None:
-        """
-        Raises:
-            MisconfigurationException:
-                If the selected device is not GPU.
-        """
         self.set_nvidia_flags(trainer.local_rank)
-
-        # The logical device IDs for selected devices
-        self._device_ids: List[int] = sorted(set(trainer.data_parallel_device_ids))
-
-        # The unmasked real GPU IDs
-        self._gpu_ids: List[int] = self._get_gpu_ids(self._device_ids)
-
         return super().setup(trainer)
 
     def on_train_start(self) -> None:
@@ -64,34 +57,33 @@ def set_nvidia_flags(local_rank: int) -> None:
         devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
 
-    def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]:
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device."""
         if _TORCH_GREATER_EQUAL_1_8:
-            return torch.cuda.memory_stats(device=device)
+            return torch.cuda.memory_stats(device)  
         else:
-            gpu_stat_keys = [
-                ("utilization.gpu", "%"),
-                ("memory.used", "MB"),
-                ("memory.free", "MB"),
-                ("utilization.memory", "%"),
-                ("fan.speed", "%"),
-                ("temperature.gpu", "°C"),
-                ("temperature.memory", "°C"),
-            ]
-            gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys])
-            device_stats = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys)
-            return device_stats
-
-    def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]:
-        if not queries:
-            return []
-
-        """Run nvidia-smi to get the gpu stats"""
-        gpu_query = ",".join(queries)
+            return self._get_gpu_stats(device)  
+
+    def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]:
+        nvidia_smi_path = shutil.which("nvidia-smi")
+        if nvidia_smi_path is None:
+            raise FileNotFoundError("nvidia-smi: command not found")
+
+        gpu_stat_keys = [
+            "utilization.gpu",
+            "memory.used",
+            "memory.free",
+            "utilization.memory",
+            "fan.speed",
+            "temperature.gpu",
+            "temperature.memoy",
+        ]
+        gpu_ids = self._get_gpu_id(device.index)
+
+        gpu_query = ",".join(gpu_stat_keys)
         format = "csv,nounits,noheader"
-        gpu_ids = ",".join(self._gpu_ids)
         result = subprocess.run(
-            [shutil.which("nvidia-smi"), f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"],
+            [nvidia_smi_path, f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"],
             encoding="utf-8",
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
@@ -104,17 +96,17 @@ def _to_float(x: str) -> float:
             except ValueError:
                 return 0.0
 
-        stats = result.stdout.strip().split(os.linesep)
-        stats = [[_to_float(x) for x in s.split(", ")] for s in stats]
-        return stats
+        stats = [_to_float(x) for x in result.stdout.strip().split(os.linesep)]
+        for key in gpu_stat_keys:
+            gpu_stats = {key: stat for _, stat in enumerate(stats)}
+        return gpu_stats
 
-    @staticmethod
-    def _get_gpu_ids(device_ids: List[int]) -> List[str]:
+    def _get_gpu_id(self, device_id: int) -> List[str]:
         """Get the unmasked real GPU IDs."""
         # All devices if `CUDA_VISIBLE_DEVICES` unset
         default = ",".join(str(i) for i in range(torch.cuda.device_count()))
         cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
-        return [cuda_visible_devices[device_id].strip() for device_id in device_ids]
+        return cuda_visible_devices[device_id].strip()
 
     def teardown(self) -> None:
         super().teardown()
diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
index e1e0386e53ba4..58272d6285347 100644
--- a/pytorch_lightning/accelerators/tpu.py
+++ b/pytorch_lightning/accelerators/tpu.py
@@ -60,7 +60,7 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
             for p, v in opt.state.items():
                 opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
 
-    def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]:
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given TPU device."""
         device_stats = {}
         memory_info = xm.get_memory_info(device)

From 6d9cc2e3c4d300a748a5de036ed8ab72063d45f2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 18 Sep 2021 00:23:11 +0000
Subject: [PATCH 06/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/accelerators/gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 0df8dffdd9a3f..83c580c0af7d6 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -60,9 +60,9 @@ def set_nvidia_flags(local_rank: int) -> None:
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device."""
         if _TORCH_GREATER_EQUAL_1_8:
-            return torch.cuda.memory_stats(device)  
+            return torch.cuda.memory_stats(device)
         else:
-            return self._get_gpu_stats(device)  
+            return self._get_gpu_stats(device)
 
     def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]:
         nvidia_smi_path = shutil.which("nvidia-smi")

From 018e5cd9dfd0c3db083488b55327903b4465c68e Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 06:48:00 +0000
Subject: [PATCH 07/23] Fix GPU stats: parse nvidia-smi CSV output and restore metric units

---
 pytorch_lightning/accelerators/gpu.py | 52 ++++++++++++++++-----------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 0df8dffdd9a3f..30f16f3b9db29 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -60,30 +60,39 @@ def set_nvidia_flags(local_rank: int) -> None:
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device."""
         if _TORCH_GREATER_EQUAL_1_8:
-            return torch.cuda.memory_stats(device)  
+            return torch.cuda.memory_stats(device)
         else:
-            return self._get_gpu_stats(device)  
+            return self._get_gpu_stats(device)
 
     def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]:
-        nvidia_smi_path = shutil.which("nvidia-smi")
-        if nvidia_smi_path is None:
-            raise FileNotFoundError("nvidia-smi: command not found")
+        """Get the current gpu usage.
 
-        gpu_stat_keys = [
-            "utilization.gpu",
-            "memory.used",
-            "memory.free",
-            "utilization.memory",
-            "fan.speed",
-            "temperature.gpu",
-            "temperature.memoy",
-        ]
-        gpu_ids = self._get_gpu_id(device.index)
+        Return:
+            A dictionary in which the keys are device ids as integers and
+            values are memory usage as integers in MB.
 
+        Raises:
+            FileNotFoundError:
+                If nvidia-smi installation not found
+        """
+        gpu_stat_metrics = [
+            ("utilization.gpu", "%"),
+            ("memory.used", "MB"),
+            ("memory.free", "MB"),
+            ("utilization.memory", "%"),
+            ("fan.speed", "%"),
+            ("temperature.gpu", "°C"),
+            ("temperature.memory", "°C"),
+        ]
+        gpu_stat_keys = [k for k, _ in gpu_stat_metrics]
         gpu_query = ",".join(gpu_stat_keys)
-        format = "csv,nounits,noheader"
+
+        gpu_id = self._get_gpu_id(device.index)
+        nvidia_smi_path = shutil.which("nvidia-smi")
+        if nvidia_smi_path is None:
+            raise FileNotFoundError("nvidia-smi: command not found")
         result = subprocess.run(
-            [nvidia_smi_path, f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"],
+            [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"],
             encoding="utf-8",
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
@@ -96,9 +105,12 @@ def _to_float(x: str) -> float:
             except ValueError:
                 return 0.0
 
-        stats = [_to_float(x) for x in result.stdout.strip().split(os.linesep)]
-        for key in gpu_stat_keys:
-            gpu_stats = {key: stat for _, stat in enumerate(stats)}
+        s = result.stdout.strip()
+        stats = [_to_float(x) for x in s.split(", ")]
+
+        gpu_stats = {}
+        for i, (x, unit) in enumerate(gpu_stat_metrics):
+            gpu_stats[f"{x} ({unit})"] = stats[i]
         return gpu_stats
 
     def _get_gpu_id(self, device_id: int) -> List[str]:

From ec8084d3689c88fa2c4c0e6b8951df7da1bf7278 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 07:01:15 +0000
Subject: [PATCH 08/23] Fix: move nvidia-smi stats helpers to module-level functions

---
 pytorch_lightning/accelerators/gpu.py | 116 +++++++++++++-------------
 1 file changed, 59 insertions(+), 57 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 30f16f3b9db29..6eff33326d673 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -62,64 +62,66 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         if _TORCH_GREATER_EQUAL_1_8:
             return torch.cuda.memory_stats(device)
         else:
-            return self._get_gpu_stats(device)
-
-    def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]:
-        """Get the current gpu usage.
-
-        Return:
-            A dictionary in which the keys are device ids as integers and
-            values are memory usage as integers in MB.
-
-        Raises:
-            FileNotFoundError:
-                If nvidia-smi installation not found
-        """
-        gpu_stat_metrics = [
-            ("utilization.gpu", "%"),
-            ("memory.used", "MB"),
-            ("memory.free", "MB"),
-            ("utilization.memory", "%"),
-            ("fan.speed", "%"),
-            ("temperature.gpu", "°C"),
-            ("temperature.memory", "°C"),
-        ]
-        gpu_stat_keys = [k for k, _ in gpu_stat_metrics]
-        gpu_query = ",".join(gpu_stat_keys)
-
-        gpu_id = self._get_gpu_id(device.index)
-        nvidia_smi_path = shutil.which("nvidia-smi")
-        if nvidia_smi_path is None:
-            raise FileNotFoundError("nvidia-smi: command not found")
-        result = subprocess.run(
-            [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"],
-            encoding="utf-8",
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
-            check=True,
-        )
-
-        def _to_float(x: str) -> float:
-            try:
-                return float(x)
-            except ValueError:
-                return 0.0
-
-        s = result.stdout.strip()
-        stats = [_to_float(x) for x in s.split(", ")]
-
-        gpu_stats = {}
-        for i, (x, unit) in enumerate(gpu_stat_metrics):
-            gpu_stats[f"{x} ({unit})"] = stats[i]
-        return gpu_stats
-
-    def _get_gpu_id(self, device_id: int) -> List[str]:
-        """Get the unmasked real GPU IDs."""
-        # All devices if `CUDA_VISIBLE_DEVICES` unset
-        default = ",".join(str(i) for i in range(torch.cuda.device_count()))
-        cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
-        return cuda_visible_devices[device_id].strip()
+            return _get_gpu_stats(device)
 
     def teardown(self) -> None:
         super().teardown()
         self._move_optimizer_state(torch.device("cpu"))
+
+
+def _get_gpu_stats(device: torch.device) -> Dict[str, float]:
+    """Get the current gpu usage.
+
+    Return:
+        A dictionary in which the keys are device ids as integers and
+        values are memory usage as integers in MB.
+
+    Raises:
+        FileNotFoundError:
+            If nvidia-smi installation not found
+    """
+    gpu_stat_metrics = [
+        ("utilization.gpu", "%"),
+        ("memory.used", "MB"),
+        ("memory.free", "MB"),
+        ("utilization.memory", "%"),
+        ("fan.speed", "%"),
+        ("temperature.gpu", "°C"),
+        ("temperature.memory", "°C"),
+    ]
+    gpu_stat_keys = [k for k, _ in gpu_stat_metrics]
+    gpu_query = ",".join(gpu_stat_keys)
+
+    gpu_id = _get_gpu_id(device.index)
+    nvidia_smi_path = shutil.which("nvidia-smi")
+    if nvidia_smi_path is None:
+        raise FileNotFoundError("nvidia-smi: command not found")
+    result = subprocess.run(
+        [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"],
+        encoding="utf-8",
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
+        check=True,
+    )
+
+    def _to_float(x: str) -> float:
+        try:
+            return float(x)
+        except ValueError:
+            return 0.0
+
+    s = result.stdout.strip()
+    stats = [_to_float(x) for x in s.split(", ")]
+
+    gpu_stats = {}
+    for i, (x, unit) in enumerate(gpu_stat_metrics):
+        gpu_stats[f"{x} ({unit})"] = stats[i]
+    return gpu_stats
+
+
+def _get_gpu_id(device_id: int) -> List[str]:
+    """Get the unmasked real GPU IDs."""
+    # All devices if `CUDA_VISIBLE_DEVICES` unset
+    default = ",".join(str(i) for i in range(torch.cuda.device_count()))
+    cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
+    return cuda_visible_devices[device_id].strip()

From 5abce119117ed4832a18261b9e0f72c8048a5c03 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 07:05:12 +0000
Subject: [PATCH 09/23] Rename _get_gpu_stats to _get_nvidia_gpu_stats and update docstring

---
 pytorch_lightning/accelerators/gpu.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 6eff33326d673..b4a41c367604c 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -62,19 +62,18 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         if _TORCH_GREATER_EQUAL_1_8:
             return torch.cuda.memory_stats(device)
         else:
-            return _get_gpu_stats(device)
+            return _get_nvidia_gpu_stats(device)
 
     def teardown(self) -> None:
         super().teardown()
         self._move_optimizer_state(torch.device("cpu"))
 
 
-def _get_gpu_stats(device: torch.device) -> Dict[str, float]:
-    """Get the current gpu usage.
+def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]:
+    """Get GPU stats including memory, fan speed, and temperature from nvidia-smi
 
     Return:
-        A dictionary in which the keys are device ids as integers and
-        values are memory usage as integers in MB.
+        A dictionary mapping the metrics to their values.
 
     Raises:
         FileNotFoundError:

From 3936242538509620b3f27f1f1bc616ee9943a5ba Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 18 Sep 2021 07:06:25 +0000
Subject: [PATCH 10/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/accelerators/gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index b4a41c367604c..2002a3a51a55e 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -70,7 +70,7 @@ def teardown(self) -> None:
 
 
 def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]:
-    """Get GPU stats including memory, fan speed, and temperature from nvidia-smi
+    """Get GPU stats including memory, fan speed, and temperature from nvidia-smi.
 
     Return:
         A dictionary mapping the metrics to their values.

From 0fdd3687932aa1b2bedb232a18e51788a6fd284f Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 08:18:19 +0000
Subject: [PATCH 11/23] fix tests

---
 pytorch_lightning/accelerators/accelerator.py |  1 -
 pytorch_lightning/accelerators/cpu.py         |  7 +++++++
 pytorch_lightning/accelerators/gpu.py         | 12 ++++++++++--
 pytorch_lightning/accelerators/ipu.py         |  6 +++++-
 pytorch_lightning/accelerators/tpu.py         |  6 +++++-
 5 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 0103910e68b7d..2c24d7c31447a 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -441,7 +441,6 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
     @abstractmethod
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for a given device."""
-        pass
 
     def on_train_start(self) -> None:
         """Called when train begins."""
diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py
index 46e74193fb557..2cfe996307e94 100644
--- a/pytorch_lightning/accelerators/cpu.py
+++ b/pytorch_lightning/accelerators/cpu.py
@@ -11,6 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Any, Dict, Union
+
+import torch
+
 import pytorch_lightning as pl
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -29,3 +33,6 @@ def setup(self, trainer: "pl.Trainer") -> None:
             raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.")
 
         return super().setup(trainer)
+
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
+        raise NotImplementedError
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index b4a41c367604c..e898de2824004 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -58,7 +58,15 @@ def set_nvidia_flags(local_rank: int) -> None:
         _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """Gets stats for the given GPU device."""
+        """Gets stats for the given GPU device.
+        
+        Returns:
+        A dictionary mapping the metrics to their values.
+
+        Raises:
+            FileNotFoundError:
+                If nvidia-smi installation not found
+        """
         if _TORCH_GREATER_EQUAL_1_8:
             return torch.cuda.memory_stats(device)
         else:
@@ -72,7 +80,7 @@ def teardown(self) -> None:
 def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]:
     """Get GPU stats including memory, fan speed, and temperature from nvidia-smi
 
-    Return:
+    Returns:
         A dictionary mapping the metrics to their values.
 
     Raises:
diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py
index 4de644b15eac5..b32593deeb810 100644
--- a/pytorch_lightning/accelerators/ipu.py
+++ b/pytorch_lightning/accelerators/ipu.py
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable
+from typing import Any, Callable, Dict, Union
 
+import torch
 from torch.optim import Optimizer
 
 import pytorch_lightning as pl
@@ -32,3 +33,6 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
     def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None:
         # Optimizer step is handled by the IPU accelerator.
         lambda_closure()
+
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
+        raise NotImplementedError
diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
index 58272d6285347..a1e006528c9b5 100644
--- a/pytorch_lightning/accelerators/tpu.py
+++ b/pytorch_lightning/accelerators/tpu.py
@@ -61,7 +61,11 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
                 opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """Gets stats for the given TPU device."""
+        """Gets stats for the given TPU device.
+
+        Returns:
+        A dictionary mapping the metrics (free memory and peak memory) to their values.
+        """
         device_stats = {}
         memory_info = xm.get_memory_info(device)
 

From d8314cf72b4b41fa0ba92e2fce2a761b7e2e64b0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 18 Sep 2021 08:19:49 +0000
Subject: [PATCH 12/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/accelerators/gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 44117ded5a3ec..f1851a00881c1 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -59,7 +59,7 @@ def set_nvidia_flags(local_rank: int) -> None:
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device.
-        
+
         Returns:
         A dictionary mapping the metrics to their values.
 

From 5699e858f547a10a5e42ccea95bd0b65c547a8b8 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 08:24:43 +0000
Subject: [PATCH 13/23] type fix

---
 pytorch_lightning/accelerators/gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 44117ded5a3ec..316aa7b8599ee 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -59,7 +59,7 @@ def set_nvidia_flags(local_rank: int) -> None:
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device.
-        
+
         Returns:
         A dictionary mapping the metrics to their values.
 
@@ -126,7 +126,7 @@ def _to_float(x: str) -> float:
     return gpu_stats
 
 
-def _get_gpu_id(device_id: int) -> List[str]:
+def _get_gpu_id(device_id: int) -> str:
     """Get the unmasked real GPU IDs."""
     # All devices if `CUDA_VISIBLE_DEVICES` unset
     default = ",".join(str(i) for i in range(torch.cuda.device_count()))

From 3ac0821ac9993f6dd2e76b59bacb3cedd3c7d569 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Sat, 18 Sep 2021 18:04:54 +0000
Subject: [PATCH 14/23] fix test

---
 tests/accelerators/test_accelerator_connector.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 650b7949ac1ba..062fb69c84a17 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -13,7 +13,7 @@
 # limitations under the License
 
 import os
-from typing import Optional
+from typing import Optional, Dict, Union, Any
 from unittest import mock
 
 import pytest
@@ -385,7 +385,8 @@ def creates_children(self) -> bool:
 @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
 def test_custom_accelerator(device_count_mock, setup_distributed_mock):
     class Accel(Accelerator):
-        pass
+        def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
+            return []
 
     class Prec(PrecisionPlugin):
         pass

From 1160cd069c33f2b2ae7fcf1df21d4a1622e1dac8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 18 Sep 2021 18:06:13 +0000
Subject: [PATCH 15/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/accelerators/test_accelerator_connector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 062fb69c84a17..f943d52d76e08 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -13,7 +13,7 @@
 # limitations under the License
 
 import os
-from typing import Optional, Dict, Union, Any
+from typing import Any, Dict, Optional, Union
 from unittest import mock
 
 import pytest

From ef5bc17163837c1cf21061655fb7208838408397 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <38207072+daniellepintz@users.noreply.github.com>
Date: Sun, 19 Sep 2021 16:37:37 -0700
Subject: [PATCH 16/23] Update pytorch_lightning/accelerators/gpu.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
---
 pytorch_lightning/accelerators/gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 316aa7b8599ee..688b32897586d 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -61,7 +61,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device.
 
         Returns:
-        A dictionary mapping the metrics to their values.
+            A dictionary mapping the metrics to their values.
 
         Raises:
             FileNotFoundError:

From 497680c9dee36209aaf4f2dbf8ccda16ff17df79 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Tue, 21 Sep 2021 22:13:47 +0000
Subject: [PATCH 17/23] address comments

---
 CHANGELOG.md                                     | 2 +-
 pytorch_lightning/accelerators/accelerator.py    | 5 ++---
 pytorch_lightning/accelerators/cpu.py            | 3 ---
 pytorch_lightning/accelerators/gpu.py            | 2 +-
 pytorch_lightning/accelerators/ipu.py            | 3 ---
 tests/accelerators/test_accelerator_connector.py | 3 +--
 6 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 683e7350ed039..ec41f1f68129b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -142,7 +142,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389))
 
 
-- Added `get_device_stats` to Accelerator interface and implement it for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586))
+- Added `get_device_stats` to the Accelerator Interface and added its implementation for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586))
 
 
 - Added `RichModelSummary` callback ([#9546](https://github.com/PyTorchLightning/pytorch-lightning/pull/9546))
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 2c24d7c31447a..11bf0e96d0e01 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
 
 import torch
@@ -34,7 +33,7 @@
     from torch.cuda.amp import GradScaler
 
 
-class Accelerator(ABC):
+class Accelerator:
     """The Accelerator Base Class. An Accelerator is meant to deal with one type of Hardware.
 
     Currently there are accelerators for:
@@ -438,9 +437,9 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
         """
         return self.training_type_plugin.restore_checkpoint_after_pre_dispatch
 
-    @abstractmethod
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for a given device."""
+        raise NotImplementedError
 
     def on_train_start(self) -> None:
         """Called when train begins."""
diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py
index 2cfe996307e94..3e996c1809d96 100644
--- a/pytorch_lightning/accelerators/cpu.py
+++ b/pytorch_lightning/accelerators/cpu.py
@@ -33,6 +33,3 @@ def setup(self, trainer: "pl.Trainer") -> None:
             raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.")
 
         return super().setup(trainer)
-
-    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        raise NotImplementedError
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 316aa7b8599ee..688b32897586d 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -61,7 +61,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device.
 
         Returns:
-        A dictionary mapping the metrics to their values.
+            A dictionary mapping the metrics to their values.
 
         Raises:
             FileNotFoundError:
diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py
index b32593deeb810..deacae8b0af61 100644
--- a/pytorch_lightning/accelerators/ipu.py
+++ b/pytorch_lightning/accelerators/ipu.py
@@ -33,6 +33,3 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
     def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None:
         # Optimizer step is handled by the IPU accelerator.
         lambda_closure()
-
-    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        raise NotImplementedError
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 062fb69c84a17..a8cc23ee4b241 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -385,8 +385,7 @@ def creates_children(self) -> bool:
 @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
 def test_custom_accelerator(device_count_mock, setup_distributed_mock):
     class Accel(Accelerator):
-        def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-            return []
+        pass
 
     class Prec(PrecisionPlugin):
         pass

From ae7e912caac0b30e103d6b2f10ac96bf5ebc18c2 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Thu, 23 Sep 2021 00:18:55 +0000
Subject: [PATCH 18/23] Add unit tests

---
 tests/accelerators/test_gpu.py | 36 ++++++++++++++++++++++++++++++++++
 tests/accelerators/test_tpu.py | 16 +++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 tests/accelerators/test_gpu.py
 create mode 100644 tests/accelerators/test_tpu.py

diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py
new file mode 100644
index 0000000000000..058ac1f8ef1fb
--- /dev/null
+++ b/tests/accelerators/test_gpu.py
@@ -0,0 +1,36 @@
+import torch
+
+from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin
+from pytorch_lightning.accelerators import GPUAccelerator
+from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
+from tests.helpers.runif import RunIf
+
+
+@RunIf(min_torch="1.8")
+@RunIf(min_gpus=1)
+def test_get_torch_gpu_stats(tmpdir):
+    """Test GPU get_device_stats with Pytorch >= 1.8.0."""
+    current_device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    GPUAccel = GPUAccelerator(
+        training_type_plugin=DataParallelPlugin(parallel_devices=[current_device]), precision_plugin=PrecisionPlugin()
+    )
+    gpu_stats = GPUAccel.get_device_stats(current_device)
+    fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"]
+
+    for f in fields:
+        assert any(f in h for h in gpu_stats.keys())
+
+
+@RunIf(max_torch="1.7")
+@RunIf(min_gpus=1)
+def test_get_nvidia_gpu_stats(tmpdir):
+    """Test GPU get_device_stats with Pytorch < 1.8.0."""
+    current_device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    GPUAccel = GPUAccelerator(
+        training_type_plugin=DataParallelPlugin(parallel_devices=[current_device]), precision_plugin=PrecisionPlugin()
+    )
+    gpu_stats = GPUAccel.get_device_stats(current_device)
+    fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
+
+    for f in fields:
+        assert any(f in h for h in gpu_stats.keys())
diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py
new file mode 100644
index 0000000000000..7d63c31ee7d39
--- /dev/null
+++ b/tests/accelerators/test_tpu.py
@@ -0,0 +1,16 @@
+from pytorch_lightning.plugins.training_type import TPUSpawnPlugin
+from pytorch_lightning.accelerators import TPUAccelerator
+from tests.helpers.runif import RunIf
+from pytorch_lightning.plugins import SingleTPUPlugin
+
+
+@RunIf(tpu=True)
+def test_device_stats_tpu(tmpdir):
+    """Test TPU get_device_stats."""
+    plugin = SingleTPUPlugin(1)
+    TPUAccel = TPUAccelerator(training_type_plugin=TPUSpawnPlugin(), precision_plugin=plugin)
+    tpu_stats = TPUAccel.get_device_stats("1")
+    fields = ["avg. free memory (MB)", "avg. peak memory (MB)"]
+
+    for f in fields:
+        assert any(f in h for h in tpu_stats.keys())

From 418e4a079adb7f7203cb62bcc32559763ab1cf9c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 Sep 2021 00:23:15 +0000
Subject: [PATCH 19/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/accelerators/test_gpu.py | 2 +-
 tests/accelerators/test_tpu.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py
index 058ac1f8ef1fb..85ce0cd9f0f18 100644
--- a/tests/accelerators/test_gpu.py
+++ b/tests/accelerators/test_gpu.py
@@ -1,8 +1,8 @@
 import torch
 
-from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin
 from pytorch_lightning.accelerators import GPUAccelerator
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
+from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin
 from tests.helpers.runif import RunIf
 
 
diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py
index 7d63c31ee7d39..f3a2c50c0e347 100644
--- a/tests/accelerators/test_tpu.py
+++ b/tests/accelerators/test_tpu.py
@@ -1,7 +1,7 @@
-from pytorch_lightning.plugins.training_type import TPUSpawnPlugin
 from pytorch_lightning.accelerators import TPUAccelerator
-from tests.helpers.runif import RunIf
 from pytorch_lightning.plugins import SingleTPUPlugin
+from pytorch_lightning.plugins.training_type import TPUSpawnPlugin
+from tests.helpers.runif import RunIf
 
 
 @RunIf(tpu=True)

From 46b9f3665e642e95825fabd7a8b6c5f6fd01a86e Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Thu, 23 Sep 2021 01:01:35 +0000
Subject: [PATCH 20/23] comments

---
 pytorch_lightning/accelerators/cpu.py | 4 ++++
 pytorch_lightning/accelerators/ipu.py | 3 +--
 pytorch_lightning/accelerators/tpu.py | 3 ---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py
index 3e996c1809d96..7e7ae26a2713f 100644
--- a/pytorch_lightning/accelerators/cpu.py
+++ b/pytorch_lightning/accelerators/cpu.py
@@ -33,3 +33,7 @@ def setup(self, trainer: "pl.Trainer") -> None:
             raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.")
 
         return super().setup(trainer)
+
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
+        """Returns dummy implementation for now"""
+        return {}
diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py
index deacae8b0af61..4de644b15eac5 100644
--- a/pytorch_lightning/accelerators/ipu.py
+++ b/pytorch_lightning/accelerators/ipu.py
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable, Dict, Union
+from typing import Any, Callable
 
-import torch
 from torch.optim import Optimizer
 
 import pytorch_lightning as pl
diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
index a1e006528c9b5..7e826d4317308 100644
--- a/pytorch_lightning/accelerators/tpu.py
+++ b/pytorch_lightning/accelerators/tpu.py
@@ -72,9 +72,6 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         free_memory = memory_info["kb_free"]
         peak_memory = memory_info["kb_total"] - free_memory
 
-        free_memory = self.training_type_plugin.reduce(free_memory) * 0.001
-        peak_memory = self.training_type_plugin.reduce(peak_memory) * 0.001
-
         device_stats["avg. free memory (MB)"] = free_memory
         device_stats["avg. peak memory (MB)"] = peak_memory
         return device_stats

From ccadca51daf83c395c97956c24a1f825435e27af Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 Sep 2021 01:03:31 +0000
Subject: [PATCH 21/23] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/accelerators/cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py
index 7e7ae26a2713f..baa922b6d796b 100644
--- a/pytorch_lightning/accelerators/cpu.py
+++ b/pytorch_lightning/accelerators/cpu.py
@@ -35,5 +35,5 @@ def setup(self, trainer: "pl.Trainer") -> None:
         return super().setup(trainer)
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """Returns dummy implementation for now"""
+        """Returns dummy implementation for now."""
         return {}

From 2658b4af7910e76b15fbc28986d0f4fb69e5fafc Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Thu, 23 Sep 2021 06:39:33 +0000
Subject: [PATCH 22/23] lint

---
 tests/accelerators/test_accelerator_connector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index c90abc38df4bd..650b7949ac1ba 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -13,7 +13,7 @@
 # limitations under the License
 
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Optional
 from unittest import mock
 
 import pytest

From c4f0d02d26cd63ae0a7936381c950815f7928211 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <daniellepintz@gmail.com>
Date: Thu, 23 Sep 2021 20:54:59 +0000
Subject: [PATCH 23/23] comments

---
 pytorch_lightning/accelerators/accelerator.py |  9 ++++++++-
 pytorch_lightning/accelerators/gpu.py         |  9 +++++++--
 pytorch_lightning/accelerators/tpu.py         | 14 ++++++++------
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 11bf0e96d0e01..137ee9b98ab67 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -438,7 +438,14 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
         return self.training_type_plugin.restore_checkpoint_after_pre_dispatch
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """Gets stats for a given device."""
+        """Gets stats for a given device.
+
+        Args:
+            device: device for which to get stats
+
+        Returns:
+            Dictionary of device stats
+        """
         raise NotImplementedError
 
     def on_train_start(self) -> None:
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 688b32897586d..b33903c2d60c9 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -60,6 +60,9 @@ def set_nvidia_flags(local_rank: int) -> None:
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given GPU device.
 
+        Args:
+            device: GPU device for which to get stats
+
         Returns:
             A dictionary mapping the metrics to their values.
 
@@ -69,8 +72,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """
         if _TORCH_GREATER_EQUAL_1_8:
             return torch.cuda.memory_stats(device)
-        else:
-            return _get_nvidia_gpu_stats(device)
+        return _get_nvidia_gpu_stats(device)
 
     def teardown(self) -> None:
         super().teardown()
@@ -80,6 +82,9 @@ def teardown(self) -> None:
 def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]:
     """Get GPU stats including memory, fan speed, and temperature from nvidia-smi.
 
+    Args:
+        device: GPU device for which to get stats
+
     Returns:
         A dictionary mapping the metrics to their values.
 
diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py
index 9f09fafcb5bdf..68925ab67aca9 100644
--- a/pytorch_lightning/accelerators/tpu.py
+++ b/pytorch_lightning/accelerators/tpu.py
@@ -65,15 +65,17 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Gets stats for the given TPU device.
 
+        Args:
+            device: TPU device for which to get stats
+
         Returns:
-        A dictionary mapping the metrics (free memory and peak memory) to their values.
+            A dictionary mapping the metrics (free memory and peak memory) to their values.
         """
-        device_stats = {}
         memory_info = xm.get_memory_info(device)
-
         free_memory = memory_info["kb_free"]
         peak_memory = memory_info["kb_total"] - free_memory
-
-        device_stats["avg. free memory (MB)"] = free_memory
-        device_stats["avg. peak memory (MB)"] = peak_memory
+        device_stats = {
+            "avg. free memory (MB)": free_memory,
+            "avg. peak memory (MB)": peak_memory,
+        }
         return device_stats