Skip to content

[1/4] Add get_device_stats to accelerator interface #9586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Sep 27, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c11eb87
Add interface to accelerator to get_device_stats
daniellepintz Sep 17, 2021
cba4916
Update changelog
daniellepintz Sep 17, 2021
d4252c5
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Sep 17, 2021
d0e1233
address comments
daniellepintz Sep 17, 2021
269f3ff
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 17, 2021
4d8cc75
comments
daniellepintz Sep 18, 2021
8e37419
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
6d9cc2e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
018e5cd
fix gpu
daniellepintz Sep 18, 2021
310f254
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
ec8084d
fix
daniellepintz Sep 18, 2021
5abce11
update docstring
daniellepintz Sep 18, 2021
3936242
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
0fdd368
fix tests
daniellepintz Sep 18, 2021
32f1047
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
d8314cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
5699e85
type fix
daniellepintz Sep 18, 2021
8d66aba
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 18, 2021
3ac0821
fix test
daniellepintz Sep 18, 2021
1160cd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2021
ef5bc17
Update pytorch_lightning/accelerators/gpu.py
daniellepintz Sep 19, 2021
497680c
address comments
daniellepintz Sep 21, 2021
d3d13ec
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 21, 2021
ae7e912
Add unit tests
daniellepintz Sep 23, 2021
418e4a0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
46b9f36
comments
daniellepintz Sep 23, 2021
ccadca5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
2658b4a
lint
daniellepintz Sep 23, 2021
07bc597
Merge branch 'get_device_stats' of github.com:daniellepintz/pytorch-l…
daniellepintz Sep 23, 2021
e19239e
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Sep 23, 2021
c4f0d02
comments
daniellepintz Sep 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389))


- Added `get_device_stats` to Accelerator interface and implement it for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586))


- Added `RichModelSummary` callback ([#9546](https://github.com/PyTorchLightning/pytorch-lightning/pull/9546))


Expand Down
9 changes: 8 additions & 1 deletion pytorch_lightning/accelerators/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union

import torch
Expand All @@ -33,14 +34,15 @@
from torch.cuda.amp import GradScaler


class Accelerator:
class Accelerator(ABC):
"""The Accelerator Base Class. An Accelerator is meant to deal with one type of Hardware.

Currently there are accelerators for:

- CPU
- GPU
- TPU
- IPU

Each Accelerator gets two plugins upon initialization:
One to handle differences from the training routine and one to handle different precisions.
Expand Down Expand Up @@ -436,6 +438,11 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool:
"""
return self.training_type_plugin.restore_checkpoint_after_pre_dispatch

@abstractmethod
def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Gets stats for a given device.

    Args:
        device: device for which to get stats.

    Returns:
        Dictionary of device stats; the keys and value types are accelerator
        specific (see the GPU/TPU implementations).
    """
    # NOTE: no `pass` needed — the docstring is already a valid (empty) body
    # for an abstract method.

def on_train_start(self) -> None:
    """Called when train begins.

    Delegates to the training type plugin's ``on_train_start`` hook.
    """
    return self.training_type_plugin.on_train_start()
Expand Down
65 changes: 60 additions & 5 deletions pytorch_lightning/accelerators/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,16 @@
# limitations under the License.
import logging
import os
import shutil
import subprocess
from typing import Any, Dict, List, Union

import torch

import pytorch_lightning as pl
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8

_log = logging.getLogger(__name__)

Expand All @@ -27,17 +31,17 @@ class GPUAccelerator(Accelerator):
"""Accelerator for GPU devices."""

def setup_environment(self) -> None:
    """Validates the device and binds the current process to it.

    Raises:
        MisconfigurationException:
            If the selected device is not GPU.
    """
    super().setup_environment()
    root = self.root_device
    # Fail fast when this accelerator was paired with a non-CUDA device.
    if "cuda" not in str(root):
        raise MisconfigurationException(f"Device should be GPU, got {root} instead")
    torch.cuda.set_device(root)

def setup(self, trainer: "pl.Trainer") -> None:
    """Sets the NVIDIA environment flags for the local rank, then runs the base setup.

    Note:
        Validation that the selected device is actually a GPU happens in
        ``setup_environment``, not here — the previous docstring's claim that
        this method raises ``MisconfigurationException`` was inaccurate.
    """
    self.set_nvidia_flags(trainer.local_rank)
    return super().setup(trainer)

Expand All @@ -53,6 +57,57 @@ def set_nvidia_flags(local_rank: int) -> None:
devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
_log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")

def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Gets stats for the given GPU device."""
    # torch.cuda.memory_stats is only usable from torch 1.8 onwards; on older
    # versions fall back to querying nvidia-smi directly.
    if not _TORCH_GREATER_EQUAL_1_8:
        return self._get_gpu_stats(device)
    return torch.cuda.memory_stats(device)

def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]:
    """Queries ``nvidia-smi`` for utilization/memory/temperature stats of ``device``.

    Args:
        device: the GPU device to query.

    Returns:
        Mapping from stat name (e.g. ``"memory.used"``) to its value as a float.
        Non-numeric fields (e.g. ``[Not Supported]``) are reported as ``0.0``.

    Raises:
        FileNotFoundError: if the ``nvidia-smi`` executable cannot be found.
        subprocess.CalledProcessError: if ``nvidia-smi`` exits with a non-zero status.
    """
    nvidia_smi_path = shutil.which("nvidia-smi")
    if nvidia_smi_path is None:
        raise FileNotFoundError("nvidia-smi: command not found")

    gpu_stat_keys = [
        "utilization.gpu",
        "memory.used",
        "memory.free",
        "utilization.memory",
        "fan.speed",
        "temperature.gpu",
        "temperature.memory",  # was "temperature.memoy" — an invalid query key
    ]
    gpu_id = self._get_gpu_id(device.index)

    gpu_query = ",".join(gpu_stat_keys)
    output_format = "csv,nounits,noheader"  # renamed: `format` shadowed the builtin
    result = subprocess.run(
        [nvidia_smi_path, f"--query-gpu={gpu_query}", f"--format={output_format}", f"--id={gpu_id}"],
        encoding="utf-8",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
        check=True,
    )

    def _to_float(x: str) -> float:
        try:
            return float(x)
        except ValueError:
            return 0.0

    # With `--format=csv,nounits,noheader`, nvidia-smi prints one line per
    # queried GPU with the requested fields comma-separated. We query a single
    # id, so parse the first line and pair each value with its stat key.
    # (The previous code split on os.linesep and rebuilt the dict with a single
    # repeated key, which produced a one-entry dict with garbage values.)
    values = result.stdout.strip().split(os.linesep)[0].split(", ")
    return {key: _to_float(value) for key, value in zip(gpu_stat_keys, values)}

def _get_gpu_id(self, device_id: int) -> List[str]:
"""Get the unmasked real GPU IDs."""
# All devices if `CUDA_VISIBLE_DEVICES` unset
default = ",".join(str(i) for i in range(torch.cuda.device_count()))
cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
return cuda_visible_devices[device_id].strip()

def teardown(self) -> None:
    """Tears down the GPU accelerator.

    Runs the base teardown, then moves optimizer state back to CPU so GPU
    memory is released.
    """
    super().teardown()
    self._move_optimizer_state(torch.device("cpu"))
17 changes: 16 additions & 1 deletion pytorch_lightning/accelerators/tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Optional
from typing import Any, Callable, Dict, Optional, Union

import torch
from torch.optim import Optimizer
Expand Down Expand Up @@ -59,3 +59,18 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None:
for opt in self.optimizers:
for p, v in opt.state.items():
opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)

def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
    """Gets stats for the given TPU device.

    Reports free and peak memory, averaged (reduced) across processes and
    converted from kB to MB.
    """
    memory_info = xm.get_memory_info(device)
    kb_free = memory_info["kb_free"]
    kb_peak = memory_info["kb_total"] - kb_free

    # Reduce across processes, then convert kB -> MB.
    return {
        "avg. free memory (MB)": self.training_type_plugin.reduce(kb_free) * 0.001,
        "avg. peak memory (MB)": self.training_type_plugin.reduce(kb_peak) * 0.001,
    }