From c11eb87b4e062345957bc833177229a42a397226 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Fri, 17 Sep 2021 22:15:25 +0000 Subject: [PATCH 01/23] Add interface to accelerator to get_device_stats --- pytorch_lightning/accelerators/accelerator.py | 5 ++ pytorch_lightning/accelerators/gpu.py | 63 +++++++++++++++++++ pytorch_lightning/accelerators/tpu.py | 17 ++++- 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3036fd83ebf22..63522cab095ef 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -41,6 +41,7 @@ class Accelerator: - CPU - GPU - TPU + - IPU Each Accelerator gets two plugins upon initialization: One to handle differences from the training routine and one to handle different precisions. @@ -436,6 +437,10 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool: """ return self.training_type_plugin.restore_checkpoint_after_pre_dispatch + def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]: + """Gets stats for a given device.""" + pass + def on_train_start(self) -> None: """Called when train begins.""" return self.training_type_plugin.on_train_start() diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 6a38cd2cf50e9..2951258195e5d 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -13,12 +13,16 @@ # limitations under the License. import logging import os +import shutil +import subprocess +from typing import Any, Dict, List, Optional import torch import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 _log = logging.getLogger(__name__) @@ -39,6 +43,13 @@ def setup(self, trainer: "pl.Trainer") -> None: If the selected device is not GPU. """ self.set_nvidia_flags(trainer.local_rank) + + # The logical device IDs for selected devices + self._device_ids: List[int] = sorted(set(trainer.data_parallel_device_ids)) + + # The unmasked real GPU IDs + self._gpu_ids: List[int] = self._get_gpu_ids(self._device_ids) + return super().setup(trainer) def on_train_start(self) -> None: @@ -53,6 +64,58 @@ def set_nvidia_flags(local_rank: int) -> None: devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") + def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]: + """Gets stats for the given GPU device.""" + if _TORCH_GREATER_EQUAL_1_8: + return torch.cuda.memory_stats(device=device) + else: + gpu_stat_keys = [ + ("utilization.gpu", "%"), + ("memory.used", "MB"), + ("memory.free", "MB"), + ("utilization.memory", "%"), + ("fan.speed", "%"), + ("temperature.gpu", "°C"), + ("temperature.memory", "°C"), + ] + gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys]) + device_stats = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys) + return device_stats + + def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]: + if not queries: + return [] + + """Run nvidia-smi to get the gpu stats""" + gpu_query = ",".join(queries) + format = "csv,nounits,noheader" + gpu_ids = ",".join(self._gpu_ids) + result = subprocess.run( + [shutil.which("nvidia-smi"), f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"], + encoding="utf-8", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, # for backward compatibility with python version 3.6 + check=True, + ) + + def _to_float(x: str) -> float: + try: + return float(x) + except ValueError: + return 0.0 + + stats = result.stdout.strip().split(os.linesep) + stats = [[_to_float(x) for x in s.split(", ")] for s in stats] + return stats + + @staticmethod + def _get_gpu_ids(device_ids: List[int]) -> List[str]: + """Get the unmasked real GPU IDs.""" + # All devices if `CUDA_VISIBLE_DEVICES` unset + default = ",".join(str(i) for i in range(torch.cuda.device_count())) + cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") + return [cuda_visible_devices[device_id].strip() for device_id in device_ids] + def teardown(self) -> None: super().teardown() self._move_optimizer_state(torch.device("cpu")) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 954bed3dbc58a..302b09d9a21fb 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, Optional import torch from torch.optim import Optimizer @@ -59,3 +59,18 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None: for opt in self.optimizers: for p, v in opt.state.items(): opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device) + + def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]: + """Gets stats for the given TPU device.""" + device_stats = {} + memory_info = xm.get_memory_info(device) + + free_memory = memory_info["kb_free"] + peak_memory = memory_info["kb_total"] - free_memory + + free_memory = self.training_type_plugin.reduce(free_memory) * 0.001 + peak_memory = self.training_type_plugin.reduce(peak_memory) * 0.001 + + device_stats["avg. free memory (MB)"] = free_memory + device_stats["avg. peak memory (MB)"] = peak_memory + return device_stats From cba491627940b876f23a19f7f229f9ea5ffe6e41 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Fri, 17 Sep 2021 22:20:45 +0000 Subject: [PATCH 02/23] Update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b6884971bf4e..7b0e2c10dcb54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -133,6 +133,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389)) +- Added `get_device_stats` to Accelerator interface and implement it for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586)) + + ### Changed - `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with new [neptune-client](https://github.com/neptune-ai/neptune-client) API ([#6867](https://github.com/PyTorchLightning/pytorch-lightning/pull/6867)). From d0e1233a3896faa85d54fb9b2dda623735251d92 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Fri, 17 Sep 2021 22:57:29 +0000 Subject: [PATCH 03/23] address comments --- pytorch_lightning/accelerators/accelerator.py | 6 ++++-- pytorch_lightning/accelerators/gpu.py | 4 ++-- pytorch_lightning/accelerators/tpu.py | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 63522cab095ef..55d0194ec23e4 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -13,6 +13,7 @@ # limitations under the License. import contextlib from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union +from abc import ABC, abstractmethod import torch from torch import Tensor @@ -33,7 +34,7 @@ from torch.cuda.amp import GradScaler -class Accelerator: +class Accelerator(ABC): """The Accelerator Base Class. An Accelerator is meant to deal with one type of Hardware. Currently there are accelerators for: @@ -437,7 +438,8 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool: """ return self.training_type_plugin.restore_checkpoint_after_pre_dispatch - def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]: + @abstractmethod + def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]: """Gets stats for a given device.""" pass diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 2951258195e5d..53c7d8ab01184 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -15,7 +15,7 @@ import os import shutil import subprocess -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch @@ -64,7 +64,7 @@ def set_nvidia_flags(local_rank: int) -> None: devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") - def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]: + def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]: """Gets stats for the given GPU device.""" if _TORCH_GREATER_EQUAL_1_8: return torch.cuda.memory_stats(device=device) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 302b09d9a21fb..e1e0386e53ba4 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Union import torch from torch.optim import Optimizer @@ -60,7 +60,7 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None: for p, v in opt.state.items(): opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device) - def get_device_stats(self, device: Optional[torch.device] = None) -> Dict[str, Any]: + def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]: """Gets stats for the given TPU device.""" device_stats = {} memory_info = xm.get_memory_info(device) From 269f3fff8591824b1b2664152c906aa89fd3a186 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Sep 2021 22:58:41 +0000 Subject: [PATCH 04/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/accelerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 55d0194ec23e4..43167417c248c 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib -from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union import torch from torch import Tensor From 4d8cc759c86e26b1a701425e57196965591f9c63 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 00:21:35 +0000 Subject: [PATCH 05/23] comments --- pytorch_lightning/accelerators/accelerator.py | 2 +- pytorch_lightning/accelerators/gpu.py | 76 +++++++++---------- pytorch_lightning/accelerators/tpu.py | 2 +- 3 files changed, 36 insertions(+), 44 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 55d0194ec23e4..01eebfc362644 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -439,7 +439,7 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool: return self.training_type_plugin.restore_checkpoint_after_pre_dispatch @abstractmethod - def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]: + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for a given device.""" pass diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 53c7d8ab01184..0df8dffdd9a3f 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -15,7 +15,7 @@ import os import shutil import subprocess -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Union import torch @@ -31,25 +31,18 @@ class GPUAccelerator(Accelerator): """Accelerator for GPU devices.""" def setup_environment(self) -> None: + """ + Raises: + MisconfigurationException: + If the selected device is not GPU. + """ super().setup_environment() if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") torch.cuda.set_device(self.root_device) def setup(self, trainer: "pl.Trainer") -> None: - """ - Raises: - MisconfigurationException: - If the selected device is not GPU. - """ self.set_nvidia_flags(trainer.local_rank) - - # The logical device IDs for selected devices - self._device_ids: List[int] = sorted(set(trainer.data_parallel_device_ids)) - - # The unmasked real GPU IDs - self._gpu_ids: List[int] = self._get_gpu_ids(self._device_ids) - return super().setup(trainer) def on_train_start(self) -> None: @@ -64,34 +57,33 @@ def set_nvidia_flags(local_rank: int) -> None: devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") - def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]: + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device.""" if _TORCH_GREATER_EQUAL_1_8: - return torch.cuda.memory_stats(device=device) + return torch.cuda.memory_stats(device) else: - gpu_stat_keys = [ - ("utilization.gpu", "%"), - ("memory.used", "MB"), - ("memory.free", "MB"), - ("utilization.memory", "%"), - ("fan.speed", "%"), - ("temperature.gpu", "°C"), - ("temperature.memory", "°C"), - ] - gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys]) - device_stats = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys) - return device_stats - - def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]: - if not queries: - return [] - - """Run nvidia-smi to get the gpu stats""" - gpu_query = ",".join(queries) + return self._get_gpu_stats(device) + + def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]: + nvidia_smi_path = shutil.which("nvidia-smi") + if nvidia_smi_path is None: + raise FileNotFoundError("nvidia-smi: command not found") + + gpu_stat_keys = [ + "utilization.gpu", + "memory.used", + "memory.free", + "utilization.memory", + "fan.speed", + "temperature.gpu", + "temperature.memoy", + ] + gpu_ids = self._get_gpu_id(device.index) + + gpu_query = ",".join(gpu_stat_keys) format = "csv,nounits,noheader" - gpu_ids = ",".join(self._gpu_ids) result = subprocess.run( - [shutil.which("nvidia-smi"), f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"], + [nvidia_smi_path, f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"], encoding="utf-8", stdout=subprocess.PIPE, stderr=subprocess.PIPE, # for backward compatibility with python version 3.6 @@ -104,17 +96,17 @@ def _to_float(x: str) -> float: except ValueError: return 0.0 - stats = result.stdout.strip().split(os.linesep) - stats = [[_to_float(x) for x in s.split(", ")] for s in stats] - return stats + stats = [_to_float(x) for x in result.stdout.strip().split(os.linesep)] + for key in gpu_stat_keys: + gpu_stats = {key: stat for _, stat in enumerate(stats)} + return gpu_stats - @staticmethod - def _get_gpu_ids(device_ids: List[int]) -> List[str]: + def _get_gpu_id(self, device_id: int) -> List[str]: """Get the unmasked real GPU IDs.""" # All devices if `CUDA_VISIBLE_DEVICES` unset default = ",".join(str(i) for i in range(torch.cuda.device_count())) cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") - return [cuda_visible_devices[device_id].strip() for device_id in device_ids] + return cuda_visible_devices[device_id].strip() def teardown(self) -> None: super().teardown() diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index e1e0386e53ba4..58272d6285347 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -60,7 +60,7 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None: for p, v in opt.state.items(): opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device) - def get_device_stats(self, device: Union[str, torch.dtype]) -> Dict[str, Any]: + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given TPU device.""" device_stats = {} memory_info = xm.get_memory_info(device) From 6d9cc2e3c4d300a748a5de036ed8ab72063d45f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Sep 2021 00:23:11 +0000 Subject: [PATCH 06/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 0df8dffdd9a3f..83c580c0af7d6 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -60,9 +60,9 @@ def set_nvidia_flags(local_rank: int) -> None: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device.""" if _TORCH_GREATER_EQUAL_1_8: - return torch.cuda.memory_stats(device) + return torch.cuda.memory_stats(device) else: - return self._get_gpu_stats(device) + return self._get_gpu_stats(device) def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]: nvidia_smi_path = shutil.which("nvidia-smi") From 018e5cd9dfd0c3db083488b55327903b4465c68e Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 06:48:00 +0000 Subject: [PATCH 07/23] fix gpu --- pytorch_lightning/accelerators/gpu.py | 52 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 0df8dffdd9a3f..30f16f3b9db29 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -60,30 +60,39 @@ def set_nvidia_flags(local_rank: int) -> None: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device.""" if _TORCH_GREATER_EQUAL_1_8: - return torch.cuda.memory_stats(device) + return torch.cuda.memory_stats(device) else: - return self._get_gpu_stats(device) + return self._get_gpu_stats(device) def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]: - nvidia_smi_path = shutil.which("nvidia-smi") - if nvidia_smi_path is None: - raise FileNotFoundError("nvidia-smi: command not found") + """Get the current gpu usage. - gpu_stat_keys = [ - "utilization.gpu", - "memory.used", - "memory.free", - "utilization.memory", - "fan.speed", - "temperature.gpu", - "temperature.memoy", - ] - gpu_ids = self._get_gpu_id(device.index) + Return: + A dictionary in which the keys are device ids as integers and + values are memory usage as integers in MB. + Raises: + FileNotFoundError: + If nvidia-smi installation not found + """ + gpu_stat_metrics = [ + ("utilization.gpu", "%"), + ("memory.used", "MB"), + ("memory.free", "MB"), + ("utilization.memory", "%"), + ("fan.speed", "%"), + ("temperature.gpu", "°C"), + ("temperature.memory", "°C"), + ] + gpu_stat_keys = [k for k, _ in gpu_stat_metrics] gpu_query = ",".join(gpu_stat_keys) - format = "csv,nounits,noheader" + + gpu_id = self._get_gpu_id(device.index) + nvidia_smi_path = shutil.which("nvidia-smi") + if nvidia_smi_path is None: + raise FileNotFoundError("nvidia-smi: command not found") result = subprocess.run( - [nvidia_smi_path, f"--query-gpu={gpu_query}", f"--format={format}", f"--id={gpu_ids}"], + [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"], encoding="utf-8", stdout=subprocess.PIPE, stderr=subprocess.PIPE, # for backward compatibility with python version 3.6 @@ -96,9 +105,12 @@ def _to_float(x: str) -> float: except ValueError: return 0.0 - stats = [_to_float(x) for x in result.stdout.strip().split(os.linesep)] - for key in gpu_stat_keys: - gpu_stats = {key: stat for _, stat in enumerate(stats)} + s = result.stdout.strip() + stats = [_to_float(x) for x in s.split(", ")] + + gpu_stats = {} + for i, (x, unit) in enumerate(gpu_stat_metrics): + gpu_stats[f"{x} ({unit})"] = stats[i] return gpu_stats def _get_gpu_id(self, device_id: int) -> List[str]: From ec8084d3689c88fa2c4c0e6b8951df7da1bf7278 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 07:01:15 +0000 Subject: [PATCH 08/23] fix --- pytorch_lightning/accelerators/gpu.py | 116 +++++++++++++------------- 1 file changed, 59 insertions(+), 57 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 30f16f3b9db29..6eff33326d673 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -62,64 +62,66 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: if _TORCH_GREATER_EQUAL_1_8: return torch.cuda.memory_stats(device) else: - return self._get_gpu_stats(device) - - def _get_gpu_stats(self, device: torch.device) -> Dict[str, float]: - """Get the current gpu usage. - - Return: - A dictionary in which the keys are device ids as integers and - values are memory usage as integers in MB. - - Raises: - FileNotFoundError: - If nvidia-smi installation not found - """ - gpu_stat_metrics = [ - ("utilization.gpu", "%"), - ("memory.used", "MB"), - ("memory.free", "MB"), - ("utilization.memory", "%"), - ("fan.speed", "%"), - ("temperature.gpu", "°C"), - ("temperature.memory", "°C"), - ] - gpu_stat_keys = [k for k, _ in gpu_stat_metrics] - gpu_query = ",".join(gpu_stat_keys) - - gpu_id = self._get_gpu_id(device.index) - nvidia_smi_path = shutil.which("nvidia-smi") - if nvidia_smi_path is None: - raise FileNotFoundError("nvidia-smi: command not found") - result = subprocess.run( - [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"], - encoding="utf-8", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, # for backward compatibility with python version 3.6 - check=True, - ) - - def _to_float(x: str) -> float: - try: - return float(x) - except ValueError: - return 0.0 - - s = result.stdout.strip() - stats = [_to_float(x) for x in s.split(", ")] - - gpu_stats = {} - for i, (x, unit) in enumerate(gpu_stat_metrics): - gpu_stats[f"{x} ({unit})"] = stats[i] - return gpu_stats - - def _get_gpu_id(self, device_id: int) -> List[str]: - """Get the unmasked real GPU IDs.""" - # All devices if `CUDA_VISIBLE_DEVICES` unset - default = ",".join(str(i) for i in range(torch.cuda.device_count())) - cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") - return cuda_visible_devices[device_id].strip() + return _get_gpu_stats(device) def teardown(self) -> None: super().teardown() self._move_optimizer_state(torch.device("cpu")) + + +def _get_gpu_stats(device: torch.device) -> Dict[str, float]: + """Get the current gpu usage. + + Return: + A dictionary in which the keys are device ids as integers and + values are memory usage as integers in MB. + + Raises: + FileNotFoundError: + If nvidia-smi installation not found + """ + gpu_stat_metrics = [ + ("utilization.gpu", "%"), + ("memory.used", "MB"), + ("memory.free", "MB"), + ("utilization.memory", "%"), + ("fan.speed", "%"), + ("temperature.gpu", "°C"), + ("temperature.memory", "°C"), + ] + gpu_stat_keys = [k for k, _ in gpu_stat_metrics] + gpu_query = ",".join(gpu_stat_keys) + + gpu_id = _get_gpu_id(device.index) + nvidia_smi_path = shutil.which("nvidia-smi") + if nvidia_smi_path is None: + raise FileNotFoundError("nvidia-smi: command not found") + result = subprocess.run( + [nvidia_smi_path, f"--query-gpu={gpu_query}", "--format=csv,nounits,noheader", f"--id={gpu_id}"], + encoding="utf-8", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, # for backward compatibility with python version 3.6 + check=True, + ) + + def _to_float(x: str) -> float: + try: + return float(x) + except ValueError: + return 0.0 + + s = result.stdout.strip() + stats = [_to_float(x) for x in s.split(", ")] + + gpu_stats = {} + for i, (x, unit) in enumerate(gpu_stat_metrics): + gpu_stats[f"{x} ({unit})"] = stats[i] + return gpu_stats + + +def _get_gpu_id(device_id: int) -> List[str]: + """Get the unmasked real GPU IDs.""" + # All devices if `CUDA_VISIBLE_DEVICES` unset + default = ",".join(str(i) for i in range(torch.cuda.device_count())) + cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") + return cuda_visible_devices[device_id].strip() From 5abce119117ed4832a18261b9e0f72c8048a5c03 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 07:05:12 +0000 Subject: [PATCH 09/23] update docstring --- pytorch_lightning/accelerators/gpu.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 6eff33326d673..b4a41c367604c 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -62,19 +62,18 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: if _TORCH_GREATER_EQUAL_1_8: return torch.cuda.memory_stats(device) else: - return _get_gpu_stats(device) + return _get_nvidia_gpu_stats(device) def teardown(self) -> None: super().teardown() self._move_optimizer_state(torch.device("cpu")) -def _get_gpu_stats(device: torch.device) -> Dict[str, float]: - """Get the current gpu usage. +def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]: + """Get GPU stats including memory, fan speed, and temperature from nvidia-smi Return: - A dictionary in which the keys are device ids as integers and - values are memory usage as integers in MB. + A dictionary mapping the metrics to their values. Raises: FileNotFoundError: From 3936242538509620b3f27f1f1bc616ee9943a5ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Sep 2021 07:06:25 +0000 Subject: [PATCH 10/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index b4a41c367604c..2002a3a51a55e 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -70,7 +70,7 @@ def teardown(self) -> None: def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]: - """Get GPU stats including memory, fan speed, and temperature from nvidia-smi + """Get GPU stats including memory, fan speed, and temperature from nvidia-smi. Return: A dictionary mapping the metrics to their values. From 0fdd3687932aa1b2bedb232a18e51788a6fd284f Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 08:18:19 +0000 Subject: [PATCH 11/23] fix tests --- pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/accelerators/cpu.py | 7 +++++++ pytorch_lightning/accelerators/gpu.py | 12 ++++++++++-- pytorch_lightning/accelerators/ipu.py | 6 +++++- pytorch_lightning/accelerators/tpu.py | 6 +++++- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 0103910e68b7d..2c24d7c31447a 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -441,7 +441,6 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool: @abstractmethod def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for a given device.""" - pass def on_train_start(self) -> None: """Called when train begins.""" diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 46e74193fb557..2cfe996307e94 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -11,6 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any, Dict, Union + +import torch + import pytorch_lightning as pl from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -29,3 +33,6 @@ def setup(self, trainer: "pl.Trainer") -> None: raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") return super().setup(trainer) + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + raise NotImplementedError diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index b4a41c367604c..e898de2824004 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -58,7 +58,15 @@ def set_nvidia_flags(local_rank: int) -> None: _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - """Gets stats for the given GPU device.""" + """Gets stats for the given GPU device. + + Returns: + A dictionary mapping the metrics to their values. + + Raises: + FileNotFoundError: + If nvidia-smi installation not found + """ if _TORCH_GREATER_EQUAL_1_8: return torch.cuda.memory_stats(device) else: @@ -72,7 +80,7 @@ def teardown(self) -> None: def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]: """Get GPU stats including memory, fan speed, and temperature from nvidia-smi - Return: + Returns: A dictionary mapping the metrics to their values. Raises: diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py index 4de644b15eac5..b32593deeb810 100644 --- a/pytorch_lightning/accelerators/ipu.py +++ b/pytorch_lightning/accelerators/ipu.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable +from typing import Any, Callable, Dict, Union +import torch from torch.optim import Optimizer import pytorch_lightning as pl @@ -32,3 +33,6 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None: # Optimizer step is handled by the IPU accelerator. lambda_closure() + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + raise NotImplementedError diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 58272d6285347..a1e006528c9b5 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -61,7 +61,11 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None: opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device) def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - """Gets stats for the given TPU device.""" + """Gets stats for the given TPU device. + + Returns: + A dictionary mapping the metrics (free memory and peak memory) to their values. + """ device_stats = {} memory_info = xm.get_memory_info(device) From d8314cf72b4b41fa0ba92e2fce2a761b7e2e64b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Sep 2021 08:19:49 +0000 Subject: [PATCH 12/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 44117ded5a3ec..f1851a00881c1 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -59,7 +59,7 @@ def set_nvidia_flags(local_rank: int) -> None: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device. - + Returns: A dictionary mapping the metrics to their values. From 5699e858f547a10a5e42ccea95bd0b65c547a8b8 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 08:24:43 +0000 Subject: [PATCH 13/23] type fix --- pytorch_lightning/accelerators/gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 44117ded5a3ec..316aa7b8599ee 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -59,7 +59,7 @@ def set_nvidia_flags(local_rank: int) -> None: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device. - + Returns: A dictionary mapping the metrics to their values. @@ -126,7 +126,7 @@ def _to_float(x: str) -> float: return gpu_stats -def _get_gpu_id(device_id: int) -> List[str]: +def _get_gpu_id(device_id: int) -> str: """Get the unmasked real GPU IDs.""" # All devices if `CUDA_VISIBLE_DEVICES` unset default = ",".join(str(i) for i in range(torch.cuda.device_count())) From 3ac0821ac9993f6dd2e76b59bacb3cedd3c7d569 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Sat, 18 Sep 2021 18:04:54 +0000 Subject: [PATCH 14/23] fix test --- tests/accelerators/test_accelerator_connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 650b7949ac1ba..062fb69c84a17 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -13,7 +13,7 @@ # limitations under the License import os -from typing import Optional +from typing import Optional, Dict, Union, Any from unittest import mock import pytest @@ -385,7 +385,8 @@ def creates_children(self) -> bool: @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) def test_custom_accelerator(device_count_mock, setup_distributed_mock): class Accel(Accelerator): - pass + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + return [] class Prec(PrecisionPlugin): pass From 1160cd069c33f2b2ae7fcf1df21d4a1622e1dac8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Sep 2021 18:06:13 +0000 Subject: [PATCH 15/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 062fb69c84a17..f943d52d76e08 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -13,7 +13,7 @@ # limitations under the License import os -from typing import Optional, Dict, Union, Any +from typing import Any, Dict, Optional, Union from unittest import mock import pytest From ef5bc17163837c1cf21061655fb7208838408397 Mon Sep 17 00:00:00 2001 From: Danielle Pintz <38207072+daniellepintz@users.noreply.github.com> Date: Sun, 19 Sep 2021 16:37:37 -0700 Subject: [PATCH 16/23] Update pytorch_lightning/accelerators/gpu.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/accelerators/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 316aa7b8599ee..688b32897586d 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -61,7 +61,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device. Returns: - A dictionary mapping the metrics to their values. + A dictionary mapping the metrics to their values. Raises: FileNotFoundError: From 497680c9dee36209aaf4f2dbf8ccda16ff17df79 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Tue, 21 Sep 2021 22:13:47 +0000 Subject: [PATCH 17/23] address comments --- CHANGELOG.md | 2 +- pytorch_lightning/accelerators/accelerator.py | 5 ++--- pytorch_lightning/accelerators/cpu.py | 3 --- pytorch_lightning/accelerators/gpu.py | 2 +- pytorch_lightning/accelerators/ipu.py | 3 --- tests/accelerators/test_accelerator_connector.py | 3 +-- 6 files changed, 5 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 683e7350ed039..ec41f1f68129b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,7 +142,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389)) -- Added `get_device_stats` to Accelerator interface and implement it for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586)) +- Added `get_device_stats` to the Accelerator Interface and added its implementation for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586)) - Added `RichModelSummary` callback ([#9546](https://github.com/PyTorchLightning/pytorch-lightning/pull/9546)) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 2c24d7c31447a..11bf0e96d0e01 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib -from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union import torch @@ -34,7 +33,7 @@ from torch.cuda.amp import GradScaler -class Accelerator(ABC): +class Accelerator: """The Accelerator Base Class. An Accelerator is meant to deal with one type of Hardware. Currently there are accelerators for: @@ -438,9 +437,9 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool: """ return self.training_type_plugin.restore_checkpoint_after_pre_dispatch - @abstractmethod def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for a given device.""" + raise NotImplementedError def on_train_start(self) -> None: """Called when train begins.""" diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 2cfe996307e94..3e996c1809d96 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -33,6 +33,3 @@ def setup(self, trainer: "pl.Trainer") -> None: raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") return super().setup(trainer) - - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - raise NotImplementedError diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 316aa7b8599ee..688b32897586d 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -61,7 +61,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device. Returns: - A dictionary mapping the metrics to their values. + A dictionary mapping the metrics to their values. Raises: FileNotFoundError: diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py index b32593deeb810..deacae8b0af61 100644 --- a/pytorch_lightning/accelerators/ipu.py +++ b/pytorch_lightning/accelerators/ipu.py @@ -33,6 +33,3 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None: # Optimizer step is handled by the IPU accelerator. lambda_closure() - - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - raise NotImplementedError diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 062fb69c84a17..a8cc23ee4b241 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -385,8 +385,7 @@ def creates_children(self) -> bool: @mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True) def test_custom_accelerator(device_count_mock, setup_distributed_mock): class Accel(Accelerator): - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - return [] + pass class Prec(PrecisionPlugin): pass From ae7e912caac0b30e103d6b2f10ac96bf5ebc18c2 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Thu, 23 Sep 2021 00:18:55 +0000 Subject: [PATCH 18/23] Add unit tests --- tests/accelerators/test_gpu.py | 36 ++++++++++++++++++++++++++++++++++ tests/accelerators/test_tpu.py | 16 +++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 tests/accelerators/test_gpu.py create mode 100644 tests/accelerators/test_tpu.py diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py new file mode 100644 index 0000000000000..058ac1f8ef1fb --- /dev/null +++ b/tests/accelerators/test_gpu.py @@ -0,0 +1,36 @@ +import torch + +from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin +from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from tests.helpers.runif import RunIf + + +@RunIf(min_torch="1.8") +@RunIf(min_gpus=1) +def test_get_torch_gpu_stats(tmpdir): + """Test GPU get_device_stats with Pytorch >= 1.8.0.""" + current_device = torch.device(f"cuda:{torch.cuda.current_device()}") + GPUAccel = GPUAccelerator( + training_type_plugin=DataParallelPlugin(parallel_devices=[current_device]), precision_plugin=PrecisionPlugin() + ) + gpu_stats = GPUAccel.get_device_stats(current_device) + fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] + + for f in fields: + assert any(f in h for h in gpu_stats.keys()) + + +@RunIf(max_torch="1.7") +@RunIf(min_gpus=1) +def test_get_nvidia_gpu_stats(tmpdir): + """Test GPU get_device_stats with Pytorch < 1.8.0.""" + current_device = torch.device(f"cuda:{torch.cuda.current_device()}") + GPUAccel = GPUAccelerator( + training_type_plugin=DataParallelPlugin(parallel_devices=[current_device]), precision_plugin=PrecisionPlugin() + ) + gpu_stats = GPUAccel.get_device_stats(current_device) + fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"] + + for f in fields: + assert any(f in h for h in gpu_stats.keys()) diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py new file mode 100644 index 0000000000000..7d63c31ee7d39 --- /dev/null +++ b/tests/accelerators/test_tpu.py @@ -0,0 +1,16 @@ +from pytorch_lightning.plugins.training_type import TPUSpawnPlugin +from pytorch_lightning.accelerators import TPUAccelerator +from tests.helpers.runif import RunIf +from pytorch_lightning.plugins import SingleTPUPlugin + + +@RunIf(tpu=True) +def test_device_stats_tpu(tmpdir): + """Test TPU get_device_stats.""" + plugin = SingleTPUPlugin(1) + TPUAccel = TPUAccelerator(training_type_plugin=TPUSpawnPlugin(), precision_plugin=plugin) + tpu_stats = TPUAccel.get_device_stats("1") + fields = ["avg. free memory (MB)", "avg. peak memory (MB)"] + + for f in fields: + assert any(f in h for h in tpu_stats.keys()) From 418e4a079adb7f7203cb62bcc32559763ab1cf9c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Sep 2021 00:23:15 +0000 Subject: [PATCH 19/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_gpu.py | 2 +- tests/accelerators/test_tpu.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/accelerators/test_gpu.py b/tests/accelerators/test_gpu.py index 058ac1f8ef1fb..85ce0cd9f0f18 100644 --- a/tests/accelerators/test_gpu.py +++ b/tests/accelerators/test_gpu.py @@ -1,8 +1,8 @@ import torch -from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin from pytorch_lightning.accelerators import GPUAccelerator from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin from tests.helpers.runif import RunIf diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index 7d63c31ee7d39..f3a2c50c0e347 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -1,7 +1,7 @@ -from pytorch_lightning.plugins.training_type import TPUSpawnPlugin from pytorch_lightning.accelerators import TPUAccelerator -from tests.helpers.runif import RunIf from pytorch_lightning.plugins import SingleTPUPlugin +from pytorch_lightning.plugins.training_type import TPUSpawnPlugin +from tests.helpers.runif import RunIf @RunIf(tpu=True) From 46b9f3665e642e95825fabd7a8b6c5f6fd01a86e Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Thu, 23 Sep 2021 01:01:35 +0000 Subject: [PATCH 20/23] comments --- pytorch_lightning/accelerators/cpu.py | 4 ++++ pytorch_lightning/accelerators/ipu.py | 3 +-- pytorch_lightning/accelerators/tpu.py | 3 --- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 3e996c1809d96..7e7ae26a2713f 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -33,3 +33,7 @@ def setup(self, trainer: "pl.Trainer") -> None: raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead.") return super().setup(trainer) + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + """Returns dummy implementation for now""" + return {} diff --git a/pytorch_lightning/accelerators/ipu.py b/pytorch_lightning/accelerators/ipu.py index deacae8b0af61..4de644b15eac5 100644 --- a/pytorch_lightning/accelerators/ipu.py +++ b/pytorch_lightning/accelerators/ipu.py @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Dict, Union +from typing import Any, Callable -import torch from torch.optim import Optimizer import pytorch_lightning as pl diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index a1e006528c9b5..7e826d4317308 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -72,9 +72,6 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: free_memory = memory_info["kb_free"] peak_memory = memory_info["kb_total"] - free_memory - free_memory = self.training_type_plugin.reduce(free_memory) * 0.001 - peak_memory = self.training_type_plugin.reduce(peak_memory) * 0.001 - device_stats["avg. free memory (MB)"] = free_memory device_stats["avg. peak memory (MB)"] = peak_memory return device_stats From ccadca51daf83c395c97956c24a1f825435e27af Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Sep 2021 01:03:31 +0000 Subject: [PATCH 21/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/accelerators/cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/cpu.py b/pytorch_lightning/accelerators/cpu.py index 7e7ae26a2713f..baa922b6d796b 100644 --- a/pytorch_lightning/accelerators/cpu.py +++ b/pytorch_lightning/accelerators/cpu.py @@ -35,5 +35,5 @@ def setup(self, trainer: "pl.Trainer") -> None: return super().setup(trainer) def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - """Returns dummy implementation for now""" + """Returns dummy implementation for now.""" return {} From 2658b4af7910e76b15fbc28986d0f4fb69e5fafc Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Thu, 23 Sep 2021 06:39:33 +0000 Subject: [PATCH 22/23] lint --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index c90abc38df4bd..650b7949ac1ba 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -13,7 +13,7 @@ # limitations under the License import os -from typing import Any, Dict, Optional, Union +from typing import Optional from unittest import mock import pytest From c4f0d02d26cd63ae0a7936381c950815f7928211 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Thu, 23 Sep 2021 20:54:59 +0000 Subject: [PATCH 23/23] comments --- pytorch_lightning/accelerators/accelerator.py | 9 ++++++++- pytorch_lightning/accelerators/gpu.py | 9 +++++++-- pytorch_lightning/accelerators/tpu.py | 14 ++++++++------ 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 11bf0e96d0e01..137ee9b98ab67 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -438,7 +438,14 @@ def restore_checkpoint_after_pre_dispatch(self) -> bool: return self.training_type_plugin.restore_checkpoint_after_pre_dispatch def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - """Gets stats for a given device.""" + """Gets stats for a given device. + + Args: + device: device for which to get stats + + Returns: + Dictionary of device stats + """ raise NotImplementedError def on_train_start(self) -> None: diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 688b32897586d..b33903c2d60c9 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -60,6 +60,9 @@ def set_nvidia_flags(local_rank: int) -> None: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given GPU device. + Args: + device: GPU device for which to get stats + Returns: A dictionary mapping the metrics to their values. @@ -69,8 +72,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """ if _TORCH_GREATER_EQUAL_1_8: return torch.cuda.memory_stats(device) - else: - return _get_nvidia_gpu_stats(device) + return _get_nvidia_gpu_stats(device) def teardown(self) -> None: super().teardown() @@ -80,6 +82,9 @@ def teardown(self) -> None: def _get_nvidia_gpu_stats(device: torch.device) -> Dict[str, float]: """Get GPU stats including memory, fan speed, and temperature from nvidia-smi. + Args: + device: GPU device for which to get stats + Returns: A dictionary mapping the metrics to their values. diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 9f09fafcb5bdf..68925ab67aca9 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -65,15 +65,17 @@ def _move_optimizer_state(self, device: Optional[torch.device] = None) -> None: def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Gets stats for the given TPU device. + Args: + device: TPU device for which to get stats + Returns: - A dictionary mapping the metrics (free memory and peak memory) to their values. + A dictionary mapping the metrics (free memory and peak memory) to their values. """ - device_stats = {} memory_info = xm.get_memory_info(device) - free_memory = memory_info["kb_free"] peak_memory = memory_info["kb_total"] - free_memory - - device_stats["avg. free memory (MB)"] = free_memory - device_stats["avg. peak memory (MB)"] = peak_memory + device_stats = { + "avg. free memory (MB)": free_memory, + "avg. peak memory (MB)": peak_memory, + } return device_stats