
Commit 940b910

daniellepintz authored (with ananthsub, tchaton, pre-commit-ci[bot], and awaelchli)
[2/4] Add DeviceStatsMonitor callback (#9712)
Co-authored-by: ananthsub <[email protected]>
Co-authored-by: thomas chaton <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: Kaushik B <[email protected]>
Co-authored-by: Kaushik B <[email protected]>
1 parent 23e8b59 commit 940b910

File tree: 11 files changed (+228, -18 lines)


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -163,6 +163,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added a warning when an unknown key is encountered in optimizer configuration, and when `OneCycleLR` is used with `"interval": "epoch"` ([#9666](https://github.com/PyTorchLightning/pytorch-lightning/pull/9666))


+- Added `DeviceStatsMonitor` callback ([#9712](https://github.com/PyTorchLightning/pytorch-lightning/pull/9712))
+
+
 - Added `enable_progress_bar` to Trainer constructor ([#9664](https://github.com/PyTorchLightning/pytorch-lightning/pull/9664))

dockers/tpu-tests/tpu_test_cases.jsonnet

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ local tputests = base.BaseTest {
       tests/profiler/test_xla_profiler.py \
       pytorch_lightning/utilities/xla_device.py \
       tests/accelerators/test_tpu_backend.py \
+      tests/callbacks/test_device_stats_monitor.py \
       tests/models/test_tpu.py
     test_exit_code=$?
     echo "\n||| END PYTEST LOGS |||\n"

docs/source/extensions/accelerators.rst

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ Currently there are accelerators for:
 - CPU
 - GPU
 - TPU
+- IPU

 Each Accelerator gets two plugins upon initialization:
 One to handle differences from the training routine and one to handle different precisions.

docs/source/extensions/callbacks.rst

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ Lightning has a few built-in callbacks.
     BaseFinetuning
     BasePredictionWriter
     Callback
+    DeviceStatsMonitor
     EarlyStopping
     GPUStatsMonitor
     GradientAccumulationScheduler

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -61,6 +61,7 @@ ignore_errors = "True"

 [[tool.mypy.overrides]]
 module = [
+    "pytorch_lightning.callbacks.device_stats_monitor",
     "pytorch_lightning.callbacks.model_summary",
     "pytorch_lightning.callbacks.pruning",
     "pytorch_lightning.callbacks.rich_model_summary",

pytorch_lightning/accelerators/cpu.py

Lines changed: 1 addition & 1 deletion
@@ -35,5 +35,5 @@ def setup(self, trainer: "pl.Trainer") -> None:
         return super().setup(trainer)

     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """Returns dummy implementation for now."""
+        """CPU device stats aren't supported yet."""
        return {}

pytorch_lightning/accelerators/ipu.py

Lines changed: 6 additions & 1 deletion
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable
+from typing import Any, Callable, Dict, Union

+import torch
 from torch.optim import Optimizer

 import pytorch_lightning as pl
@@ -37,3 +38,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
     def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs: Any) -> None:
         # Optimizer step is handled by the IPU accelerator.
         lambda_closure()
+
+    def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
+        """IPU device stats aren't supported yet."""
+        return {}
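
Note: the CPU and IPU accelerators only gain stub implementations here. For context, a minimal sketch (not part of this diff, and only an assumption about how a CUDA-backed accelerator could fill in the same hook on PyTorch >= 1.8) of a non-empty `get_device_stats`:

# Hypothetical sketch, not from this commit: a CUDA accelerator could return
# the allocator counters exposed by torch.cuda.memory_stats, whose keys include
# "allocated_bytes.all.freed" and "reserved_bytes.large_pool.peak" -- the same
# fields the GPU test added in this commit asserts on.
from typing import Any, Dict, Union

import torch


def get_device_stats(device: Union[str, torch.device]) -> Dict[str, Any]:
    # Returns a flat dict of str -> number, ready to be logged as metrics.
    return torch.cuda.memory_stats(device)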

pytorch_lightning/callbacks/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.callbacks.device_stats_monitor import DeviceStatsMonitor
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning
 from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor
@@ -33,6 +34,7 @@
     "BackboneFinetuning",
     "BaseFinetuning",
     "Callback",
+    "DeviceStatsMonitor",
     "EarlyStopping",
     "GPUStatsMonitor",
     "XLAStatsMonitor",

pytorch_lightning/callbacks/device_stats_monitor.py (new file)

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Device Stats Monitor
====================

Monitors and logs device stats during training.

"""
from typing import Any, Dict, Optional

import pytorch_lightning as pl
from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.types import STEP_OUTPUT


class DeviceStatsMonitor(Callback):
    r"""
    Automatically monitors and logs device stats during the training stage. ``DeviceStatsMonitor``
    is a special callback as it requires a ``logger`` to be passed as an argument to the ``Trainer``.

    Raises:
        MisconfigurationException:
            If ``Trainer`` has no logger.

    Example:
        >>> from pytorch_lightning import Trainer
        >>> from pytorch_lightning.callbacks import DeviceStatsMonitor
        >>> device_stats = DeviceStatsMonitor()  # doctest: +SKIP
        >>> trainer = Trainer(callbacks=[device_stats])  # doctest: +SKIP
    """

    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None:
        if not trainer.logger:
            raise MisconfigurationException("Cannot use DeviceStatsMonitor callback with Trainer that has no logger.")

    def on_train_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        unused: Optional[int] = 0,
    ) -> None:
        if not trainer.logger_connector.should_update_logs:
            return

        device_stats = trainer.accelerator.get_device_stats(pl_module.device)
        prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_start")
        trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step)

    def on_train_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
        unused: Optional[int] = 0,
    ) -> None:
        if not trainer.logger_connector.should_update_logs:
            return

        device_stats = trainer.accelerator.get_device_stats(pl_module.device)
        prefixed_device_stats = prefix_metrics_keys(device_stats, "on_train_batch_end")
        trainer.logger.log_metrics(prefixed_device_stats, step=trainer.global_step)


def prefix_metrics_keys(metrics_dict: Dict[str, float], prefix: str) -> Dict[str, float]:
    return {prefix + "." + k: v for k, v in metrics_dict.items()}
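
A minimal usage sketch of the new callback (not part of the commit; the CSVLogger, `gpus=1`, and the example metric names are illustrative assumptions): the callback needs a logger, polls the accelerator at batch start and batch end, and prefixes every stat key with the hook name via `prefix_metrics_keys`.

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import DeviceStatsMonitor
from pytorch_lightning.loggers import CSVLogger

# A logger is mandatory; setup() raises MisconfigurationException without one.
trainer = Trainer(
    logger=CSVLogger(save_dir="logs"),
    callbacks=[DeviceStatsMonitor()],
    gpus=1,        # on CUDA the accelerator reports memory/allocator counters
    max_epochs=1,
)
# trainer.fit(model)  # any LightningModule
#
# Logged keys look like "on_train_batch_start.<stat>" and
# "on_train_batch_end.<stat>", e.g. "on_train_batch_end.allocated_bytes.all.freed".

The hook-name prefix keeps batch-start and batch-end samples of the same stat distinguishable in the logger output.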

tests/accelerators/test_tpu.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

tests/callbacks/test_device_stats_monitor.py (new file)

Lines changed: 130 additions & 0 deletions

@@ -0,0 +1,130 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Optional

import pytest

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import DeviceStatsMonitor
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers import BoringModel
from tests.helpers.runif import RunIf


@RunIf(min_torch="1.8")
@RunIf(min_gpus=1)
def test_device_stats_gpu_from_torch(tmpdir):
    """Test GPU stats are logged using a logger with PyTorch >= 1.8.0."""
    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    class DebugLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"]
            for f in fields:
                assert any(f in h for h in metrics.keys())

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[device_stats],
        logger=DebugLogger(tmpdir),
        checkpoint_callback=False,
        enable_progress_bar=False,
    )

    trainer.fit(model)


@RunIf(max_torch="1.7")
@RunIf(min_gpus=1)
def test_device_stats_gpu_from_nvidia(tmpdir):
    """Test GPU stats are logged using a logger with PyTorch < 1.8.0."""
    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    class DebugLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
            for f in fields:
                assert any(f in h for h in metrics.keys())

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[device_stats],
        logger=DebugLogger(tmpdir),
        checkpoint_callback=False,
        enable_progress_bar=False,
    )

    trainer.fit(model)


@RunIf(tpu=True)
def test_device_stats_monitor_tpu(tmpdir):
    """Test TPU stats are logged using a logger."""

    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    class DebugLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            fields = ["avg. free memory (MB)", "avg. peak memory (MB)"]
            for f in fields:
                assert any(f in h for h in metrics.keys())

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=1,
        tpu_cores=8,
        log_every_n_steps=1,
        callbacks=[device_stats],
        logger=DebugLogger(tmpdir),
        checkpoint_callback=False,
        enable_progress_bar=False,
    )

    trainer.fit(model)


def test_device_stats_monitor_no_logger(tmpdir):
    """Test DeviceStatsMonitor with no logger in Trainer."""

    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[device_stats],
        max_epochs=1,
        logger=False,
        checkpoint_callback=False,
        enable_progress_bar=False,
    )

    with pytest.raises(MisconfigurationException, match="Trainer that has no logger."):
        trainer.fit(model)
