Commit efec3d4

Binh Tang authored
Move logger and profiler finalization to trainer's teardown (#8685)
Co-authored-by: ananthsub <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>
1 parent 963c267 commit efec3d4
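
In short, the commit moves `logger.finalize()` and `profiler.describe()` out of the individual fit/evaluation/prediction loops and into `Trainer._call_teardown_hook`, so finalization happens exactly once per entry point, including `trainer.predict()` (the case reported in #8333). The snippet below is a self-contained toy sketch of that pattern, not PyTorch Lightning code; the class names are invented for illustration.

# Toy illustration of the pattern adopted by this commit (not the actual
# PyTorch Lightning source): finalization lives in one teardown step instead
# of being sprinkled across the loops.

class ToyLogger:
    def __init__(self):
        self.buffer, self.logs = {}, {}

    def log_metrics(self, metrics):
        self.buffer.update(metrics)          # metrics accumulate until finalize()

    def finalize(self, status):
        self.logs.update(self.buffer)        # flush buffered metrics exactly once
        self.buffer = {}


class ToyTrainer:
    def __init__(self, logger):
        self.logger = logger

    def predict(self):
        self.logger.log_metrics({"predict": 1})
        self._call_teardown_hook()           # teardown now owns finalization

    def _call_teardown_hook(self):
        if self.logger is not None:
            self.logger.finalize("success")


trainer = ToyTrainer(ToyLogger())
trainer.predict()
assert trainer.logger.logs == {"predict": 1}  # finalized after predict, cf. #8333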

File tree

8 files changed: +90 additions, -19 deletions


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -117,6 +117,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed recursive call for `apply_to_collection(include_none=False)` ([#8719](https://github.com/PyTorchLightning/pytorch-lightning/pull/8719))
 
 
+- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8333](https://github.com/PyTorchLightning/pytorch-lightning/issues/8333))
+
 
 ## [1.4.0] - 2021-07-27

pytorch_lightning/loops/dataloader/evaluation_loop.py

Lines changed: 0 additions & 5 deletions
@@ -20,7 +20,6 @@
 from pytorch_lightning.loops.dataloader import DataLoaderLoop
 from pytorch_lightning.loops.epoch import EvaluationEpochLoop
 from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
-from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.types import EPOCH_OUTPUT

@@ -206,10 +205,6 @@ def on_evaluation_end(self, *args: Any, **kwargs: Any) -> None:
         else:
             self.trainer.call_hook("on_validation_end", *args, **kwargs)
 
-        if self.trainer.state.fn != TrainerFn.FITTING:
-            # summarize profile results
-            self.trainer.profiler.describe()
-
         # reset any `torchmetrics.Metric` and the logger connector state
         self.trainer.logger_connector.reset(metrics=True)

pytorch_lightning/loops/dataloader/prediction_loop.py

Lines changed: 0 additions & 2 deletions
@@ -119,8 +119,6 @@ def on_predict_epoch_end(self) -> Optional[_PREDICT_OUTPUT]:
         Returns:
             the results for all dataloaders
         """
-        self.trainer.profiler.describe()
-
         results = self.predictions
 
         self.trainer.call_hook("on_predict_epoch_end", results)

pytorch_lightning/loops/fit_loop.py

Lines changed: 0 additions & 9 deletions
@@ -225,15 +225,6 @@ def on_run_end(self) -> None:
         # hook
         self.trainer.call_hook("on_train_end")
 
-        # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers.
-        # It might be related to xla tensors blocked when moving the cpu
-        # kill loggers
-        if self.trainer.logger is not None:
-            self.trainer.logger.finalize("success")
-
-        # summarize profile results
-        self.trainer.profiler.describe()
-
         # give accelerators a chance to finish
         self.trainer.accelerator.on_train_end()

pytorch_lightning/plugins/training_type/ddp_spawn.py

Lines changed: 3 additions & 0 deletions
@@ -203,6 +203,9 @@ def new_process(self, process_idx, trainer, mp_queue):
         # persist info in ddp_spawn
         self.transfer_distrib_spawn_state_on_fit_end(results)
 
+        # ensure that spawned processes go through teardown before joining
+        trainer._call_teardown_hook()
+
     def post_dispatch(self):
         # restore main state with best weights
         best_path = self.mp_queue.get()

pytorch_lightning/plugins/training_type/tpu_spawn.py

Lines changed: 3 additions & 0 deletions
@@ -172,6 +172,9 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
         if self.local_rank == 0:
             time.sleep(2)
 
+        # ensure that spawned processes go through teardown before joining
+        trainer._call_teardown_hook()
+
     @parameter_validation
     def model_to_device(self) -> None:
         self.model = self.wrapped_model.to(self.root_device)
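
Taken together with the `trainer.py` hunks below, these two spawn-plugin changes split the teardown responsibility: spawn-style plugins tear down inside each worker before it joins, while every other configuration tears down in `Trainer._run`. A rough, hedged sketch of that decision follows; `run_loops` and `is_spawn_plugin` are invented names, and the real check in `trainer.py` is `self._distrib_type not in DistributedType.interactive_compatible_types()`.

# Rough sketch (illustrative only) of where teardown runs after this commit.

def run_loops(trainer) -> None:
    ...  # stand-in for the fit / validate / test / predict loop dispatch


def run(trainer, is_spawn_plugin: bool) -> None:
    # main process: tears down itself only when no spawn plugin is involved
    run_loops(trainer)
    if not is_spawn_plugin:
        trainer._call_teardown_hook()


def new_process(trainer) -> None:
    # spawned worker (ddp_spawn / tpu_spawn): runs the loops and tears down
    # before the process joins, since the main process skips it
    run_loops(trainer)
    trainer._call_teardown_hook()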

pytorch_lightning/trainer/trainer.py

Lines changed: 14 additions & 3 deletions
@@ -76,6 +76,7 @@
 )
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.distributed import distributed_available
+from pytorch_lightning.utilities.enums import DistributedType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _fault_tolerant_enabled
 from pytorch_lightning.utilities.model_helpers import is_overridden

@@ -944,8 +945,10 @@ def _run(self, model: "pl.LightningModule") -> Optional[Union[_EVALUATE_OUTPUT,
         if self.state.fn == TrainerFn.FITTING:
             self.call_hook("on_fit_end")
 
-        # teardown
-        self._call_teardown_hook()
+        # teardown if necessary (similar calls for spawn plugins are excluded as they have
+        # been included at the end of `new_process` functions)
+        if self._distrib_type not in DistributedType.interactive_compatible_types():
+            self._call_teardown_hook()
 
         if self.state.status != TrainerStatus.INTERRUPTED:
             self.state.status = TrainerStatus.FINISHED

@@ -1211,7 +1214,7 @@ def _call_teardown_hook(self) -> None:
 
         if self.datamodule is not None:
             self.datamodule.teardown(stage=fn)
-        self.profiler.teardown(stage=fn)
+
         self.teardown(stage=fn)
         self.lightning_module.teardown(stage=fn)
 

@@ -1220,6 +1223,14 @@ def _call_teardown_hook(self) -> None:
         # these could have become stale if metrics are defined in `setup`
         self.lightning_module._metric_attributes = None
 
+        # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers.
+        # It might be related to xla tensors blocked when moving the cpu kill loggers.
+        if self.logger is not None:
+            self.logger.finalize("success")
+
+        # summarize profile results
+        self.profiler.describe()
+
     def call_hook(self, hook_name: str, *args, **kwargs) -> Any:
         if self.lightning_module:
             prev_fx_name = self.lightning_module._current_fx_name
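
For readability, here is a condensed view of how `Trainer._call_teardown_hook` reads once the hunks above are applied; only the lines touched by this diff are shown, and the elided parts ("...") stand for unchanged surrounding code.

# Condensed reading of Trainer._call_teardown_hook after this commit (sketch,
# not the full method body).
def _call_teardown_hook(self) -> None:
    ...
    if self.datamodule is not None:
        self.datamodule.teardown(stage=fn)

    self.teardown(stage=fn)
    self.lightning_module.teardown(stage=fn)
    ...
    # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers.
    if self.logger is not None:
        self.logger.finalize("success")

    # summarize profile results
    self.profiler.describe()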

tests/trainer/logging_/test_distributed_logging.py

Lines changed: 68 additions & 0 deletions
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from typing import Any, Dict, Optional, Union
 from unittest import mock
 from unittest.mock import Mock
 
+import pytorch_lightning as pl
 from pytorch_lightning import Callback, Trainer
+from pytorch_lightning.loggers.base import LightningLoggerBase
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf

@@ -101,3 +104,68 @@ def on_train_start(self, trainer, pl_module):
         callbacks=[LoggerCallsObserver()],
     )
     trainer.fit(model)
+
+
+def test_logger_after_fit_predict_test_calls(tmpdir):
+    """
+    Make sure logger outputs are finalized after fit, prediction, and test calls.
+    """
+
+    class BufferLogger(LightningLoggerBase):
+        def __init__(self):
+            super().__init__()
+            self.buffer = {}
+            self.logs = {}
+
+        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
+            self.buffer.update(metrics)
+
+        def finalize(self, status: str) -> None:
+            self.logs.update(self.buffer)
+            self.buffer = {}
+
+        @property
+        def experiment(self) -> Any:
+            return None
+
+        @property
+        def version(self) -> Union[int, str]:
+            return 1
+
+        @property
+        def name(self) -> str:
+            return "BufferLogger"
+
+        def log_hyperparams(self, *args, **kwargs) -> None:
+            return None
+
+    class LoggerCallsObserver(Callback):
+        def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+            trainer.logger.log_metrics({"fit": 1})
+
+        def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+            trainer.logger.log_metrics({"validate": 1})
+
+        def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+            trainer.logger.log_metrics({"predict": 1})
+
+        def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+            trainer.logger.log_metrics({"test": 1})
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=1,
+        logger=BufferLogger(),
+        callbacks=[LoggerCallsObserver()],
+    )
+
+    assert not trainer.logger.logs
+    trainer.fit(model)
+    assert trainer.logger.logs == {"fit": 1, "validate": 1}
+    trainer.test(model)
+    assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1}
+    trainer.predict(model)
+    assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1, "predict": 1}
