
Commit 2d9db21

Revert "Support serialized checkpoint loading (#9605)" (#10057)
This reverts commit f0e6f1b.
1 parent: aa15404

File tree: 6 files changed, 10 additions and 39 deletions

pytorch_lightning/plugins/training_type/ddp.py

Lines changed: 3 additions & 33 deletions
@@ -21,7 +21,7 @@
 import time
 from pathlib import Path
 from time import sleep
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import __main__
 import numpy as np
@@ -51,16 +51,10 @@
 )
 from pytorch_lightning.utilities.distributed import distributed_available
 from pytorch_lightning.utilities.distributed import group as _group
-from pytorch_lightning.utilities.distributed import (
-    init_ddp_connection,
-    rank_zero_info,
-    rank_zero_only,
-    ReduceOp,
-    sync_ddp_if_available,
-)
+from pytorch_lightning.utilities.distributed import init_ddp_connection, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException
 from pytorch_lightning.utilities.seed import reset_seed
-from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT
+from pytorch_lightning.utilities.types import STEP_OUTPUT

 if _TORCH_GREATER_EQUAL_1_10:
     from torch.distributed.optim import DistributedOptimizer, PostLocalSGDOptimizer, ZeroRedundancyOptimizer
@@ -129,7 +123,6 @@ def __init__(
         self._pids: Optional[List[int]] = None
         self._sync_dir: Optional[str] = None
         self._rank_0_has_called_call_children_scripts: bool = False
-        self._has_loaded_state_dict: bool = False
         self.set_world_ranks()

     @property
@@ -540,26 +533,3 @@ def teardown(self) -> None:
             self.lightning_module.cpu()
             # clean up memory
             torch.cuda.empty_cache()
-
-        self._has_loaded_state_dict = False
-
-    def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
-        if "state_dict" not in checkpoint and self._has_loaded_state_dict:
-            return
-        self.lightning_module.load_state_dict(checkpoint["state_dict"])
-
-    def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
-        rank_zero_info(
-            f"DistributedDataParallel has {self.num_processes} processes. "
-            "Serializing checkpoint loading to avoid CPU OOMs."
-        )
-        for current_worker in range(self.num_processes):
-            if self.local_rank == current_worker:
-                checkpoint = super().load_checkpoint(checkpoint_path)
-                self.lightning_module.on_load_checkpoint(checkpoint)
-                self.load_model_state_dict(checkpoint)
-                del checkpoint["state_dict"]
-                self._has_loaded_state_dict = True
-                log.info(f"Rank {self.global_rank}: done loading model states from {checkpoint_path}.")
-            self.barrier()
-        return checkpoint
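
The load_checkpoint override removed above serialized checkpoint loading across the processes of a node, so that only one rank held a full CPU copy of the checkpoint at a time, trading startup time for lower peak host memory. A minimal, framework-agnostic sketch of the same pattern, assuming the checkpoint dict carries a "state_dict" key as Lightning checkpoints do (the function name is illustrative, not Lightning API):

import torch
import torch.distributed as dist


def load_checkpoint_serialized(model, checkpoint_path, local_rank, local_world_size):
    # Each local rank loads the file on its turn; the barrier makes the other
    # ranks wait, so at most one full checkpoint copy sits in host memory per
    # node at any moment.
    checkpoint = None
    for current_worker in range(local_world_size):
        if local_rank == current_worker:
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
            model.load_state_dict(checkpoint["state_dict"])
            del checkpoint["state_dict"]  # free the weights copy once applied
        dist.barrier()
    return checkpoint

With the override gone, the DDP plugin falls back to the base plugin's load_checkpoint, where each rank loads the file independently (see training_type_plugin.py below).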

pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 0 additions & 2 deletions
@@ -781,8 +781,6 @@ def lightning_restore_optimizer_and_schedulers(self) -> bool:
         return False

     def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
-        if "state_dict" not in checkpoint and self._has_loaded_state_dict:
-            return
         # override to do nothing, deepspeed engine already loaded the weights in `load_checkpoint()`
         if self.load_full_weights and self.zero_stage_3:
             self.model_to_device()

pytorch_lightning/plugins/training_type/training_type_plugin.py

Lines changed: 0 additions & 1 deletion
@@ -195,7 +195,6 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
         return self.checkpoint_io.load_checkpoint(checkpoint_path)

     def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
-        self.lightning_module.on_load_checkpoint(checkpoint)
         self.lightning_module.load_state_dict(checkpoint["state_dict"])

     def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None:

pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 4 additions & 0 deletions
@@ -141,6 +141,9 @@ def restore_model(self) -> None:

         model = self.trainer.lightning_module

+        # hook: give user access to checkpoint if needed.
+        model.on_load_checkpoint(self._loaded_checkpoint)
+
         # call hpc specific hook
         if self.hpc_resume_path is not None:
             model.on_hpc_load(self._loaded_checkpoint)
@@ -160,6 +163,7 @@ def restore_model_weights(self, checkpoint_path: Optional[_PATH]) -> None:
         if checkpoint_path is not None:
             checkpoint = self._load_and_validate_checkpoint(checkpoint_path)

+        self.trainer.lightning_module.on_load_checkpoint(checkpoint)
         self.trainer.training_type_plugin.load_model_state_dict(checkpoint)

     def restore_training_state(self) -> None:
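
With the revert, the checkpoint connector calls the module's on_load_checkpoint hook directly again (in restore_model and restore_model_weights) instead of relying on the plugin's load_model_state_dict to do it. For reference, a minimal user-side override of that hook; the restored key name is illustrative:

import pytorch_lightning as pl


class MyModule(pl.LightningModule):
    def on_load_checkpoint(self, checkpoint: dict) -> None:
        # Called with the full checkpoint dict when the Trainer restores this
        # module, so extra state saved alongside the weights can be recovered.
        self.extra_state = checkpoint.get("my_extra_state")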

pytorch_lightning/trainer/trainer.py

Lines changed: 3 additions & 2 deletions
@@ -1029,6 +1029,9 @@ def _run(self, model: "pl.LightningModule") -> Optional[Union[_EVALUATE_OUTPUT,
         self.data_connector.prepare_data()
         self.callback_connector._attach_model_callbacks()

+        if self._ckpt_path and not self.training_type_plugin.restore_checkpoint_after_pre_dispatch:
+            self._load_checkpoint_weights()
+
         # ----------------------------
         # SET UP TRAINING
         # ----------------------------
@@ -1038,8 +1041,6 @@

         # check if we should delay restoring checkpoint till later
         if not self.training_type_plugin.restore_checkpoint_after_pre_dispatch:
-            if self._ckpt_path:
-                self._load_checkpoint_weights()
             self.checkpoint_connector.resume_start()
             self._restore_modules_and_callbacks()

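The trainer change above moves the early _load_checkpoint_weights() call before accelerator setup, still gated on restore_checkpoint_after_pre_dispatch, which lets a plugin defer restoration until after pre-dispatch. A sketch of how a custom plugin could opt into that deferral, assuming a plugin subclass is the right place to override the property the Trainer checks (the class name is illustrative):

from pytorch_lightning.plugins import DDPPlugin


class DelayedRestoreDDPPlugin(DDPPlugin):
    @property
    def restore_checkpoint_after_pre_dispatch(self) -> bool:
        # When True, Trainer._run skips the early _load_checkpoint_weights()
        # call shown above and restores the checkpoint after pre-dispatch.
        return True
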
tests/trainer/connectors/test_checkpoint_connector.py

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ def test_preloaded_checkpoint_lifecycle(tmpdir):
     ckpt_path = trainer.checkpoint_callback.best_model_path
     trainer = Trainer(default_root_dir=tmpdir, max_steps=2, resume_from_checkpoint=ckpt_path)
     connector = trainer.checkpoint_connector
-    trainer.accelerator.connect(model)
     connector.resume_start()
     assert connector.resume_checkpoint_path == ckpt_path
     assert connector._loaded_checkpoint
