Follow up — changes to load model state in checkpoint connector in case of multiple workers #8044 #8515
Diff 1 of 2: fully sharded training type plugin (DDPFullyShardedPlugin)
@@ -12,20 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from typing import Dict, Generator, List, Optional
+import logging
+from pathlib import Path
+from typing import Any, Dict, Generator, List, Optional, Union

 import torch
 from torch import Tensor

 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.training_type.ddp import DDPPlugin
-from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE
+from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE, rank_zero_info
 from pytorch_lightning.utilities.exceptions import MisconfigurationException

 if _FAIRSCALE_FULLY_SHARDED_AVAILABLE:
     from fairscale.nn import default_auto_wrap_policy, enable_wrap
     from fairscale.nn.data_parallel import FullyShardedDataParallel

+log: logging.Logger = logging.getLogger(__name__)
+
+
 class DDPFullyShardedPlugin(DDPPlugin):
     def __init__(
@@ -174,6 +179,42 @@ def model_to_device(self) -> None:
         # ensure we update the device type in the lightning module
         self.lightning_module.to(self.root_device)

+    def lightning_module_state_dict(self) -> Dict[str, Union[Any, Tensor]]:
+        # Currently it is the same as the default TrainingTypePlugin, i.e. return
+        # the full state dict for FSDP; in the future, we will provide a sharded
+        # state dict.
+        return super().lightning_module_state_dict()
+
+    def load_model_state(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
+        """
+        Used by each rank to reload the model weights.
+
+        Args:
+            checkpoint_path: Path to the current checkpoint.
+
+        Returns:
+            checkpoint: The loaded checkpoint, with "state_dict" removed.
+        """
+        checkpoint = {}
+        rank_zero_info(
+            f"FullyShardedDataParallel has {self.num_processes} processes. Serializing model "
+            "state restoration to avoid CPU OOMs."
+        )
+        # Each rank loads the current checkpoint from `checkpoint_path` and restores the
+        # weights while the other ranks wait behind a barrier, so that only one process
+        # holds the full checkpoint in CPU memory at a time.
+        for current_worker in range(self.num_processes):
+            if self.local_rank == current_worker:
+                checkpoint = self.load_checkpoint_file(checkpoint_path)
+                self.on_load_checkpoint(checkpoint)
+                self.load_model_state_dict(checkpoint.pop("state_dict"))
+                log.info(
+                    f"Rank {self.global_rank}: done loading model states from {checkpoint_path}, "
+                    "deleted state_dict from checkpoint."
+                )
+            self.barrier()
+        return checkpoint
+
     @property
     def setup_optimizers_in_pre_dispatch(self) -> bool:
         # Setup optimizers after the Fully Sharded Model has been made

Review comments on this hunk:

- On the load_model_state docstring: docstring ?
- On the line "for current_worker in range(self.num_processes):":
  - Would this break if the number of GPUs at checkpointing / saving time isn't the same? Should we save the world_size + rank in each checkpoint and re-use that information on reload? (See the sketch below these review notes.)
  - Thanks for raising this! Thinking aloud: in the same training session we wouldn't run into this problem, but even when a model checkpoint from a different training session is being fine-tuned in a new session on a different machine, do we envision that the world size would be different? (For our use cases at least, we have config files that keep variables like Trainer(gpus) constant for the same model training.)
  - I think that is definitely something we need to support. For example, I usually use the output from …
  - @tchaton this kind of training "metadata" should get saved with the checkpoint. For example, we will also want to know this for fault tolerance, to fail if the trainer configuration has changed between runs and the user is trying to restore mid-batch.
  - FWIW, I ran a test taking a checkpoint from a previous training session and starting a fresh one: setting trainer.resume_from_checkpoint to the old checkpoint and trainer.gpus=4 instead of 8 (which was the number of GPUs in the original training session), and loading the checkpoint on 4 GPUs didn't break. I can add this feature in another pull request! Would a good place to add the metadata be …
  - Sorry for the late reply! I've filed #9123 to track this.
  - Mind adding a comment to explain what is happening, in case a new reader reaches this code? :)
- On "return checkpoint": I guess returning the checkpoint is not required. Plus docstrings.
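The world_size / topology question in the thread above is tracked in #9123. As a rough, hypothetical sketch only (not code from this pull request), one way to record the distributed topology in the checkpoint and to check it on reload could look like the following; the "distributed_metadata" key and both helper names are invented for this illustration:

from typing import Any, Dict
import warnings


def add_topology_metadata(checkpoint: Dict[str, Any], world_size: int, num_nodes: int) -> Dict[str, Any]:
    # Record the topology the checkpoint was saved with, next to the usual contents.
    checkpoint["distributed_metadata"] = {"world_size": world_size, "num_nodes": num_nodes}
    return checkpoint


def check_topology_metadata(checkpoint: Dict[str, Any], world_size: int, num_nodes: int) -> None:
    # Warn rather than fail when the restore topology differs from the save topology,
    # since the 4-vs-8 GPU experiment described in the thread suggests reloading can still work.
    meta = checkpoint.get("distributed_metadata")
    if meta is not None and (meta["world_size"] != world_size or meta["num_nodes"] != num_nodes):
        warnings.warn(
            f"Checkpoint was saved with world_size={meta['world_size']} and num_nodes={meta['num_nodes']} "
            f"but is being restored with world_size={world_size} and num_nodes={num_nodes}."
        )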
Diff 2 of 2: base training type plugin (TrainingTypePlugin)
@@ -160,6 +160,15 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
     def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
         self.lightning_module.load_state_dict(checkpoint["state_dict"])

+    def on_load_checkpoint(self, checkpoint: Mapping[str, Any]) -> None:
+        self.lightning_module.on_load_checkpoint(checkpoint)
+
+    def load_model_state(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
+        checkpoint = self.load_checkpoint_file(checkpoint_path)
+        self.on_load_checkpoint(checkpoint)
+        self.load_model_state_dict(checkpoint)
+        return checkpoint
+
     def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
         optimizer_states = checkpoint["optimizer_states"]
         for optimizer, opt_state in zip(self.lightning_module.trainer.accelerator.optimizers, optimizer_states):

Review comment on "return checkpoint": same here (the same notes as above about the return value and the missing docstrings).
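For orientation, a caller such as the checkpoint connector could use the new hook roughly as follows. This is a simplified, hypothetical sketch rather than the actual CheckpointConnector code, and it relies only on the plugin methods shown in the two diffs above:

def restore_model_then_training_state(plugin, checkpoint_path):
    # Restore the model weights first; in the fully sharded plugin this call serializes
    # the checkpoint read across ranks behind barriers to avoid CPU OOMs.
    checkpoint = plugin.load_model_state(checkpoint_path)
    # The remaining training state (e.g. optimizer states) can then be restored from the
    # same in-memory checkpoint dict without reading the file again.
    plugin.load_optimizer_state_dict(checkpoint)
    return checkpoint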
Review comment: Let's add num_nodes too.
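If this last comment refers to the rank_zero_info message in the fully sharded plugin (an assumption, since the comment's anchor line is not shown here), the reported topology could be extended along these lines; the helper name and its parameters are illustrative only:

from pytorch_lightning.utilities import rank_zero_info


def log_restore_plan(num_processes: int, num_nodes: int) -> None:
    # Report both the per-node process count and the node count before serialized restoration.
    rank_zero_info(
        f"FullyShardedDataParallel is running {num_processes} processes per node on {num_nodes} node(s). "
        "Serializing model state restoration to avoid CPU OOMs."
    )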