Follow up — changes to load model state in checkpoint connector in case of multiple workers #8044 #8515
```diff
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
+import logging
+from pathlib import Path
 from typing import Any, Dict, Generator, List, Optional, Union

 import torch
```
```diff
@@ -27,6 +29,8 @@
 from fairscale.nn import default_auto_wrap_policy, enable_wrap
 from fairscale.nn.data_parallel import FullyShardedDataParallel

+log: logging.Logger = logging.getLogger(__name__)
+

 class DDPFullyShardedPlugin(DDPPlugin):
```
```diff
@@ -178,6 +182,19 @@ def lightning_module_state_dict(self) -> Dict[str, Union[Any, Tensor]]:
         # state dict.
         return super().lightning_module_state_dict()

+    def serialized_restore_model_state(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
+        checkpoint = {}
+        log.info(f"FullyShardedDataParallel has {self.num_processes} processes. Serializing to avoid CPU OOMs.")
+
+        for current_worker in range(self.num_processes):
```
> Review comment: Would this break if the number of GPUs at checkpointing/saving time isn't the same? Should we save the world_size + rank in each checkpoint and reuse that information on reload?
>
> Reply: Thanks for raising this! Thinking aloud: in the same training session we wouldn't run into this problem, but even in the case where a model checkpoint from a different training session is being fine-tuned in a new training session on a different machine, do we envision that the world size would be different? (For our use cases at least, we have config files that keep variables like Trainer(gpus) constant for the same model training.)
>
> Reply: I think that is definitely something we need to support. For example, I usually use the output from …
>
> Reply: @tchaton this kind of training "metadata" should get saved with the checkpoint. For example, we will also want to know this for fault tolerance, to fail if the trainer configuration has changed between runs and the user is trying to restore mid-batch.
>
> Reply: FWIW, I ran a test taking a checkpoint from a previous training session and starting a fresh one: setting trainer.resume_from_checkpoint to the old checkpoint and trainer.gpus=4 instead of 8 (which was the number of GPUs in the original training session), and loading the checkpoint on 4 GPUs didn't break. I can add this feature in another pull request! Would a good place to add the metadata be …
>
> Reply: Sorry for the late reply! I've filed #9123 to track this.
>
> Reply: Mind adding a comment to explain what is happening, in case a new reader reaches this code :)
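Since the thread above converged on storing training metadata with the checkpoint, here is a minimal sketch of what that could look like. The key name (`world_size`), the helper names, and the idea of calling them from an `on_save_checkpoint`-style hook are assumptions made for illustration; they are not part of this PR or of the Lightning API.

```python
# Hypothetical sketch: persist the world size alongside the checkpoint and
# verify it before restoring. Key and helper names are illustrative only.
from typing import Any, Dict


def add_world_size_metadata(checkpoint: Dict[str, Any], world_size: int) -> None:
    # Would run while the checkpoint dict is being assembled, before it is written.
    checkpoint["world_size"] = world_size


def check_world_size_metadata(checkpoint: Dict[str, Any], current_world_size: int) -> None:
    # Would run right after the checkpoint file is loaded, before any state is restored.
    saved = checkpoint.get("world_size")
    if saved is not None and saved != current_world_size:
        raise RuntimeError(
            f"Checkpoint was written with world_size={saved}, "
            f"but this run uses world_size={current_world_size}."
        )
```

In the resume test described above (a checkpoint written on 8 GPUs, resumed with trainer.gpus=4), a check like this would turn the implicit assumption into an explicit error, or into a warning if resharding turns out to be supported.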
```diff
+            if self.local_rank == current_worker:
+                checkpoint = super().load_checkpoint_file(checkpoint_path)
```
> Review comment: Does this assume the same checkpoint path for all workers?
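If that assumption ever needs to be enforced rather than assumed (for example with rank-local scratch directories), the path could be broadcast from rank 0 before the loop. Below is a minimal sketch using plain `torch.distributed`, assuming the process group is already initialized; the helper name is made up for illustration.

```python
# Sketch: make every rank use the checkpoint path chosen by rank 0.
# Assumes torch.distributed is already initialized (PyTorch >= 1.8 for
# broadcast_object_list). Helper name is illustrative only.
import torch.distributed as dist


def agree_on_checkpoint_path(checkpoint_path: str) -> str:
    paths = [checkpoint_path]
    # On non-source ranks the list contents are overwritten with rank 0's value.
    dist.broadcast_object_list(paths, src=0)
    return paths[0]
```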
```diff
+                self.lightning_module.on_load_checkpoint(checkpoint)
+                self.load_model_state_dict(checkpoint)
+                log.info(f"Rank {self.global_rank}: done loading model states from {checkpoint_path}.")
+                del checkpoint["state_dict"]
+            self.barrier()
```
> Review comment: This is pretty smart :) One concern: the responsibility of calling the hooks is now shifted to the plugin. Do we want to allow that?
>
> Reply: In this case, I don't see a problem with it. The only thing I am afraid of is that we may set a precedent here, and that might lead to this pattern in places where we don't want it. What do you think @SeanNaren @awaelchli @ananthsub @carmocca?
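For context on that concern, the usual restore sequence keeps both calls on the trainer side; the sketch below paraphrases the behaviour described in this thread rather than copying the checkpoint connector verbatim. With `serialized_restore_model_state`, both calls move into the plugin's per-rank loop.

```python
# Simplified, paraphrased sketch of the usual restore sequence (not the actual
# CheckpointConnector source): the trainer side runs the LightningModule hook
# and then asks the training-type plugin to restore the weights.
def restore_model_state(trainer, checkpoint: dict) -> None:
    # LightningModule.on_load_checkpoint hook
    trainer.lightning_module.on_load_checkpoint(checkpoint)
    # the plugin loads the (possibly sharded) weights into the wrapped model
    trainer.training_type_plugin.load_model_state_dict(checkpoint)
```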
```diff
+
+        return checkpoint
+
```
> Review comment: I guess returning the checkpoint is not required. Plus: docstrings.
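Picking up the requests above for an explanatory comment and a docstring, here is one possible annotated version of the method, shown as it would sit inside `DDPFullyShardedPlugin` (so `log`, `self`, and `super()` refer to the surrounding module and class). The logic is unchanged from the diff; the docstring and comments are only suggestions.

```python
def serialized_restore_model_state(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:
    """Load the checkpoint file one rank at a time to avoid CPU OOM.

    Each process loads the full checkpoint, restores its model state, and then
    drops the large ``state_dict`` entry before the next rank starts, so at most
    one full copy of the state dict sits in CPU memory per node at any time.
    """
    checkpoint: Dict[str, Any] = {}
    log.info(f"FullyShardedDataParallel has {self.num_processes} processes. Serializing to avoid CPU OOMs.")

    for current_worker in range(self.num_processes):
        # Only the rank whose turn it is touches the checkpoint file.
        if self.local_rank == current_worker:
            checkpoint = super().load_checkpoint_file(checkpoint_path)
            self.lightning_module.on_load_checkpoint(checkpoint)
            self.load_model_state_dict(checkpoint)
            log.info(f"Rank {self.global_rank}: done loading model states from {checkpoint_path}.")
            # Free the model weights; the rest of the checkpoint (optimizer
            # states, loop state, ...) is still needed by the caller.
            del checkpoint["state_dict"]
        # Everyone waits here, so the next rank only starts once this one is done.
        self.barrier()

    return checkpoint
```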
```diff
     @property
     def setup_optimizers_in_pre_dispatch(self) -> bool:
         # Setup optimizers after the Fully Sharded Model has been made
```
> Review comment: Docstring?
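A possible docstring for the property, in response to that comment. This assumes the property simply returns True, as the existing inline comment implies; the wording is a suggestion only.

```python
@property
def setup_optimizers_in_pre_dispatch(self) -> bool:
    """Defer optimizer setup until the model is wrapped in FullyShardedDataParallel.

    The optimizers have to be created against the flattened, sharded parameters,
    so they are set up in ``pre_dispatch`` rather than during the regular setup phase.
    """
    return True
```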