Lightning-AI · ananthsub · Aug 23, 2021 · Aug 10, 2021 · Aug 16, 2021 · Aug 17, 2021
@@ -37,7 +37,7 @@
 from pytorch_lightning.core.optimizer import LightningOptimizer
 from pytorch_lightning.core.saving import ModelIO
 from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import FxValidator
-from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn
+from pytorch_lightning.utilities import _TORCH_SHARDED_TENSOR_AVAILABLE, rank_zero_deprecation, rank_zero_warn
 from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp
@@ -115,6 +115,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         self._metric_attributes: Optional[Dict[int, str]] = None
         self._should_prevent_trainer_and_dataloaders_deepcopy: bool = False
 
+        self._register_sharded_tensor_state_dict_hooks_if_available()
+
         # deprecated, will be removed in 1.6
         self._loaded_optimizer_states_dict = {}
 
@@ -1974,3 +1976,16 @@ def __getstate__(self) -> Dict[str, Any]:
             state.pop("test_dataloader", None)
             state.pop("predict_dataloader", None)
         return state
+
+    def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None:
+        """
+        Registers the ShardedTensor state dict hooks when ShardedTensors are supported, and is a no-op otherwise.
+        These hooks ensure that ShardedTensors are included when saving, and are loaded into the
+        LightningModule correctly.
+        """
+        if _TORCH_SHARDED_TENSOR_AVAILABLE:
+            # imported only behind the availability flag, never at module import time
+            from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook
+
+            self._register_state_dict_hook(state_dict_hook)
+            self._register_load_state_dict_pre_hook(pre_load_state_dict_hook, True)
@@ -47,6 +47,7 @@
     _TORCH_GREATER_EQUAL_1_8,
     _TORCH_GREATER_EQUAL_1_9,
     _TORCH_QUANTIZE_AVAILABLE,
+    _TORCH_SHARDED_TENSOR_AVAILABLE,
     _TORCHTEXT_AVAILABLE,
     _TORCHVISION_AVAILABLE,
     _TPU_AVAILABLE,

@@ -85,6 +85,7 @@ def _compare_version(package: str, op, version) -> bool:
 _OMEGACONF_AVAILABLE = _module_available("omegaconf")
 _POPTORCH_AVAILABLE = _module_available("poptorch")
 _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"])
+_TORCH_SHARDED_TENSOR_AVAILABLE = _compare_version("torch", operator.ge, "1.10.0.dev20210809")
 _TORCHTEXT_AVAILABLE = _module_available("torchtext")
 _TORCHVISION_AVAILABLE = _module_available("torchvision")
 _TORCHMETRICS_LOWER_THAN_0_3 = _compare_version("torchmetrics", operator.lt, "0.3.0")

@@ -13,13 +13,17 @@
 # limitations under the License.
 from unittest.mock import Mock
 
+import pytest
 import torch
+import torch.distributed as dist
 from torch import nn
 from torch.optim import Adam, SGD
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.utilities import _TORCH_SHARDED_TENSOR_AVAILABLE
 from tests.helpers import BoringModel
+from tests.helpers.process_group import single_process_pg
 from tests.helpers.runif import RunIf
 
 
@@ -299,3 +303,39 @@ def assert_device(device: torch.device) -> None:
     assert_device(torch.device("cpu"))
     trainer.predict(model, dataloaders=model.train_dataloader())
     assert_device(torch.device("cpu"))
+
+
+class BoringModelWithShardedTensor(BoringModel):  # BoringModel with an extra `sharded_tensor` attribute
+    def __init__(self, spec):
+        super().__init__()
+        self.sharded_tensor = dist._sharded_tensor.empty(spec, 10, 20)  # presumably 10 x 20, laid out per `spec` — confirm
+        self.sharded_tensor.local_shards()[0].tensor.fill_(0)  # zero the first local shard's tensor
+
+
+@pytest.mark.skipif(
+    not _TORCH_SHARDED_TENSOR_AVAILABLE, reason="Test requires the torch version to support `ShardedTensor`"
+)
+def test_sharded_tensor_state_dict(tmpdir):
+    """Check that a ``ShardedTensor`` attribute appears in ``state_dict()`` and is loaded by ``load_state_dict()``."""
+
+    def first_shard(model):
+        # resolved on every call, so each assertion reads the model's current shard
+        return model.sharded_tensor.local_shards()[0].tensor
+
+    # the sharded tensor factory functions depend on an initialized global process group
+    with single_process_pg():
+        spec = dist._sharding_spec.ChunkShardingSpec(dim=0, placements=["rank:0/cpu"])
+
+        m_0 = BoringModelWithShardedTensor(spec)
+        first_shard(m_0).fill_(1)
+        assert "sharded_tensor" in m_0.state_dict(), 'Expect "sharded_tensor" to appear in the state dict'
+
+        m_1 = BoringModelWithShardedTensor(spec)
+        assert not torch.allclose(
+            first_shard(m_1), first_shard(m_0)
+        ), "Expect the shards to be different before `m_1` loading `m_0`'s state dict"
+
+        m_1.load_state_dict(m_0.state_dict(), strict=False)
+        assert torch.allclose(
+            first_shard(m_1), first_shard(m_0)
+        ), "Expect the shards to be same after `m_1` loading `m_0`'s state dict"
diff --git a/tests/helpers/process_group.py b/tests/helpers/process_group.py
@@ -0,0 +1,44 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from contextlib import contextmanager
+
+import torch.distributed as dist
+
+from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port
+
+
+@contextmanager
+def single_process_pg():
+    """
+    Initialize the default process group with only the current process for testing purposes.
+    On exit the group is destroyed and ``os.environ`` is restored, also when setting it up fails.
+    """
+    if dist.is_initialized():
+        raise RuntimeError("Can't use `single_process_pg ` when the default process group is already initialized.")
+    orig_environ = os.environ.copy()
+    try:  # env is mutated inside the `try` so that a failing setup cannot leak these variables
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(find_free_network_port())
+        os.environ["RANK"] = "0"
+        os.environ["WORLD_SIZE"] = "1"
+        dist.init_process_group("gloo")
+        try:
+            yield
+        finally:
+            dist.destroy_process_group()  # only reached once the group was actually created
+    finally:
+        os.environ.clear()
+        os.environ.update(orig_environ)