Lite's collectives feature #14996

Closed
wants to merge 48 commits into from
Commits (48). The diff below shows changes from 35 commits.
96dad36  Collectives initial commit (carmocca, Oct 4, 2022)
84a3fab  collective methods copy (Oct 4, 2022)
8c2c03e  collective method cm (Oct 4, 2022)
e129a3e  more collectives (Oct 5, 2022)
85d0d95  replace str by valid group (Oct 5, 2022)
334abfb  new design (Oct 5, 2022)
1b04a5a  apply suggestions (Oct 5, 2022)
92ff4ba  carlos behind my back (Oct 5, 2022)
7c11799  carlos is not here (Oct 5, 2022)
42f62a3  Add FIXME. Move type annotation (carmocca, Oct 5, 2022)
44b3d17  Merge branch 'master' into feat/collectives (carmocca, Oct 5, 2022)
776a39b  Remove object and some of the stranger ones (carmocca, Oct 5, 2022)
35d3eda  torch_collective (Oct 5, 2022)
1a1c6a6  create_group() (carmocca, Oct 5, 2022)
ee74638  Fix staticmethod (carmocca, Oct 5, 2022)
69f5f25  mypy changes (Oct 5, 2022)
5892def  Pull types into types file (carmocca, Oct 5, 2022)
e6d0e29  Suggestion to not create magically (carmocca, Oct 5, 2022)
cec0130  small reorg (Oct 5, 2022)
2c26ee4  move default_pg_timeout (carmocca, Oct 5, 2022)
43ceaff  is deepspeed really just different import? (Oct 5, 2022)
916eb6c  typo fix (Oct 5, 2022)
2a846c0  guarding (Oct 5, 2022)
373e69f  single device (Oct 5, 2022)
691b84e  single device with more carefullness (Oct 5, 2022)
36265da  fix imports (Oct 5, 2022)
20a12a8  horovod stub (Oct 5, 2022)
cc33633  bloody mypy (Oct 5, 2022)
abd8a3d  destroy_group (Oct 5, 2022)
53c3c34  circular import (Oct 5, 2022)
1d6ef13  rename methods (Oct 5, 2022)
377caea  XLACollective (carmocca, Oct 5, 2022)
c06ce37  Remove reduce from strategies (carmocca, Oct 5, 2022)
77410c6  Revert "circular import" (carmocca, Oct 5, 2022)
84611e7  Revert Ota's dep resolution in favor of this (carmocca, Oct 5, 2022)
4fdb716  convert op privately (Oct 5, 2022)
4b8e7d3  no PL integration yet (Oct 5, 2022)
f36d13b  Merge branch 'master' into feat/collectives (Oct 6, 2022)
2af9ded  add xla collective (Oct 6, 2022)
158f97f  send and recv (Oct 6, 2022)
464164e  fix failing ci (Oct 6, 2022)
5644f5b  add very basic test (Oct 6, 2022)
f5bbfac  fix broken test (Oct 6, 2022)
3e961f8  empty file for now (Oct 6, 2022)
dd648f3  Merge branch 'master' into feat/collectives (carmocca, Oct 8, 2022)
90d57c7  Merge branch 'master' into feat/collectives (carmocca, Oct 21, 2022)
d4753e3  Names (carmocca, Oct 21, 2022)
7b4da93  Fixes (carmocca, Oct 21, 2022)

3 changes: 2 additions & 1 deletion src/lightning_lite/connector.py
@@ -31,6 +31,7 @@
TPUBf16Precision,
TPUPrecision,
)
from lightning_lite.plugins.collectives import Collective
from lightning_lite.plugins.environments import (
ClusterEnvironment,
KubeflowEnvironment,
@@ -57,7 +58,7 @@
from lightning_lite.utilities.device_parser import determine_root_gpu_device
from lightning_lite.utilities.imports import _HPU_AVAILABLE, _IPU_AVAILABLE, _IS_INTERACTIVE

-_PLUGIN = Union[Precision, ClusterEnvironment, CheckpointIO]
+_PLUGIN = Union[Precision, ClusterEnvironment, CheckpointIO, Collective]
_PLUGIN_INPUT = Union[_PLUGIN, str]
_PRECISION_INPUT = Literal[16, 32, 64, "bf16"]

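Widening _PLUGIN means a Collective instance is now an accepted plugin type for Lite's connector, alongside precision, cluster-environment, and checkpoint-IO plugins. Below is a minimal, hypothetical sketch of that intent; whether the connector does anything with the instance beyond accepting the type is not shown in this diff.

from lightning_lite import LightningLite
from lightning_lite.plugins.collectives import TorchCollective

# Hypothetical: the widened Union above only makes this type-check;
# any further handling of the collective by the connector is assumed.
lite = LightningLite(accelerator="cpu", devices=2, strategy="ddp", plugins=TorchCollective())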
17 changes: 12 additions & 5 deletions src/lightning_lite/plugins/__init__.py
@@ -11,7 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from lightning_lite.plugins.collectives.collective import Collective
from lightning_lite.plugins.collectives.deepspeed_collective import DeepSpeedCollective
from lightning_lite.plugins.collectives.single_device_collective import SingleDeviceCollective
from lightning_lite.plugins.collectives.torch_collective import TorchCollective
from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO
from lightning_lite.plugins.io.torch_plugin import TorchCheckpointIO
@@ -24,14 +27,18 @@
from lightning_lite.plugins.precision.tpu_bf16 import TPUBf16Precision

__all__ = [
-   "ClusterEnvironment",
    "CheckpointIO",
-   "TorchCheckpointIO",
-   "XLACheckpointIO",
+   "ClusterEnvironment",
+   "Collective",
+   "DeepSpeedCollective",
    "DeepSpeedPrecision",
    "DoublePrecision",
    "NativeMixedPrecision",
    "Precision",
-   "TPUPrecision",
+   "SingleDeviceCollective",
+   "TorchCheckpointIO",
+   "TorchCollective",
    "TPUBf16Precision",
+   "TPUPrecision",
+   "XLACheckpointIO",
]
11 changes: 11 additions & 0 deletions src/lightning_lite/plugins/collectives/__init__.py
@@ -0,0 +1,11 @@
from lightning_lite.plugins.collectives.collective import Collective
from lightning_lite.plugins.collectives.deepspeed_collective import DeepSpeedCollective
from lightning_lite.plugins.collectives.single_device_collective import SingleDeviceCollective
from lightning_lite.plugins.collectives.torch_collective import TorchCollective

__all__ = [
"Collective",
"DeepSpeedCollective",
"TorchCollective",
"SingleDeviceCollective",
]
138 changes: 138 additions & 0 deletions src/lightning_lite/plugins/collectives/collective.py
@@ -0,0 +1,138 @@
from abc import ABC, abstractmethod
from typing import Any, List, Optional

import torch
from typing_extensions import Self

from lightning_lite.utilities.types import CollectibleGroup


class Collective(ABC):
def __init__(self, instantiate_group: bool = False, **group_kwargs: Any) -> None:
self._group_kwargs = group_kwargs
self._group: Optional[CollectibleGroup] = None
if instantiate_group:
self.create_group()

def create_group(self, **kwargs: Any) -> Self: # type: ignore[valid-type]
if self._group is not None:
raise RuntimeError(f"{type(self).__name__} already owns a group.")
self._group_kwargs.update(kwargs)
self._group = self.init_group(**self._group_kwargs)
return self

@property
def group(self) -> CollectibleGroup:
if self._group is None:
raise RuntimeError(
f"{type(self).__name__} does not own a group. HINT: try `collective.create_group().group`"
)
return self._group

@property
@abstractmethod
def rank(self) -> int:
pass

@property
@abstractmethod
def world_size(self) -> int:
pass

@staticmethod
@abstractmethod
def init_group(
**kwargs: Any,
) -> CollectibleGroup:
pass

def teardown(self) -> None:
if self._group is None:
raise RuntimeError(f"{type(self).__name__} does not own a group to destroy.")
self.destroy_group(self._group)
self._group = None

@staticmethod
@abstractmethod
def destroy_group(group: CollectibleGroup) -> None:
pass

@staticmethod
@abstractmethod
def convert_to_native_op(op: str) -> Any:
...

@abstractmethod
def broadcast(
self,
tensor: torch.Tensor,
src: int,
) -> torch.Tensor:
pass

@abstractmethod
def all_reduce(
self,
tensor: torch.Tensor,
op: Any,
) -> torch.Tensor:
pass

@abstractmethod
def reduce(
self,
tensor: torch.Tensor,
dst: int,
op: Any,
) -> torch.Tensor:
pass

@abstractmethod
def all_gather(
self,
tensor_list: List[torch.Tensor],
tensor: torch.Tensor,
) -> List[torch.Tensor]:
pass

@abstractmethod
def gather(
self,
tensor: torch.Tensor,
gather_list: Optional[List[torch.Tensor]] = None,
dst: int = 0,
) -> Optional[List[torch.Tensor]]:
pass

@abstractmethod
def scatter(
self,
tensor: torch.Tensor,
scatter_list: Optional[List[torch.Tensor]] = None,
src: int = 0,
) -> torch.Tensor:
pass

@abstractmethod
def reduce_scatter(
self,
output: torch.Tensor,
input_list: List[torch.Tensor],
op: Any,
) -> torch.Tensor:
pass

@abstractmethod
def all_to_all(
self,
output_tensor_list: List[torch.Tensor],
input_tensor_list: List[torch.Tensor],
) -> List[torch.Tensor]:
pass

@abstractmethod
def barrier(
self,
device_ids: Optional[List[int]] = None,
) -> None:
pass
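
Because every file in this PR is a new addition, the quickest way to read the base class above is through a concrete subclass. The following is a hypothetical single-process, pass-through implementation (not part of the PR) that satisfies each abstract method, followed by the lazy create_group()/teardown() lifecycle the base class enforces.

from typing import Any, List, Optional

import torch

from lightning_lite.plugins.collectives.collective import Collective
from lightning_lite.utilities.types import CollectibleGroup


class _PassThroughCollective(Collective):
    """Hypothetical single-process collective: every op returns its input unchanged."""

    @property
    def rank(self) -> int:
        return 0

    @property
    def world_size(self) -> int:
        return 1

    @staticmethod
    def init_group(**kwargs: Any) -> CollectibleGroup:
        # There is no real process group to create; any placeholder object will do.
        return object()  # type: ignore[return-value]

    @staticmethod
    def destroy_group(group: CollectibleGroup) -> None:
        pass

    @staticmethod
    def convert_to_native_op(op: str) -> Any:
        return op

    def broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
        return tensor

    def all_reduce(self, tensor: torch.Tensor, op: Any = "sum") -> torch.Tensor:
        return tensor

    def reduce(self, tensor: torch.Tensor, dst: int, op: Any = "sum") -> torch.Tensor:
        return tensor

    def all_gather(self, tensor_list: List[torch.Tensor], tensor: torch.Tensor) -> List[torch.Tensor]:
        tensor_list[:] = [tensor]
        return tensor_list

    def gather(self, tensor: torch.Tensor, gather_list: Optional[List[torch.Tensor]] = None, dst: int = 0) -> Optional[List[torch.Tensor]]:
        return [tensor]

    def scatter(self, tensor: torch.Tensor, scatter_list: Optional[List[torch.Tensor]] = None, src: int = 0) -> torch.Tensor:
        return tensor

    def reduce_scatter(self, output: torch.Tensor, input_list: List[torch.Tensor], op: Any = "sum") -> torch.Tensor:
        output.copy_(input_list[0])
        return output

    def all_to_all(self, output_tensor_list: List[torch.Tensor], input_tensor_list: List[torch.Tensor]) -> List[torch.Tensor]:
        return input_tensor_list

    def barrier(self, device_ids: Optional[List[int]] = None) -> None:
        pass


# Lifecycle enforced by the base class: the group is created lazily and owned by the instance.
collective = _PassThroughCollective()    # no group yet; accessing `collective.group` would raise
collective = collective.create_group()   # `create_group()` returns `self`, so calls can be chained
assert collective.rank == 0 and collective.world_size == 1
result = collective.all_reduce(torch.ones(1), op="sum")
collective.teardown()                    # destroys the owned group and clears the reference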
152 changes: 152 additions & 0 deletions src/lightning_lite/plugins/collectives/deepspeed_collective.py
@@ -0,0 +1,152 @@
import datetime
from typing import Any, List, Optional

import torch

from lightning_lite.plugins.collectives.collective import Collective
from lightning_lite.strategies.deepspeed import _DEEPSPEED_AVAILABLE
from lightning_lite.utilities.types import CollectibleGroup

if _DEEPSPEED_AVAILABLE:
Review comment (PR author): In the spirit of #12786, we should make these local imports

import deepspeed.comm as dist


class DeepSpeedCollective(Collective):
def __init__(self, instantiate_group: bool = False, **group_kwargs: Any) -> None:
if not _DEEPSPEED_AVAILABLE:
raise RuntimeError("Torch distributed is not available.")
super().__init__(instantiate_group, **group_kwargs)

@property
def rank(self) -> int:
return dist.get_rank(self.group)

@property
def world_size(self) -> int:
return dist.get_world_size(self.group)

@staticmethod
def init_group(
**kwargs: Any,
) -> CollectibleGroup:
return dist.init_process_group(**kwargs)

@staticmethod
def destroy_group(group: CollectibleGroup) -> None:
dist.destroy_process_group(group)

def broadcast(
self,
tensor: torch.Tensor,
src: int,
) -> torch.Tensor:
dist.broadcast(tensor, src, group=self.group)
return tensor

def all_reduce(
self,
tensor: torch.Tensor,
op: dist.ReduceOp = dist.ReduceOp.SUM,
) -> torch.Tensor:
dist.all_reduce(tensor, op=op, group=self.group)
return tensor

def reduce(
self,
tensor: torch.Tensor,
dst: int,
op: dist.ReduceOp = dist.ReduceOp.SUM,
) -> torch.Tensor:
dist.reduce(tensor, dst, op=op, group=self.group)
return tensor

def all_gather(
self,
tensor_list: List[torch.Tensor],
tensor: torch.Tensor,
) -> List[torch.Tensor]:
dist.all_gather(tensor_list, tensor, group=self.group)
return tensor_list

def gather(
self,
tensor: torch.Tensor,
gather_list: Optional[List[torch.Tensor]] = None,
dst: int = 0,
) -> Optional[List[torch.Tensor]]:
dist.gather(tensor, gather_list, dst, group=self.group)
return gather_list

def scatter(
self,
tensor: torch.Tensor,
scatter_list: Optional[List[torch.Tensor]] = None,
src: int = 0,
) -> torch.Tensor:
dist.scatter(tensor, scatter_list, src, group=self.group)
return tensor

def reduce_scatter(
self,
output: torch.Tensor,
input_list: List[torch.Tensor],
op: dist.ReduceOp = dist.ReduceOp.SUM,
) -> torch.Tensor:
dist.reduce_scatter(output, input_list, op=op, group=self.group)
return output

def all_to_all(
self,
output_tensor_list: List[torch.Tensor],
input_tensor_list: List[torch.Tensor],
) -> List[torch.Tensor]:
dist.all_to_all(output_tensor_list, input_tensor_list, group=self.group)
return output_tensor_list

def barrier(
self,
device_ids: Optional[List[int]] = None,
) -> None:
dist.barrier(group=self.group, device_ids=device_ids)

def all_gather_object(
self,
object_list: List[Any],
object: Any,
) -> List[Any]:
dist.all_gather_object(object_list, object, group=self.group)
return object_list

def broadcast_object_list(
self,
object_list: List[Any],
src: int,
device: Optional[torch.device] = None,
) -> List[Any]:
dist.broadcast_object_list(object_list, src, group=self.group, device=device)
return object_list

def gather_object(
self,
obj: Any,
object_gather_list: Optional[List[Any]] = None,
dst: int = 0,
) -> Optional[List[Any]]:
dist.gather_object(obj, object_gather_list, dst, group=self.group)
return object_gather_list

def scatter_object_list(
self,
scatter_object_output_list: List[Any],
scatter_object_input_list: Optional[List[Any]],
src: int = 0,
) -> List[Any]:
dist.scatter_object_list(scatter_object_output_list, scatter_object_input_list, src, group=self.group)
return scatter_object_output_list

def monitored_barrier(
self,
timeout: Optional[datetime.timedelta] = None,
wait_all_ranks: bool = False,
) -> None:
dist.monitored_barrier(group=self.group, timeout=timeout, wait_all_ranks=wait_all_ranks)
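
For orientation, here is a short hypothetical usage sketch of the class above. It assumes DeepSpeed's distributed environment is already bootstrapped for the current process (for example via deepspeed.init_distributed()) and that whatever keyword arguments your setup requires are the ones forwarded verbatim to dist.init_process_group by create_group(); the point is the lazy group lifecycle and the fact that the tensor collectives operate in place and return their inputs.

import torch

from lightning_lite.plugins.collectives import DeepSpeedCollective

collective = DeepSpeedCollective()             # lazy: no process group created yet
collective.create_group()                      # forwards the stored kwargs to dist.init_process_group
print(collective.rank, collective.world_size)  # queried from deepspeed.comm for the owned group
tensor = torch.ones(1)
out = collective.all_reduce(tensor)            # in place: `out is tensor`, now holding the sum across ranks
collective.teardown()                          # destroys the group and drops the reference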