
Commit d61a5ab

3/n Consolidate collective functions - Integrate with TTPs
1 parent 54310bc commit d61a5ab

24 files changed (+129, -324 lines)

pytorch_lightning/accelerators/accelerator.py

Lines changed: 3 additions & 3 deletions
@@ -338,7 +338,7 @@ def lightning_module_state_dict(self) -> Dict[str, Union[Any, Tensor]]:
         return self.training_type_plugin.lightning_module_state_dict()
 
     def barrier(self, name: Optional[str] = None) -> None:
-        self.training_type_plugin.barrier(name=name)
+        self.training_type_plugin.collective.barrier(name=name)
 
     def broadcast(self, obj: object, src: int = 0) -> object:
         """Broadcasts an object to all processes, such that the src object is broadcast to all other ranks if
@@ -348,7 +348,7 @@ def broadcast(self, obj: object, src: int = 0) -> object:
             obj: Object to broadcast to all process, usually a tensor or collection of tensors.
            src: The source rank of which the object will be broadcast from
        """
-        return self.training_type_plugin.broadcast(obj, src)
+        return self.training_type_plugin.collective.broadcast(obj, src)
 
     def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
         """Function to gather a tensor from several distributed processes.
@@ -361,7 +361,7 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
         Return:
             A tensor of shape (world_size, batch, ...)
        """
-        return self.training_type_plugin.all_gather(tensor, group=group, sync_grads=sync_grads)
+        return self.training_type_plugin.collective.all_gather(tensor, group=group, sync_grads=sync_grads)
 
     def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
         """Wraps the dataloader if necessary.

pytorch_lightning/callbacks/early_stopping.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def _run_early_stopping_check(self, trainer: "pl.Trainer") -> None:
         should_stop, reason = self._evaluate_stopping_criteria(current)
 
         # stop every ddp process if any world process decides to stop
-        should_stop = trainer.training_type_plugin.reduce_boolean_decision(should_stop)
+        should_stop = trainer.training_type_plugin.collective.reduce_boolean_decision(should_stop)
         trainer.should_stop = trainer.should_stop or should_stop
         if should_stop:
             self.stopped_epoch = trainer.current_epoch
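
For reference, the semantics this call relies on: every rank contributes a vote and all ranks receive the same result, so no process stops while another keeps training. The snippet below simulates the distributed SUM locally; it is an illustration only, not an actual multi-process run.

import torch

votes_per_rank = [True, False]               # e.g. rank 0 wants to stop, rank 1 does not
world_size = len(votes_per_rank)
summed = torch.tensor([int(v) for v in votes_per_rank]).sum()

# TorchCollective.reduce_boolean_decision compares the SUM with world_size,
# so the shared decision is True only when every rank voted True.
should_stop = bool(summed == world_size)     # False here: both ranks keep training in lockstep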

pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 6 additions & 4 deletions
@@ -294,7 +294,7 @@ def on_train_batch_end(
             skip_time = prev_time_check is None or (now - prev_time_check) < train_time_interval.total_seconds()
             # in case we have time differences across ranks
             # broadcast the decision on whether to checkpoint from rank 0 to avoid possible hangs
-            skip_time = trainer.training_type_plugin.broadcast(skip_time)
+            skip_time = trainer.training_type_plugin.collective.broadcast(skip_time)
 
         if skip_batch and skip_time:
             return
@@ -509,7 +509,9 @@ def check_monitor_top_k(self, trainer: "pl.Trainer", current: Optional[torch.Tensor] = None) -> bool:
         should_update_best_and_save = monitor_op(current, self.best_k_models[self.kth_best_model_path])
 
         # If using multiple devices, make sure all processes are unanimous on the decision.
-        should_update_best_and_save = trainer.training_type_plugin.reduce_boolean_decision(should_update_best_and_save)
+        should_update_best_and_save = trainer.training_type_plugin.collective.reduce_boolean_decision(
+            should_update_best_and_save
+        )
 
         return should_update_best_and_save
 
@@ -612,7 +614,7 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:
         else:
             ckpt_path = os.path.join(trainer.weights_save_path, "checkpoints")
 
-        ckpt_path = trainer.training_type_plugin.broadcast(ckpt_path)
+        ckpt_path = trainer.training_type_plugin.collective.broadcast(ckpt_path)
 
         self.dirpath = ckpt_path
 
@@ -748,4 +750,4 @@ def file_exists(self, filepath: Union[str, Path], trainer: "pl.Trainer") -> bool:
         """Checks if a file exists on rank 0 and broadcasts the result to all other ranks, preventing the internal
         state to diverge between ranks."""
         exists = self._fs.exists(filepath)
-        return trainer.training_type_plugin.broadcast(exists)
+        return trainer.training_type_plugin.collective.broadcast(exists)
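
All of these checkpoint call sites use collective.broadcast for the same reason: a value computed on rank 0 (a timestamp comparison, a resolved directory, a filesystem check) must be adopted by every rank so the callback state never diverges. A minimal sketch of the pattern, using a hypothetical helper name rather than the callback method itself:

import os


def file_exists_on_all_ranks(trainer, filepath: str) -> bool:
    # Each rank checks its own filesystem, but broadcast(obj, src=0) replaces the
    # local value with rank 0's answer on every other rank, so all ranks take the
    # same code path afterwards and no rank blocks in a later collective call.
    exists = os.path.exists(filepath)
    return trainer.training_type_plugin.collective.broadcast(exists)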

pytorch_lightning/callbacks/xla_stats_monitor.py

Lines changed: 4 additions & 4 deletions
@@ -67,7 +67,7 @@ def on_train_start(self, trainer, pl_module) -> None:
            )
 
         memory_info = xm.get_memory_info(pl_module.device)
-        total_memory = trainer.training_type_plugin.reduce(memory_info["kb_total"]) * 0.001
+        total_memory = trainer.training_type_plugin.collective.reduce(memory_info["kb_total"]) * 0.001
         rank_zero_info(f"Average Total memory: {total_memory:.2f} MB")
 
     def on_train_epoch_start(self, trainer, pl_module) -> None:
@@ -81,9 +81,9 @@ def on_train_epoch_end(self, trainer, pl_module) -> None:
         free_memory = memory_info["kb_free"]
         peak_memory = memory_info["kb_total"] - free_memory
 
-        free_memory = trainer.training_type_plugin.reduce(free_memory) * 0.001
-        peak_memory = trainer.training_type_plugin.reduce(peak_memory) * 0.001
-        epoch_time = trainer.training_type_plugin.reduce(epoch_time)
+        free_memory = trainer.training_type_plugin.collective.reduce(free_memory) * 0.001
+        peak_memory = trainer.training_type_plugin.collective.reduce(peak_memory) * 0.001
+        epoch_time = trainer.training_type_plugin.collective.reduce(epoch_time)
 
         logs["avg. free memory (MB)"] = free_memory
         logs["avg. peak memory (MB)"] = peak_memory

pytorch_lightning/core/lightning.py

Lines changed: 1 addition & 1 deletion
@@ -466,7 +466,7 @@ def log(
             dataloader_idx=(self._current_dataloader_idx if add_dataloader_idx else None),
             batch_size=batch_size,
             sync_dist=sync_dist and distributed_available(),
-            sync_dist_fn=self.trainer.training_type_plugin.reduce or sync_ddp,
+            sync_dist_fn=self.trainer.training_type_plugin.collective.reduce or sync_ddp,
             sync_dist_group=sync_dist_group,
             metric_attribute=metric_attribute,
             rank_zero_only=rank_zero_only,
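
From a user's perspective nothing changes here: self.log(..., sync_dist=True) still reduces the value across processes, only the function it binds to now lives on the collective. A usage sketch inside a hypothetical LightningModule (compute_loss is an assumed helper, not part of this diff):

import pytorch_lightning as pl


class LitModel(pl.LightningModule):
    def validation_step(self, batch, batch_idx):
        loss = self.compute_loss(batch)  # hypothetical helper
        # sync_dist=True asks the ResultCollection to reduce the value across
        # processes; after this commit the bound sync function is
        # trainer.training_type_plugin.collective.reduce instead of plugin.reduce.
        self.log("val_loss", loss, sync_dist=True)
        return loss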

pytorch_lightning/loops/base.py

Lines changed: 3 additions & 1 deletion
@@ -238,7 +238,9 @@ def _load_from_state_dict(
             # On reload, we need to re-attach the `Metric`s back to the `ResultCollection`.
             # The references are provided through the `metric_attributes` dictionary.
             v.load_state_dict(
-                state_dict[prefix + k], metrics=metric_attributes, sync_fn=self.trainer.training_type_plugin.reduce
+                state_dict[prefix + k],
+                metrics=metric_attributes,
+                sync_fn=self.trainer.training_type_plugin.collective.reduce,
             )
 
         if not self.trainer.is_global_zero:

pytorch_lightning/plugins/collective/torch_collective.py

Lines changed: 7 additions & 4 deletions
@@ -107,7 +107,10 @@ def mean(t: torch.Tensor) -> torch.Tensor:
         return tensor
 
     def reduce_boolean_decision(self, decision: bool) -> bool:
-        decision = torch.tensor(int(decision), device=self.device)
-        decision = self.reduce(decision, reduce_op=ReduceOp.SUM)
-        decision = bool(decision == self.world_size)
-        return decision
+        if self.local_reduce:
+            return decision
+        else:
+            decision1 = torch.tensor(int(decision), device=self.device)
+            decision2 = self.reduce(decision1, reduce_op=ReduceOp.SUM)
+            decision = bool(decision2 == self.world_size)
+            return decision
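
Read as a whole, the method after this change behaves roughly like the following. This is a simplified sketch: the surrounding TorchCollective class and its local_reduce, device and world_size attributes and reduce method are assumed from this diff rather than shown in full.

import torch
from pytorch_lightning.utilities.distributed import ReduceOp


class TorchCollectiveSketch:
    def reduce_boolean_decision(self, decision: bool) -> bool:
        # DDP2 sets local_reduce=True: the decision is already node-local,
        # so it is returned unchanged instead of being synced across ranks.
        if self.local_reduce:
            return decision
        # Otherwise every rank contributes 0 or 1; the votes are SUM-reduced and
        # the result is True only if the sum equals world_size (i.e. unanimity).
        vote = torch.tensor(int(decision), device=self.device)
        total = self.reduce(vote, reduce_op=ReduceOp.SUM)
        return bool(total == self.world_size)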

pytorch_lightning/plugins/training_type/ddp.py

Lines changed: 12 additions & 41 deletions
@@ -31,9 +31,10 @@
 
 import pytorch_lightning as pl
 from pytorch_lightning.core.optimizer import LightningOptimizer
-from pytorch_lightning.distributed import LightningDistributed
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.distributed import prepare_for_backward
+from pytorch_lightning.plugins.collective.collective_plugin import Collective
+from pytorch_lightning.plugins.collective.torch_collective import TorchCollective
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
@@ -48,13 +49,7 @@
     rank_zero_deprecation,
     rank_zero_warn,
 )
-from pytorch_lightning.utilities.distributed import (
-    distributed_available,
-    init_ddp_connection,
-    rank_zero_only,
-    ReduceOp,
-    sync_ddp_if_available,
-)
+from pytorch_lightning.utilities.distributed import init_ddp_connection, rank_zero_only
 from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException
 from pytorch_lightning.utilities.seed import reset_seed
 from pytorch_lightning.utilities.types import STEP_OUTPUT
@@ -91,6 +86,7 @@ def __init__(
         num_nodes: Optional[int] = None,
         cluster_environment: Optional[ClusterEnvironment] = None,
         checkpoint_io: Optional[CheckpointIO] = None,
+        collective: Optional[Collective] = None,
         sync_batchnorm: Optional[bool] = None,
         ddp_comm_state: Optional[object] = None,
         ddp_comm_hook: Optional[callable] = None,
@@ -102,6 +98,7 @@
             parallel_devices=parallel_devices,
             cluster_environment=cluster_environment,
             checkpoint_io=checkpoint_io,
+            collective=collective or TorchCollective(),
         )
         self.interactive_ddp_procs = []
         if num_nodes is not None:
@@ -116,7 +113,6 @@
             " Notice that it will be overriden by the trainer setting."
         )
         self._sync_batchnorm = sync_batchnorm or False
-        self.dist = LightningDistributed()
         self.num_processes = len(self.parallel_devices) if self.parallel_devices is not None else 0
         self._ddp_kwargs = kwargs
         self._task_idx = None
@@ -267,8 +263,10 @@ def setup_distributed(self):
         init_ddp_connection(self.cluster_environment, self.torch_distributed_backend)
 
         # set the ranks and devices
-        self.dist.rank = self.global_rank
-        self.dist.device = self.root_device
+        self.collective.rank = self.global_rank
+        self.collective.device = self.root_device
+        self.collective.device_id = self.determine_ddp_device_ids()
+        self.collective.world_size = self.world_size
 
     def _check_can_spawn_children(self):
         if self.local_rank != 0:
@@ -389,17 +387,6 @@ def pre_dispatch(self):
     def post_dispatch(self, trainer: "pl.Trainer") -> None:
         self.cluster_environment.teardown()
 
-    def barrier(self, *args, **kwargs) -> None:
-        if not distributed_available():
-            return
-        if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
-            torch.distributed.barrier(device_ids=self.determine_ddp_device_ids())
-        else:
-            torch.distributed.barrier()
-
-    def broadcast(self, obj: object, src: int = 0) -> object:
-        return self.dist.broadcast(obj)
-
     def pre_backward(self, closure_loss: torch.Tensor) -> None:
         """Run before precision plugin executes backward."""
         if not self.lightning_module.automatic_optimization:
@@ -408,22 +395,6 @@ def pre_backward(self, closure_loss: torch.Tensor) -> None:
     def model_to_device(self):
         self.model.to(self.root_device)
 
-    def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor:
-        """Reduces a tensor from several distributed processes to one aggregated tensor.
-
-        Args:
-            tensor: the tensor to sync and reduce
-            group: the process group to gather results from. Defaults to all processes (world)
-            reduce_op: the reduction operation. Defaults to 'mean'/'avg'.
-                Can also be a string 'sum' to calculate the sum during reduction.
-
-        Return:
-            reduced value, except when the input was not a tensor the output remains is unchanged
-        """
-        if isinstance(tensor, torch.Tensor):
-            tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op)
-        return tensor
-
     def training_step(self, *args, **kwargs) -> Optional[Any]:
         return self.model(*args, **kwargs)
 
@@ -465,15 +436,15 @@ def _share_information_to_prevent_deadlock(self):
         sync_dirs = []
         global_node_rank_zero = 0
         for _ in range(self.num_nodes):
-            sync_dirs.append(self.broadcast(self._sync_dir, global_node_rank_zero))
+            sync_dirs.append(self.collective.broadcast(self._sync_dir, global_node_rank_zero))
             global_node_rank_zero += self.world_size // self.num_nodes
 
         self._sync_dir = sync_dirs[self.node_rank]
 
     def _share_pids(self):
         """Make all DDP processes aware of all processes pids."""
-        self.barrier()
-        pids = self.all_gather(torch.tensor(os.getpid(), device=self.root_device))
+        self.collective.barrier()
+        pids = self.collective.all_gather(torch.tensor(os.getpid(), device=self.root_device))
         pids = pids.cpu().numpy().tolist()
         self._pids = pids if isinstance(pids, list) else [pids]
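
Taken together, DDPPlugin stops implementing barrier/broadcast/reduce itself: it accepts (or constructs) a TorchCollective and hands it the distributed context once the process group is up. A condensed sketch of just those pieces, with everything unrelated elided; the class name is hypothetical and only the lines shown in this diff are assumed:

from typing import Optional

from pytorch_lightning.plugins.collective.collective_plugin import Collective
from pytorch_lightning.plugins.collective.torch_collective import TorchCollective
from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
from pytorch_lightning.utilities.distributed import init_ddp_connection


class DDPPluginSketch(ParallelPlugin):
    def __init__(self, collective: Optional[Collective] = None, **kwargs):
        # Default to the torch.distributed-backed implementation when none is given.
        super().__init__(collective=collective or TorchCollective(), **kwargs)

    def setup_distributed(self):
        init_ddp_connection(self.cluster_environment, self.torch_distributed_backend)
        # The collective needs the rank/device/world-size context to run
        # barrier, broadcast, all_gather and reduce on the plugin's behalf.
        self.collective.rank = self.global_rank
        self.collective.device = self.root_device
        self.collective.device_id = self.determine_ddp_device_ids()
        self.collective.world_size = self.world_size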

pytorch_lightning/plugins/training_type/ddp2.py

Lines changed: 1 addition & 23 deletions
@@ -11,11 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import torch
-
 from pytorch_lightning.plugins.training_type.ddp import DDPPlugin
-from pytorch_lightning.utilities.apply_func import apply_to_collection
-from pytorch_lightning.utilities.types import _METRIC_COLLECTION
 
 
 class DDP2Plugin(DDPPlugin):
@@ -33,25 +29,7 @@ def setup(self) -> None:
         # set the task idx
         self.task_idx = self.cluster_environment.local_rank()
         # the difference to DDP is that we don't call children processes here
-
-    def reduce(self, collection: _METRIC_COLLECTION, *args, **kwargs) -> _METRIC_COLLECTION:
-        """Reduces a collection of tensors from all processes. It can be applied to just a single tensor. In DDP2,
-        the reduction here is only across local devices within the node.
-
-        Args:
-            collection: The collection of tensors to sync and reduce.
-            *args: ignored for DDP2
-            **kwargs: ignored for DDP2
-
-        Return:
-            Reduced tensor values or the same value if it was not or did not contain a tensor.
-        """
-
-        def mean(t: torch.Tensor) -> torch.Tensor:
-            original_dtype = t.dtype
-            return t.float().mean().to(original_dtype)
-
-        return apply_to_collection(collection, torch.Tensor, mean)
+        self.collective.local_reduce = True
 
     @property
     def root_device(self):
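
The removed DDP2-specific reduce override is what local_reduce now stands in for: with the flag set, TorchCollective skips the cross-rank sync in reduce_boolean_decision (see torch_collective.py above), matching DDP2's node-local reduction model. A minimal sketch of the flag's effect; TorchCollective's no-argument construction and the local_reduce attribute are taken from the diffs above, and this is an illustration rather than a test from the repository:

from pytorch_lightning.plugins.collective.torch_collective import TorchCollective

collective = TorchCollective()
collective.local_reduce = True  # what DDP2Plugin.setup() now does

# With the flag set, reduce_boolean_decision returns the per-node decision unchanged
# instead of SUM-reducing it across the world, mirroring DDP2's node-local reduce.
assert collective.reduce_boolean_decision(True) is True
assert collective.reduce_boolean_decision(False) is False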
