
Commit 558956d

2/n Consolidate collective functions - collective base and subclasses
1 parent: ca90f68

5 files changed (+37 lines added, -18 lines removed)

pytorch_lightning/plugins/collective/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from pytorch_lightning.plugins.collective.collective_plugin import Collective  # noqa: F401
-from pytorch_lightning.plugins.collective.single_device_collective import SingleNodeCollective  # noqa: F401
-from pytorch_lightning.plugins.collective.torch_collective import TorchCollective  # noqa: F401
 from pytorch_lightning.plugins.collective.horovod_collective import HorovodCollective  # noqa: F401
+from pytorch_lightning.plugins.collective.single_device_collective import SingleDeviceCollective  # noqa: F401
+from pytorch_lightning.plugins.collective.torch_collective import TorchCollective  # noqa: F401
 from pytorch_lightning.plugins.collective.tpu_collective import TPUCollective  # noqa: F401
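
The re-exports with `# noqa: F401` exist so that callers import every collective from the package root instead of individual modules. A minimal sketch of how downstream code picks up the renamed class after this change (assumes a checkout of this branch, not a released version):

# Hypothetical usage sketch: import the collectives from the package root,
# where the renamed SingleDeviceCollective is now re-exported.
from pytorch_lightning.plugins.collective import (
    Collective,
    SingleDeviceCollective,
    TorchCollective,
)

# Any code still importing SingleNodeCollective would now raise ImportError,
# so call sites must be updated together with this rename.
assert issubclass(SingleDeviceCollective, Collective)
assert issubclass(TorchCollective, Collective)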

pytorch_lightning/plugins/collective/horovod_collective.py

Lines changed: 5 additions & 5 deletions
@@ -31,19 +31,19 @@ class HorovodCollective(Collective):
     def __init__(
         self,
         on_gpu: Optional[bool] = False,
-        local_rank: Optional[int] = 0,
+        local_rank: int = 0,
     ):
-        self._on_gpu = on_gpu
-        self._local_rank = local_rank
+        self.on_gpu = on_gpu
+        self.local_rank = local_rank
 
     def join(self) -> None:
         """Horovod function that indicates that the rank finished processing data.
 
         All ranks that did not call join() continue to process allreduce operations. This function blocks Python thread
         until all ranks join.
         """
-        if self._on_gpu:
-            hvd.join(self.local_rank)
+        if self.on_gpu:
+            hvd.join(self.local_rank)
         else:
             hvd.join()
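
A minimal usage sketch of the changed constructor, assuming Horovod with PyTorch support is installed and initialised; the attributes are now public, so callers and tests can read them directly instead of the old _on_gpu/_local_rank names:

import torch
import horovod.torch as hvd

from pytorch_lightning.plugins.collective import HorovodCollective

hvd.init()  # Horovod must be initialised before any collective call

# local_rank is now a plain int (no Optional); on_gpu stays Optional[bool].
collective = HorovodCollective(on_gpu=torch.cuda.is_available(), local_rank=hvd.local_rank())
print(collective.on_gpu, collective.local_rank)

# join() forwards the local rank to hvd.join() only on GPU, so the device
# owning this rank can be released correctly; otherwise it calls hvd.join().
collective.join()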

pytorch_lightning/plugins/collective/single_device_collective.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 from pytorch_lightning.plugins.collective import Collective
 
 
-class SingleNodeCollective(Collective):
+class SingleDeviceCollective(Collective):
     """Collective interface for single device training type plugins."""
 
     def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None:
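
The class body is unchanged by this commit; only the name moves from "node" to "device", matching the single-device plugin naming. For illustration only (not the file's actual contents), a single-device collective typically reduces every operation to a no-op, roughly like this:

from typing import Any, Optional

import torch


class SingleDeviceCollectiveSketch:
    """Illustrative stand-in: with one process, every collective is a no-op."""

    def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None:
        # Nothing to synchronise with, so return immediately.
        pass

    def broadcast(self, obj: Any, src: int = 0) -> Any:
        # The only process already holds the object.
        return obj

    def all_gather(self, tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        # Gathering across a single process is the tensor itself.
        return tensor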

pytorch_lightning/plugins/collective/torch_collective.py

Lines changed: 16 additions & 7 deletions
@@ -29,7 +29,14 @@
 class TorchCollective(Collective):
     """Collective interface for DDP, DDPSpawn, DP and DDP2."""
 
-    def __init__(self, local_reduce: bool = False, rank=None, device=None):
+    def __init__(
+        self,
+        local_reduce: bool = False,
+        rank: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = torch.device("cpu"),
+        device_id: Optional[int] = None,
+        world_size: int = 1,
+    ):
         """.. note::
 
             DDP and DDPSpawn sync accross multiple nodes/devices, local_reduce = False
@@ -38,19 +45,21 @@ def __init__(self, local_reduce: bool = False, rank=None, device=None):
 
             local_reduce set in Plugins.setup() functions
         """
-        self._local_reduce = local_reduce
-        self._rank = rank
-        self._device = device
+        self.local_reduce = local_reduce
+        self.rank = rank
+        self.device = device
+        self.device_id = device_id
+        self.world_size = world_size
 
     def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None:
         if not distributed_available():
             return
         if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl":
-            torch.distributed.barrier(device_ids=self.determine_ddp_device_ids())
+            torch.distributed.barrier(device_ids=self.device_id)
         else:
             torch.distributed.barrier()
 
-    def broadcast(self, obj: object, src: int = 0) -> object:
+    def broadcast(self, obj: Any, src: int = 0) -> Any:
         if not distributed_available():
             return obj
         else:
@@ -97,7 +106,7 @@ def mean(t: torch.Tensor) -> torch.Tensor:
             return tensor
 
     def reduce_boolean_decision(self, decision: bool) -> bool:
-        decision = torch.tensor(int(decision), device=self.lightning_module.device)
+        decision = torch.tensor(int(decision), device=self.device)
         decision = self.reduce(decision, reduce_op=ReduceOp.SUM)
         decision = bool(decision == self.world_size)
         return decision
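
The reduce_boolean_decision change makes the collective self-contained: it now uses its own device and world_size rather than reaching into lightning_module. A standalone sketch of the same logic written against raw torch.distributed (illustrative function name, assumes the default process group is already initialised):

import torch
import torch.distributed as dist


def reduce_boolean_decision_sketch(decision: bool, device: torch.device, world_size: int) -> bool:
    """All ranks agree only if every rank voted True, i.e. the SUM equals world_size."""
    tensor = torch.tensor(int(decision), device=device)   # encode the vote as 0 or 1
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)          # sum the votes across ranks
    return bool(tensor.item() == world_size)

In the same spirit, barrier() now takes the NCCL device_ids value from the new device_id constructor argument instead of recomputing it per call via determine_ddp_device_ids().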

pytorch_lightning/plugins/collective/tpu_collective.py

Lines changed: 13 additions & 3 deletions
@@ -25,12 +25,22 @@
     import torch_xla.core.xla_model as xm
     from torch_xla.core.xla_model import rendezvous
 else:
-    xm, rendezvous = [None] * 4
+    xm, rendezvous = [None] * 2
 
 
 class TPUCollective(Collective):
     """Collective interface for TPU and TPUSpawning training type plugins."""
 
+    def __init__(
+        self,
+        device: Union[str, torch.device] = torch.device("xla"),
+        root_device: torch.device = xm.xla_device(),
+        world_size: int = xm.xrt_world_size(),
+    ):
+        self.device = device
+        self.root_device = root_device
+        self.world_size = world_size
+
     def barrier(self, name: Optional[str] = None) -> None:
         if self.is_distributed:
             rendezvous(name)
@@ -59,11 +69,11 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra
         """
         if isinstance(tensor, torch.Tensor) and tensor.dim() == 0:
             tensor = tensor.unsqueeze(0)
-        return self._xm.all_gather(tensor)
+        return xm.all_gather(tensor)
 
     def reduce(self, output: Any, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None) -> Any:
         if not isinstance(output, torch.Tensor):
-            output = torch.tensor(output, device=self.lightning_module.device)
+            output = torch.tensor(output, device=self.device)
 
         _invalid_reduce_op = isinstance(reduce_op, ReduceOp) and reduce_op != ReduceOp.SUM
         _invalid_reduce_op_str = isinstance(reduce_op, str) and reduce_op.lower() not in ("sum", "mean", "avg")
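
The fallback fix on the guarded import matters because unpacking two names from a four-element list raises at import time whenever XLA is unavailable. A small self-contained illustration of why [None] * 2 is the correct fallback:

# Unpacking fallbacks: the left-hand side must match the list length exactly.
xm, rendezvous = [None] * 2      # OK: two names, two values
try:
    xm, rendezvous = [None] * 4  # ValueError: too many values to unpack (expected 2)
except ValueError as err:
    print(err)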
