
Commit d4d1970

awaelchli, carmocca, and kaushikb11 authored
Add SyncBatchNormPlugin (#11754)
Co-authored-by: Carlos Mocholí <[email protected]>
Co-authored-by: Kaushik B <[email protected]>
1 parent f3c2a13 commit d4d1970

File tree

12 files changed: +196, -89 lines

CHANGELOG.md
Lines changed: 10 additions & 0 deletions

@@ -140,6 +140,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for pluggable Accelerators ([#12030](https://github.com/PyTorchLightning/pytorch-lightning/pull/12030))
 
 
+- Added `LayerSync` and `NativeSyncBatchNorm` plugins ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754))
+
+
+
 ### Changed
 
 - Make `benchmark` flag optional and set its value based on the deterministic flag ([#11944](https://github.com/PyTorchLightning/pytorch-lightning/pull/11944))
@@ -629,6 +633,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
 
+- Removed `configure_sync_batchnorm` from `ParallelStrategy` and all other strategies that inherit from it ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754))
+
+
+- Removed public attribute `sync_batchnorm` from strategies ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754))
+
+
 ### Fixed
 
 - Fixed an issue where `ModelCheckpoint` could delete older checkpoints when `dirpath` has changed during resumed training ([#12045](https://github.com/PyTorchLightning/pytorch-lightning/pull/12045))

docs/source/api_references.rst
Lines changed: 15 additions & 0 deletions

@@ -237,6 +237,21 @@ Checkpoint IO Plugins
     TorchCheckpointIO
     XLACheckpointIO
 
+
+Other Plugins
+^^^^^^^^^^^^^
+
+.. currentmodule:: pytorch_lightning.plugins
+
+.. autosummary::
+    :toctree: api
+    :nosignatures:
+    :template: classtemplate.rst
+
+    LayerSync
+    NativeSyncBatchNorm
+
+
 Profiler API
 ------------
 

pytorch_lightning/plugins/__init__.py
Lines changed: 4 additions & 1 deletion

@@ -4,6 +4,7 @@
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO
 from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO
+from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm
 from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin
 from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin
@@ -31,7 +32,7 @@
 from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin
 from pytorch_lightning.strategies import Strategy
 
-PLUGIN = Union[Strategy, PrecisionPlugin, ClusterEnvironment, CheckpointIO]
+PLUGIN = Union[Strategy, PrecisionPlugin, ClusterEnvironment, CheckpointIO, LayerSync]
 PLUGIN_INPUT = Union[PLUGIN, str]
 
 __all__ = [
@@ -63,4 +64,6 @@
     "ParallelPlugin",
     "DDPShardedPlugin",
     "DDPSpawnShardedPlugin",
+    "LayerSync",
+    "NativeSyncBatchNorm",
 ]
pytorch_lightning/plugins/layer_sync.py (new file, inferred from the imports above)
Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+
+import torch
+from torch.nn import Module
+
+
+class LayerSync(ABC):
+    """Abstract base class for creating plugins that wrap layers of a model with synchronization logic for
+    multiprocessing."""
+
+    @abstractmethod
+    def apply(self, model: Module) -> Module:
+        """Override this method to apply synchronization to the layers of this model."""
+
+    @abstractmethod
+    def revert(self, model: Module) -> Module:
+        """Override this method to undo all modifications made in :meth:`apply`."""
+
+
+class NativeSyncBatchNorm(LayerSync):
+    """A plugin that wraps all batch normalization layers of a model with synchronization logic for
+    multiprocessing.
+
+    This plugin has no effect in single-device operation.
+    """
+
+    def apply(self, model: Module) -> Module:
+        """Add global batchnorm for a model spread across multiple GPUs and nodes.
+
+        Override this method to synchronize batchnorm layers between specific process groups instead
+        of the whole world.
+
+        Args:
+            model: Reference to the current LightningModule
+
+        Return:
+            LightningModule with batchnorm layers synchronized within the process groups.
+        """
+        return torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+    def revert(self, model: Module) -> Module:
+        """Convert the wrapped batchnorm layers back to regular batchnorm layers.
+
+        Args:
+            model: Reference to the current LightningModule
+
+        Return:
+            LightningModule with regular batchnorm layers that will no longer sync across processes.
+        """
+        # Code adapted from https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547
+        # Original author: Kapil Yedidi (@kapily)
+        converted_module = model
+        if isinstance(model, torch.nn.modules.batchnorm.SyncBatchNorm):
+            # Unfortunately, LayerSync does not store the original class - if it did
+            # we could return the one that was originally created.
+            converted_module = _BatchNormXd(
+                model.num_features, model.eps, model.momentum, model.affine, model.track_running_stats
+            )
+            if model.affine:
+                with torch.no_grad():
+                    converted_module.weight = model.weight
+                    converted_module.bias = model.bias
+            converted_module.running_mean = model.running_mean
+            converted_module.running_var = model.running_var
+            converted_module.num_batches_tracked = model.num_batches_tracked
+            if hasattr(model, "qconfig"):
+                converted_module.qconfig = model.qconfig
+        for name, child in model.named_children():
+            converted_module.add_module(name, self.revert(child))
+        del model
+        return converted_module
+
+
+class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm):
+    def _check_input_dim(self, input: torch.Tensor) -> None:
+        # The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc
+        # is this method that is overwritten by the subclass.
+        # Here, we are bypassing some tensor sanity checks and trusting that the user
+        # provides the right input dimensions at inference.
+        return
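
For context beyond the diff: a minimal usage sketch of the plugin added above, assuming a multi-GPU DDP run. Per the accelerator connector changes further down, `Trainer(sync_batchnorm=True)` now builds a `NativeSyncBatchNorm` internally, and the plugin can equally be passed explicitly.

    # Hedged sketch: enabling synchronized batchnorm through the new plugin.
    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import NativeSyncBatchNorm

    # Option 1: the existing Trainer flag, now backed by the plugin under the hood.
    trainer = Trainer(gpus=2, strategy="ddp", sync_batchnorm=True)

    # Option 2: pass the plugin explicitly (equivalent to the flag).
    trainer = Trainer(gpus=2, strategy="ddp", plugins=[NativeSyncBatchNorm()])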

pytorch_lightning/strategies/ddp.py
Lines changed: 5 additions & 6 deletions

@@ -42,7 +42,7 @@
     _TORCH_GREATER_EQUAL_1_9,
     _TORCH_GREATER_EQUAL_1_10,
 )
-from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available
+from pytorch_lightning.utilities.distributed import distributed_available
 from pytorch_lightning.utilities.distributed import group as _group
 from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.exceptions import DeadlockDetectedException
@@ -86,7 +86,6 @@ def __init__(
         )
         log.detail(f"{self.__class__.__name__}: initializing DDP plugin")
         self._num_nodes = 1
-        self.sync_batchnorm = False
         self._ddp_kwargs = kwargs
         self._ddp_comm_state = ddp_comm_state
         self._ddp_comm_hook = ddp_comm_hook
@@ -145,8 +144,8 @@ def setup(self, trainer: "pl.Trainer") -> None:
         # move the model to the correct device
         self.model_to_device()
 
-        if self.sync_batchnorm:
-            self.model = self.configure_sync_batchnorm(self.model)
+        if self._layer_sync:
+            self.model = self._layer_sync.apply(self.model)
 
         # skip wrapping the model if we are not fitting as no gradients need to be exchanged
         trainer_fn = trainer.state.fn
@@ -422,8 +421,8 @@ def teardown(self) -> None:
         if isinstance(self.model, DistributedDataParallel):
             self.model = self.lightning_module
 
-        if self.sync_batchnorm:
-            self.model = _revert_sync_batchnorm(self.model)
+        if self._layer_sync:
+            self.model = self._layer_sync.revert(self.model)
 
         if self.root_device.type == "cuda":
             # GPU teardown
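
For orientation: the lifecycle these strategy changes establish. The connector injects `_layer_sync` into the strategy, `apply()` runs in `setup()` before the model is wrapped in DDP, and `revert()` runs in `teardown()`. A rough standalone sketch of that round trip on a plain torch model:

    # Hedged sketch of the apply/revert round trip outside of any Trainer.
    import torch
    from pytorch_lightning.plugins import NativeSyncBatchNorm

    model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8), torch.nn.ReLU())
    layer_sync = NativeSyncBatchNorm()

    model = layer_sync.apply(model)   # BatchNorm2d layers become SyncBatchNorm (what setup() does)
    # ... distributed training would happen here ...
    model = layer_sync.revert(model)  # converted back to regular batchnorm (what teardown() does)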

pytorch_lightning/strategies/ddp_spawn.py
Lines changed: 5 additions & 6 deletions

@@ -30,7 +30,7 @@
 from pytorch_lightning.strategies.parallel import ParallelStrategy
 from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8
-from pytorch_lightning.utilities.distributed import _revert_sync_batchnorm, distributed_available
+from pytorch_lightning.utilities.distributed import distributed_available
 from pytorch_lightning.utilities.distributed import group as _group
 from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn
@@ -69,7 +69,6 @@ def __init__(
             precision_plugin=precision_plugin,
         )
         self._num_nodes = 1
-        self.sync_batchnorm = False
         self._ddp_kwargs = kwargs
         self._ddp_comm_state = ddp_comm_state
         self._ddp_comm_hook = ddp_comm_hook
@@ -116,8 +115,8 @@ def setup(self, trainer: "pl.Trainer") -> None:
         # move the model to the correct device
         self.model_to_device()
 
-        if self.sync_batchnorm:
-            self.model = self.configure_sync_batchnorm(self.model)
+        if self._layer_sync:
+            self.model = self._layer_sync.apply(self.model)
 
         # skip wrapping the model if we are not fitting as no gradients need to be exchanged
         trainer_fn = self.lightning_module.trainer.state.fn
@@ -269,8 +268,8 @@ def teardown(self) -> None:
         if isinstance(self.model, DistributedDataParallel):
             self.model = self.lightning_module
 
-        if self.sync_batchnorm:
-            self.model = _revert_sync_batchnorm(self.model)
+        if self._layer_sync:
+            self.model = self._layer_sync.revert(self.model)
 
         if self.root_device.type == "cuda":
             # GPU teardown

pytorch_lightning/strategies/fully_sharded.py
Lines changed: 2 additions & 2 deletions

@@ -139,8 +139,8 @@ def setup(self, trainer: "pl.Trainer") -> None:
         self.setup_precision_plugin()
         optimizers_to_device(self.optimizers, self.root_device)
 
-        if self.sync_batchnorm:
-            self.model = self.configure_sync_batchnorm(self.model)
+        if self._layer_sync:
+            self.model = self._layer_sync.apply(self.model)
 
         self.configure_ddp()
         self.barrier()

pytorch_lightning/strategies/parallel.py
Lines changed: 2 additions & 15 deletions

@@ -21,6 +21,7 @@
 
 import pytorch_lightning as pl
 from pytorch_lightning.overrides.base import unwrap_lightning_module
+from pytorch_lightning.plugins import LayerSync
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.precision import PrecisionPlugin
@@ -42,6 +43,7 @@ def __init__(
         super().__init__(accelerator=accelerator, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin)
         self.parallel_devices = parallel_devices
         self.cluster_environment = cluster_environment
+        self._layer_sync: Optional[LayerSync] = None
 
     @property
     @abstractmethod
@@ -105,21 +107,6 @@ def torch_distributed_backend(self):
         torch_backend = "nccl" if self.root_device.type == "cuda" else "gloo"
         return torch_backend
 
-    @staticmethod
-    def configure_sync_batchnorm(model: "pl.LightningModule") -> "pl.LightningModule":
-        """Add global batchnorm for a model spread across multiple GPUs and nodes.
-
-        Override to synchronize batchnorm between specific process groups instead
-        of the whole world or use a different sync_bn like `apex`'s version.
-
-        Args:
-            model: pointer to current :class:`LightningModule`.
-
-        Return:
-            LightningModule with batchnorm layers synchronized between process groups
-        """
-        return torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
-
     @contextmanager
     def block_backward_sync(self):
         """Blocks ddp sync gradients behaviour on backwards pass.

pytorch_lightning/trainer/connectors/accelerator_connector.py
Lines changed: 13 additions & 3 deletions

@@ -46,6 +46,7 @@
     SLURMEnvironment,
     TorchElasticEnvironment,
 )
+from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm
 from pytorch_lightning.strategies import (
     DDP2Strategy,
     DDPFullyShardedStrategy,
@@ -150,7 +151,6 @@ def __init__(
         # TODO: move to gpu accelerator
         torch.backends.cudnn.benchmark = self.benchmark
         self.replace_sampler_ddp = replace_sampler_ddp
-        self.sync_batchnorm = sync_batchnorm
         self._init_deterministic(deterministic)
 
         # 1. Parsing flags
@@ -169,6 +169,7 @@
         self._precision_plugin_flag: Optional[PrecisionPlugin] = None
         self._cluster_environment_flag: Optional[Union[ClusterEnvironment, str]] = None
         self._parallel_devices: List[Union[int, torch.device]] = []
+        self._layer_sync: Optional[LayerSync] = NativeSyncBatchNorm() if sync_batchnorm else None
         self.checkpoint_io: Optional[CheckpointIO] = None
         self._amp_type_flag: Optional[LightningEnum] = None
         self._amp_level_flag: Optional[str] = amp_level
@@ -180,6 +181,7 @@
             plugins=plugins,
             amp_type=amp_type,
             amp_level=amp_level,
+            sync_batchnorm=sync_batchnorm,
         )
         self._check_device_config_and_set_final_flags(
             devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores
@@ -230,6 +232,7 @@ def _check_config_and_set_final_flags(
         plugins: Optional[Union[PLUGIN_INPUT, List[PLUGIN_INPUT]]],
         amp_type: str,
         amp_level: Optional[str],
+        sync_batchnorm: bool,
     ) -> None:
         """This method checks:
 
@@ -317,6 +320,13 @@ def _check_config_and_set_final_flags(
                 self.checkpoint_io = plugin
             elif isinstance(plugin, ClusterEnvironment):
                 self._cluster_environment_flag = plugin
+            elif isinstance(plugin, LayerSync):
+                if sync_batchnorm and not isinstance(plugin, NativeSyncBatchNorm):
+                    raise MisconfigurationException(
+                        f"You set `Trainer(sync_batchnorm=True)` and provided a `{plugin.__class__.__name__}`"
+                        " plugin, but this is not allowed. Choose one or the other."
+                    )
+                self._layer_sync = plugin
             else:
                 raise MisconfigurationException(
                     f"Found invalid type for plugin {plugin}. Expected a precision plugin or training strategy."
@@ -715,8 +725,8 @@ def _lazy_init_strategy(self) -> None:
         self.strategy.parallel_devices = self._parallel_devices
         if hasattr(self.strategy, "num_nodes"):
            self.strategy._num_nodes = self._num_nodes_flag
-        if hasattr(self.strategy, "sync_batchnorm"):
-            self.strategy.sync_batchnorm = self.sync_batchnorm
+        if hasattr(self.strategy, "_layer_sync"):
+            self.strategy._layer_sync = self._layer_sync
         if hasattr(self.strategy, "set_world_ranks"):
             self.strategy.set_world_ranks()
         self.strategy._configure_launcher()
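
Reading the conflict check above: combining `sync_batchnorm=True` with a `LayerSync` plugin that is not a `NativeSyncBatchNorm` raises a `MisconfigurationException`, while a matching plugin is accepted. `SomeOtherLayerSync` below is a hypothetical stand-in:

    # Hedged illustration of the allowed/forbidden combinations handled above.
    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import NativeSyncBatchNorm

    Trainer(sync_batchnorm=True, plugins=[NativeSyncBatchNorm()])   # accepted: the plugin matches the flag
    # Trainer(sync_batchnorm=True, plugins=[SomeOtherLayerSync()])  # rejected with a MisconfigurationException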
