@@ -26,7 +26,8 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
class ColossalAIStrategy(DDPStrategy):
"""ColossalAI strategy.
- It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or `colossalai.nn.optimizer.HybridAdam`_ now.
+ It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or
+ `colossalai.nn.optimizer.HybridAdam`_ now.
You must initialize your model in ``configure_sharded_model()``.

It configures accelerator and precision, and you should not configure them when initializing ``Trainer``.
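For context, a minimal sketch of what the docstring above asks for: the model is built inside ``configure_sharded_model()`` and a ColossalAI optimizer is returned from ``configure_optimizers()``. This is illustrative only and not part of this diff; the module name and layer sizes are made up.

```python
# Illustrative sketch only (not part of this diff): defer model construction to
# configure_sharded_model() and return one of the two supported optimizers.
import torch
import pytorch_lightning as pl
from colossalai.nn.optimizer import HybridAdam


class MyLitModel(pl.LightningModule):
    def configure_sharded_model(self) -> None:
        # Created here so ColossalAI can shard the parameters at initialization time.
        self.model = torch.nn.Sequential(
            torch.nn.Linear(1024, 4096),
            torch.nn.ReLU(),
            torch.nn.Linear(4096, 1024),
        )

    def configure_optimizers(self):
        # Must be colossalai.nn.optimizer.CPUAdam or HybridAdam (see the docstring above).
        return HybridAdam(self.model.parameters(), lr=1e-3)
```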
@@ -48,32 +49,38 @@ def on_load_checkpoint(self, checkpoint) -> None:
It can speed up training, but slightly more memory will be used. Defaults to True.
chunk_size (Optional[int], optional): The size of a chunk.
It will be ignored when ``use_chunk=False``.
- If it's None, a best chunk size will be searched out based on ``chunk_search_range``, ``chunk_search_n_grids`` and ``min_chunk_size``.
+ If it's None, a best chunk size will be searched out based on ``chunk_search_range``,
+ ``chunk_search_n_grids`` and ``min_chunk_size``.
Defaults to None.
enable_distributed_storage (bool, optional): Whether to storage model in a distributed manner.
It reduces memory from 1 to 1/N, but it may slow down training.
Defaults to True.
placement_policy (str, optional): It can be "cpu", "cuda" and "auto".
- If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
+ If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU,
+ which means min CUDA memory will be used.
If it's "cuda", they won't be offloaded, which means max CUDA memory will be used. It's the fastest.
- If it's "auto", they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
+ If it's "auto", they are moving dynamically based on CPU and CUDA memory usage.
+ It will utilize heterogeneous memory space evenly and well.
Note that "auto" policy can only work well when no other processes use CUDA during your training.
Defaults to 'auto'.
force_outputs_fp32 (bool, optional): Whether to cast outputs to fp32. Defaults to False.
- gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
- which will be used by optimizer.
+ gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+ which will be used by optimizer.
This argument will be ignored when ``placement_policy`` is not "auto".
Defaults to 0.0.
chunk_search_range (int, optional): The range of chunk size to search.
- The actual search range will be from ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
+ The actual search range will be from
+ ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
Defaults to 64*1024**2.
chunk_search_n_grids (int, optional): The number of intervals in the search range. Defaults to 1024.
min_chunk_size (Optional[int], optional): The minimum size for a chunk. Defaults to None.
initial_scale (float, optional): The initial dynamic loss scale value. Defaults to 2**32.
min_scale (float, optional): The minimum dynamic loss scaling value. Defaults to 1.
growth_factor (float, optional): The multiplication factor for increasing loss scale. Defaults to 2.
backoff_factor (float, optional): The multiplication factor for decreasing loss scale. Defaults to 0.5.
- growth_interval (int, optional): The number of steps to increase loss scale when no overflow occurs. Defaults to 1000.
+ growth_interval (int, optional):
+ The number of steps to increase loss scale when no overflow occurs.
+ Defaults to 1000.
hysteresis (int, optional): The number of overflows before decreasing loss scale. Defaults to 2.
max_scale (float, optional): The maximum dynamic loss scaling value. Defaults to 2**32.
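As a usage sketch for the arguments documented above (illustrative only, not part of this diff; the import path and the specific argument values are assumptions):

```python
# Illustrative sketch only: wiring the documented arguments into a Trainer.
import pytorch_lightning as pl
from pytorch_lightning.strategies import ColossalAIStrategy

strategy = ColossalAIStrategy(
    use_chunk=True,              # chunk-based memory management
    chunk_size=None,             # None -> a chunk size is searched from the ranges above
    enable_distributed_storage=True,
    placement_policy="auto",     # "cpu", "cuda" or "auto" (see above)
    gpu_margin_mem_ratio=0.0,    # only used when placement_policy="auto"
    initial_scale=2**32,         # dynamic loss-scaling knobs
    growth_interval=1000,
)

# Per the docstring, do not pass `accelerator` or `precision` to the Trainer;
# the strategy configures them itself.
trainer = pl.Trainer(strategy=strategy, devices=4)
```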
@@ -150,8 +157,8 @@ def setup_precision_plugin(self) -> None:
super().setup_precision_plugin()
assert len(self.optimizers) == 1, 'ColossalAIStrategy only supports single Optimizer now.'
optimizer = self.optimizers[0]
- assert isinstance(optimizer, (CPUAdam, HybridAdam)
- ), 'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam now'
+ assert isinstance(optimizer, (CPUAdam, HybridAdam)), \
+     'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam.'
if self.use_chunk:
    chunk_size = self.chunk_size or ChunkManager.search_chunk_size(self.model, **self.chunk_size_search_kwargs)
else:
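For context on the chunk-size search that ``ChunkManager.search_chunk_size`` performs here, below is a toy illustration of the search range described in the docstring. This is not ColossalAI's implementation; the helper name and the example sizes are made up.

```python
# Toy illustration (not ColossalAI's implementation) of the documented search range:
# candidates run from max(min_chunk_size, max_param_size) to that value plus
# chunk_search_range, split into chunk_search_n_grids intervals.
def candidate_chunk_sizes(max_param_size: int,
                          min_chunk_size: int,
                          chunk_search_range: int = 64 * 1024**2,
                          chunk_search_n_grids: int = 1024) -> list:
    start = max(min_chunk_size, max_param_size)
    step = chunk_search_range // chunk_search_n_grids
    return [start + i * step for i in range(chunk_search_n_grids + 1)]


# Example: with a 48 MiB largest parameter and a 32 MiB floor (both hypothetical),
# the candidates span 48 MiB .. 112 MiB in 64 KiB steps.
sizes = candidate_chunk_sizes(max_param_size=48 * 1024**2, min_chunk_size=32 * 1024**2)
print(len(sizes), sizes[0], sizes[-1])
```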