
Commit 5742d32

polish colossalai plugin's code (#2)
Polish the ColossalAI plugin's code so that it passes the pre-commit hook checks.
1 parent d78eb95 · commit 5742d32

File tree

3 files changed (+24, -11 lines)


src/pytorch_lightning/plugins/precision/colossalai.py

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,7 @@
 from torch import Tensor
 from torch.optim import Optimizer
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class ColossalAIPrecisionPlugin(PrecisionPlugin):
@@ -19,4 +20,9 @@ def clip_grad_by_norm(self, optimizer: Optimizer, clip_val: Union[int, float]) -
     def optimizer_step(self, model, optimizer, optimizer_idx: int, closure, **kwargs: Any) -> Any:
         closure_result = closure()
         self._after_closure(model, optimizer, optimizer_idx)
+        skipped_backward = closure_result is None
+        if isinstance(model, pl.LightningModule) and model.automatic_optimization and skipped_backward:
+            raise MisconfigurationException(
+                "Skipping backward by returning `None` from your `training_step` is not supported by `Colossalai`"
+            )
         optimizer.step()
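
The added guard is a behavioural change: under automatic optimization, a ``training_step`` that returns ``None`` (Lightning's usual way of skipping a batch) now raises ``MisconfigurationException`` rather than silently calling ``optimizer.step()``. A minimal sketch of a module that would trigger the new error when run with the ColossalAI strategy; the class name, layer sizes and learning rate are illustrative and assume colossalai is installed:

import torch
import pytorch_lightning as pl
from colossalai.nn.optimizer import HybridAdam  # optimizer family required by ColossalAIStrategy


class SkipEveryBatch(pl.LightningModule):
    """Illustrative module: returning None from training_step skips backward."""

    def configure_sharded_model(self):
        # Per the strategy docstring, layers must be created here rather than in __init__.
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        # Under automatic optimization with ColossalAIPrecisionPlugin, returning None
        # now raises MisconfigurationException instead of silently skipping the step.
        return None

    def configure_optimizers(self):
        # The strategy accepts a single CPUAdam/HybridAdam optimizer (see the asserts below).
        return HybridAdam(self.parameters(), lr=1e-3)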

src/pytorch_lightning/strategies/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@
 from pytorch_lightning.strategies.strategy import Strategy  # noqa: F401
 from pytorch_lightning.strategies.strategy_registry import call_register_strategies, StrategyRegistry  # noqa: F401
 from pytorch_lightning.strategies.tpu_spawn import TPUSpawnStrategy  # noqa: F401
-from pytorch_lightning.strategies.colossalai import ColossalAIStrategy
+from pytorch_lightning.strategies.colossalai import ColossalAIStrategy  # noqa: F401
 
 STRATEGIES_BASE_MODULE = "pytorch_lightning.strategies"
 
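
Keeping the import and marking it ``# noqa: F401`` declares it an intentional re-export, so the strategy remains importable from the package namespace. A minimal usage sketch under that assumption; per the strategy's docstring, accelerator and precision are configured by the strategy itself, so they are not passed to the ``Trainer`` here:

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import ColossalAIStrategy

# placement_policy is one of the documented constructor arguments; "auto" is its default.
trainer = Trainer(strategy=ColossalAIStrategy(placement_policy="auto"))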

src/pytorch_lightning/strategies/colossalai.py

Lines changed: 17 additions & 10 deletions
@@ -26,7 +26,8 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
 
 class ColossalAIStrategy(DDPStrategy):
     """ColossalAI strategy.
-    It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or `colossalai.nn.optimizer.HybridAdam`_ now.
+    It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or
+    `colossalai.nn.optimizer.HybridAdam`_ now.
     You must initialize your model in ``configure_sharded_model()``.
 
     It configures accelerator and precision, and you should not configure them when initializing ``Trainer``.
@@ -48,32 +49,38 @@ def on_load_checkpoint(self, checkpoint) -> None:
             It can speed up training, but slightly more memory will be used. Defaults to True.
         chunk_size (Optional[int], optional): The size of a chunk.
             It will be ignored when ``use_chunk=False``.
-            If it's None, a best chunk size will be searched out based on ``chunk_search_range``, ``chunk_search_n_grids`` and ``min_chunk_size``.
+            If it's None, a best chunk size will be searched out based on ``chunk_search_range``,
+            ``chunk_search_n_grids`` and ``min_chunk_size``.
             Defaults to None.
         enable_distributed_storage (bool, optional): Whether to storage model in a distributed manner.
             It reduces memory from 1 to 1/N, but it may slow down training.
             Defaults to True.
         placement_policy (str, optional): It can be "cpu", "cuda" and "auto".
-            If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
+            If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU,
+            which means min CUDA memory will be used.
             If it's "cuda", they won't be offloaded, which means max CUDA memory will be used. It's the fastest.
-            If it's "auto", they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
+            If it's "auto", they are moving dynamically based on CPU and CUDA memory usage.
+            It will utilize heterogeneous memory space evenly and well.
             Note that "auto" policy can only work well when no other processes use CUDA during your training.
             Defaults to 'auto'.
         force_outputs_fp32 (bool, optional): Whether to cast outputs to fp32. Defaults to False.
-        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
-            which will be used by optimizer.
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used by optimizer.
             This argument will be ignored when ``placement_policy`` is not "auto".
             Defaults to 0.0.
         chunk_search_range (int, optional): The range of chunk size to search.
-            The actual search range will be from ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
+            The actual search range will be from
+            ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
             Defaults to 64*1024**2.
         chunk_search_n_grids (int, optional): The number of intervals in the search range. Defaults to 1024.
         min_chunk_size (Optional[int], optional): The minimum size for a chunk. Defaults to None.
         initial_scale (float, optional): The initial dynamic loss scale value. Defaults to 2**32.
         min_scale (float, optional): The minimum dynamic loss scaling value. Defaults to 1.
         growth_factor (float, optional): The multiplication factor for increasing loss scale. Defaults to 2.
         backoff_factor (float, optional): The multiplication factor for decreasing loss scale. Defaults to 0.5.
-        growth_interval (int, optional): The number of steps to increase loss scale when no overflow occurs. Defaults to 1000.
+        growth_interval (int, optional):
+            The number of steps to increase loss scale when no overflow occurs.
+            Defaults to 1000.
         hysteresis (int, optional): The number of overflows before decreasing loss scale. Defaults to 2.
         max_scale (float, optional): The maximum dynamic loss scaling value. Defaults to 2**32.
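
The parameter docstring above (chunking, placement and dynamic loss scaling) maps directly onto the strategy's constructor. A hedged configuration sketch using only names documented there; the concrete values are placeholders, not recommendations, and colossalai is assumed to be installed:

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import ColossalAIStrategy

strategy = ColossalAIStrategy(
    use_chunk=True,
    chunk_size=None,                 # None -> search a size via chunk_search_range / chunk_search_n_grids / min_chunk_size
    chunk_search_range=64 * 1024**2,
    chunk_search_n_grids=1024,
    min_chunk_size=32 * 1024**2,
    enable_distributed_storage=True,
    placement_policy="auto",         # "cpu" | "cuda" | "auto"
    gpu_margin_mem_ratio=0.5,        # only honoured when placement_policy="auto"
    force_outputs_fp32=False,
    initial_scale=2**32,
    growth_factor=2.0,
    backoff_factor=0.5,
    growth_interval=1000,
    hysteresis=2,
)
trainer = Trainer(strategy=strategy)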
@@ -150,8 +157,8 @@ def setup_precision_plugin(self) -> None:
         super().setup_precision_plugin()
         assert len(self.optimizers) == 1, 'ColossalAIStrategy only supports single Optimizer now.'
         optimizer = self.optimizers[0]
-        assert isinstance(optimizer, (CPUAdam, HybridAdam)
-                          ), 'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam now'
+        assert isinstance(optimizer, (CPUAdam, HybridAdam)), \
+            'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam.'
         if self.use_chunk:
             chunk_size = self.chunk_size or ChunkManager.search_chunk_size(self.model, **self.chunk_size_search_kwargs)
         else:
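
The reformatted assertion spells out the contract checked during setup: exactly one optimizer, and it must be colossalai's CPUAdam or HybridAdam. A brief sketch of ``configure_optimizers`` against that contract; the class name is illustrative and the other hooks are as in the earlier sketch:

import pytorch_lightning as pl
from colossalai.nn.optimizer import HybridAdam


class MyColossalModule(pl.LightningModule):
    # configure_sharded_model / training_step omitted; see the sketch under the
    # precision-plugin diff above.

    def configure_optimizers(self):
        # Satisfies both asserts: exactly one optimizer, of a supported class.
        return HybridAdam(self.parameters(), lr=1e-3)

        # Would trip the first assert (more than one optimizer):
        #     return [CPUAdam(...), HybridAdam(...)]
        # Would trip the second assert (unsupported optimizer class):
        #     return torch.optim.Adam(self.parameters(), lr=1e-3)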
