@@ -26,7 +26,8 @@ def _post_init_method(self, module: torch.nn.Module, *args, **kwargs):
class ColossalAIStrategy(DDPStrategy):
"""ColossalAI strategy.
- It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or `colossalai.nn.optimizer.HybridAdam`_ now.
+ It only supports single optimizer which must be `colossalai.nn.optimizer.CPUAdam`_ or
+ `colossalai.nn.optimizer.HybridAdam`_ now.
You must initialize your model in ``configure_sharded_model()``.

It configures accelerator and precision, and you should not configure them when initializing ``Trainer``.
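For context, a minimal sketch of what the docstring above asks for: the model is built inside ``configure_sharded_model()`` and a ColossalAI optimizer is returned from ``configure_optimizers()``. This is illustrative only and not part of this diff; the module name and layer sizes are made up.

```python
# Illustrative sketch only (not part of this diff): defer model construction to
# configure_sharded_model() and return one of the two supported optimizers.
import torch
import pytorch_lightning as pl
from colossalai.nn.optimizer import HybridAdam


class MyLitModel(pl.LightningModule):
    def configure_sharded_model(self) -> None:
        # Created here so ColossalAI can shard the parameters at initialization time.
        self.model = torch.nn.Sequential(
            torch.nn.Linear(1024, 4096),
            torch.nn.ReLU(),
            torch.nn.Linear(4096, 1024),
        )

    def configure_optimizers(self):
        # Must be colossalai.nn.optimizer.CPUAdam or HybridAdam (see the docstring above).
        return HybridAdam(self.model.parameters(), lr=1e-3)
```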
@@ -48,32 +49,38 @@ def on_load_checkpoint(self, checkpoint) -> None:
It can speed up training, but slightly more memory will be used. Defaults to True.
chunk_size (Optional[int], optional): The size of a chunk.
It will be ignored when ``use_chunk=False``.
- If it's None, a best chunk size will be searched out based on ``chunk_search_range``, ``chunk_search_n_grids`` and ``min_chunk_size``.
+ If it's None, a best chunk size will be searched out based on ``chunk_search_range``,
+ ``chunk_search_n_grids`` and ``min_chunk_size``.
Defaults to None.
enable_distributed_storage (bool, optional): Whether to storage model in a distributed manner.
It reduces memory from 1 to 1/N, but it may slow down training.
Defaults to True.
placement_policy (str, optional): It can be "cpu", "cuda" and "auto".
- If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
+ If it's "cpu", parameters, gradients and optimizer states will be offloaded to CPU,
+ which means min CUDA memory will be used.
If it's "cuda", they won't be offloaded, which means max CUDA memory will be used. It's the fastest.
- If it's "auto", they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
+ If it's "auto", they are moving dynamically based on CPU and CUDA memory usage.
+ It will utilize heterogeneous memory space evenly and well.
Note that "auto" policy can only work well when no other processes use CUDA during your training.
Defaults to 'auto'.
force_outputs_fp32 (bool, optional): Whether to cast outputs to fp32. Defaults to False.
- gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
- which will be used by optimizer.
+ gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+ which will be used by optimizer.
This argument will be ignored when ``placement_policy`` is not "auto".
Defaults to 0.0.
chunk_search_range (int, optional): The range of chunk size to search.
- The actual search range will be from ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
+ The actual search range will be from
+ ``max(min_chunk_size, max_param_size)`` to ``max(min_chunk_size, max_param_size) + chunk_search_range``.
Defaults to 64*1024**2.
chunk_search_n_grids (int, optional): The number of intervals in the search range. Defaults to 1024.
min_chunk_size (Optional[int], optional): The minimum size for a chunk. Defaults to None.
initial_scale (float, optional): The initial dynamic loss scale value. Defaults to 2**32.
min_scale (float, optional): The minimum dynamic loss scaling value. Defaults to 1.
growth_factor (float, optional): The multiplication factor for increasing loss scale. Defaults to 2.
backoff_factor (float, optional): The multiplication factor for decreasing loss scale. Defaults to 0.5.
- growth_interval (int, optional): The number of steps to increase loss scale when no overflow occurs. Defaults to 1000.
+ growth_interval (int, optional):
+ The number of steps to increase loss scale when no overflow occurs.
+ Defaults to 1000.
hysteresis (int, optional): The number of overflows before decreasing loss scale. Defaults to 2.
max_scale (float, optional): The maximum dynamic loss scaling value. Defaults to 2**32.
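As a usage sketch for the arguments documented above (illustrative only, not part of this diff; the import path and the specific argument values are assumptions):

```python
# Illustrative sketch only: wiring the documented arguments into a Trainer.
import pytorch_lightning as pl
from pytorch_lightning.strategies import ColossalAIStrategy

strategy = ColossalAIStrategy(
    use_chunk=True,              # chunk-based memory management
    chunk_size=None,             # None -> a chunk size is searched from the ranges above
    enable_distributed_storage=True,
    placement_policy="auto",     # "cpu", "cuda" or "auto" (see above)
    gpu_margin_mem_ratio=0.0,    # only used when placement_policy="auto"
    initial_scale=2**32,         # dynamic loss-scaling knobs
    growth_interval=1000,
)

# Per the docstring, do not pass `accelerator` or `precision` to the Trainer;
# the strategy configures them itself.
trainer = pl.Trainer(strategy=strategy, devices=4)
```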
@@ -150,8 +157,8 @@ def setup_precision_plugin(self) -> None:
super().setup_precision_plugin()
assert len(self.optimizers) == 1, 'ColossalAIStrategy only supports single Optimizer now.'
optimizer = self.optimizers[0]
- assert isinstance(optimizer, (CPUAdam, HybridAdam)
- ), 'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam now'
+ assert isinstance(optimizer, (CPUAdam, HybridAdam)), \
+     'ColossalAIStrategy only supports colossalai.nn.optimizer.CPUAdam and colossalai.nn.optimizer.HybridAdam.'
if self.use_chunk:
    chunk_size = self.chunk_size or ChunkManager.search_chunk_size(self.model, **self.chunk_size_search_kwargs)
else:
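For context on the chunk-size search that ``ChunkManager.search_chunk_size`` performs here, below is a toy illustration of the search range described in the docstring. This is not ColossalAI's implementation; the helper name and the example sizes are made up.

```python
# Toy illustration (not ColossalAI's implementation) of the documented search range:
# candidates run from max(min_chunk_size, max_param_size) to that value plus
# chunk_search_range, split into chunk_search_n_grids intervals.
def candidate_chunk_sizes(max_param_size: int,
                          min_chunk_size: int,
                          chunk_search_range: int = 64 * 1024**2,
                          chunk_search_n_grids: int = 1024) -> list:
    start = max(min_chunk_size, max_param_size)
    step = chunk_search_range // chunk_search_n_grids
    return [start + i * step for i in range(chunk_search_n_grids + 1)]


# Example: with a 48 MiB largest parameter and a 32 MiB floor (both hypothetical),
# the candidates span 48 MiB .. 112 MiB in 64 KiB steps.
sizes = candidate_chunk_sizes(max_param_size=48 * 1024**2, min_chunk_size=32 * 1024**2)
print(len(sizes), sizes[0], sizes[-1])
```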