Commit 5e8829b

kaushikb11 and carmocca authored
(1/n) tests: Use strategy flag instead of accelerator for training strategies (#9931)
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent: e973bcb · commit: 5e8829b

48 files changed: 420 additions & 172 deletions
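
Every hunk in this commit is the same mechanical swap: the training strategy name (dp, ddp, ddp2, ddp_spawn, ddp_sharded_spawn, ...) moves from the overloaded `accelerator` argument to the dedicated `strategy` argument. A minimal before/after sketch of the Trainer call, assuming a Lightning version from this transition period in which both spellings are still accepted:

    from pytorch_lightning import Trainer

    # Before: the strategy string was passed through the `accelerator` flag
    trainer = Trainer(gpus=2, accelerator="ddp")

    # After: the strategy gets its own flag, as applied throughout this commit
    trainer = Trainer(gpus=2, strategy="ddp")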

.azure-pipelines/gpu-tests.yml

Lines changed: 4 additions & 4 deletions
@@ -106,10 +106,10 @@ jobs:
       set -e
       python -m pytest pl_examples -v --maxfail=2 --durations=0
       bash pl_examples/run_examples.sh --trainer.gpus=1
-      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=ddp
-      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=ddp --trainer.precision=16
-      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=dp
-      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.accelerator=dp --trainer.precision=16
+      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=ddp
+      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=ddp --trainer.precision=16
+      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=dp
+      bash pl_examples/run_examples.sh --trainer.gpus=2 --trainer.strategy=dp --trainer.precision=16
     env:
       PL_USE_MOCKED_MNIST: "1"
     displayName: 'Testing: examples'

benchmarks/test_sharded_parity.py

Lines changed: 2 additions & 2 deletions
@@ -137,15 +137,15 @@ def plugin_parity_test(
     ddp_model = model_cls()
     use_cuda = gpus > 0

-    trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, accelerator="ddp_spawn")
+    trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, strategy="ddp_spawn")

     max_memory_ddp, ddp_time = record_ddp_fit_model_stats(trainer=trainer, model=ddp_model, use_cuda=use_cuda)

     # Reset and train Custom DDP
     seed_everything(seed)
     custom_plugin_model = model_cls()

-    trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, accelerator="ddp_sharded_spawn")
+    trainer = Trainer(fast_dev_run=True, max_epochs=1, gpus=gpus, precision=precision, strategy="ddp_sharded_spawn")
     assert isinstance(trainer.training_type_plugin, DDPSpawnShardedPlugin)

     max_memory_custom, custom_model_time = record_ddp_fit_model_stats(
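
The added assertion above pins the plugin type that the `ddp_sharded_spawn` string resolves to. The same resolution can be checked in isolation; a minimal sketch, assuming fairscale is installed so the sharded plugin is importable and selectable:

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import DDPSpawnShardedPlugin

    # Strategy string -> training-type plugin resolution happens at Trainer init.
    trainer = Trainer(fast_dev_run=True, strategy="ddp_sharded_spawn")
    assert isinstance(trainer.training_type_plugin, DDPSpawnShardedPlugin)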

tests/accelerators/test_accelerator_connector.py

Lines changed: 264 additions & 0 deletions
@@ -717,3 +717,267 @@ def test_validate_precision_type(tmpdir, precision):
 def test_amp_level_raises_error_with_native(tmpdir):
     with pytest.raises(MisconfigurationException, match="not supported with `amp_backend='native'`"):
         _ = Trainer(default_root_dir=tmpdir, gpus=1, amp_level="O2", amp_backend="native", precision=16)
+
+
+def test_strategy_choice_ddp_spawn_cpu(tmpdir):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2)
+    assert isinstance(trainer.accelerator, CPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_strategy_choice_ddp(cuda_available_mock, device_count_mock):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1)
+    assert isinstance(trainer.accelerator, GPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", gpus=1)
+    assert isinstance(trainer.accelerator, GPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@RunIf(min_gpus=2)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "SLURM_PROCID": "1",
+        "SLURM_LOCALID": "1",
+    },
+)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_slurm(setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=2)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "SLURM_PROCID": "1",
+        "SLURM_LOCALID": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp2_slurm(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDP2Plugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "WORLD_SIZE": "2",
+        "LOCAL_WORLD_SIZE": "2",
+        "RANK": "1",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "WORLD_SIZE": "2",
+        "LOCAL_WORLD_SIZE": "2",
+        "RANK": "1",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp2_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDP2Plugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@mock.patch.dict(
+    os.environ, {"WORLD_SIZE": "2", "LOCAL_WORLD_SIZE": "2", "RANK": "1", "LOCAL_RANK": "1", "GROUP_RANK": "0"}
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0",
+        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
+        "MASTER_ADDR": "1.2.3.4",
+        "MASTER_PORT": "500",
+        "WORLD_SIZE": "20",
+        "RANK": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=1)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 0
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@mock.patch.dict(
+    os.environ,
+    {
+        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
+        "MASTER_ADDR": "1.2.3.4",
+        "MASTER_PORT": "500",
+        "WORLD_SIZE": "20",
+        "RANK": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_kubeflow(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 0
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@mock.patch.dict(
+    os.environ,
+    {
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "LOCAL_RANK": "0",
+        "SLURM_PROCID": "0",
+        "SLURM_LOCALID": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
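
The hunk starts at line 717, so the file's import block is not shown. For reference, the names these new tests rely on resolve roughly as follows in the test-suite layout of this era; a sketch, not the literal import block from the file:

    import os
    from unittest import mock

    import pytest

    from pytorch_lightning import Trainer
    from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator
    from pytorch_lightning.callbacks import Callback
    from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPSpawnPlugin
    from pytorch_lightning.plugins.environments import (
        KubeflowEnvironment,
        LightningEnvironment,
        SLURMEnvironment,
        TorchElasticEnvironment,
    )
    from tests.helpers.boring_model import BoringModel
    from tests.helpers.runif import RunIf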

tests/accelerators/test_common.py

Lines changed: 2 additions & 2 deletions
@@ -27,8 +27,8 @@
     "trainer_kwargs",
     (
         pytest.param(dict(gpus=1), marks=RunIf(min_gpus=1)),
-        pytest.param(dict(accelerator="dp", gpus=2), marks=RunIf(min_gpus=2)),
-        pytest.param(dict(accelerator="ddp_spawn", gpus=2), marks=RunIf(min_gpus=2)),
+        pytest.param(dict(strategy="dp", gpus=2), marks=RunIf(min_gpus=2)),
+        pytest.param(dict(strategy="ddp_spawn", gpus=2), marks=RunIf(min_gpus=2)),
     ),
 )
 def test_evaluate(tmpdir, trainer_kwargs):

tests/accelerators/test_ddp.py

Lines changed: 3 additions & 3 deletions
@@ -85,7 +85,7 @@ def test_torch_distributed_backend_env_variables(tmpdir):
     with patch.dict(os.environ, _environ), patch("torch.cuda.device_count", return_value=2):
         with pytest.raises(ValueError, match="Invalid backend: 'undefined'"):
             model = BoringModel()
-            trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="ddp", gpus=2, logger=False)
+            trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="ddp", gpus=2, logger=False)
             trainer.fit(model)


@@ -103,7 +103,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             raise SystemExit()

     model = TestModel()
-    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="ddp", gpus=1)
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy="ddp", gpus=1)
     with pytest.raises(SystemExit):
         trainer.fit(model)

@@ -144,7 +144,7 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")
         default_root_dir=tmpdir,
         fast_dev_run=True,
         precision=precision,
-        accelerator="ddp",
+        strategy="ddp",
         gpus=2,
         callbacks=CustomCallback(),
     )

tests/accelerators/test_ddp_spawn.py

Lines changed: 3 additions & 3 deletions
@@ -33,7 +33,7 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
         limit_train_batches=10,
         limit_val_batches=10,
         gpus=[0, 1],
-        accelerator="ddp_spawn",
+        strategy="ddp_spawn",
     )

     dm = ClassifDataModule()
@@ -51,7 +51,7 @@ def test_multi_gpu_model_ddp_spawn(tmpdir):
         limit_train_batches=10,
         limit_val_batches=10,
         gpus=[0, 1],
-        accelerator="ddp_spawn",
+        strategy="ddp_spawn",
         enable_progress_bar=False,
     )

@@ -78,7 +78,7 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
         limit_train_batches=0.2,
         limit_val_batches=0.2,
         gpus=[0, 1],
-        accelerator="ddp_spawn",
+        strategy="ddp_spawn",
     )
     trainer.fit(model, **fit_options)
     assert trainer.state.finished, "DDP doesn't work with dataloaders passed to fit()."
