@@ -717,3 +717,267 @@ def test_validate_precision_type(tmpdir, precision):
 def test_amp_level_raises_error_with_native(tmpdir):
     with pytest.raises(MisconfigurationException, match="not supported with `amp_backend='native'`"):
         _ = Trainer(default_root_dir=tmpdir, gpus=1, amp_level="O2", amp_backend="native", precision=16)
+
+
+def test_strategy_choice_ddp_spawn_cpu(tmpdir):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2)
+    assert isinstance(trainer.accelerator, CPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_strategy_choice_ddp(cuda_available_mock, device_count_mock):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1)
+    assert isinstance(trainer.accelerator, GPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", gpus=1)
+    assert isinstance(trainer.accelerator, GPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@RunIf(min_gpus=2)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "SLURM_PROCID": "1",
+        "SLURM_LOCALID": "1",
+    },
+)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_slurm(setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=2)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "SLURM_PROCID": "1",
+        "SLURM_LOCALID": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp2_slurm(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDP2Plugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "WORLD_SIZE": "2",
+        "LOCAL_WORLD_SIZE": "2",
+        "RANK": "1",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "WORLD_SIZE": "2",
+        "LOCAL_WORLD_SIZE": "2",
+        "RANK": "1",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp2_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDP2Plugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@mock.patch.dict(
+    os.environ, {"WORLD_SIZE": "2", "LOCAL_WORLD_SIZE": "2", "RANK": "1", "LOCAL_RANK": "1", "GROUP_RANK": "0"}
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0",
+        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
+        "MASTER_ADDR": "1.2.3.4",
+        "MASTER_PORT": "500",
+        "WORLD_SIZE": "20",
+        "RANK": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=1)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 0
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
925
+
926
+
927
+ @mock .patch .dict (
928
+ os .environ ,
929
+ {
930
+ "KUBERNETES_PORT" : "tcp://127.0.0.1:443" ,
931
+ "MASTER_ADDR" : "1.2.3.4" ,
932
+ "MASTER_PORT" : "500" ,
933
+ "WORLD_SIZE" : "20" ,
934
+ "RANK" : "1" ,
935
+ },
936
+ )
937
+ @mock .patch ("torch.cuda.device_count" , return_value = 0 )
938
+ @mock .patch ("pytorch_lightning.plugins.DDPPlugin.setup_distributed" , autospec = True )
939
+ def test_strategy_choice_ddp_cpu_kubeflow (device_count_mock , setup_distributed_mock ):
940
+ class CB (Callback ):
941
+ def on_fit_start (self , trainer , pl_module ):
942
+ assert isinstance (trainer .accelerator , CPUAccelerator )
943
+ assert isinstance (trainer .training_type_plugin , DDPPlugin )
944
+ assert isinstance (trainer .training_type_plugin .cluster_environment , KubeflowEnvironment )
945
+ assert trainer .training_type_plugin .cluster_environment .local_rank () == 0
946
+ assert trainer .training_type_plugin .task_idx == 0
947
+ raise SystemExit ()
948
+
949
+ model = BoringModel ()
950
+ trainer = Trainer (fast_dev_run = True , strategy = "ddp_spawn" , num_processes = 2 , callbacks = [CB ()])
951
+
952
+ with pytest .raises (SystemExit ):
953
+ trainer .fit (model )
954
+
955
+
956
+ @mock .patch .dict (
957
+ os .environ ,
958
+ {
959
+ "SLURM_NTASKS" : "2" ,
960
+ "SLURM_JOB_NAME" : "SOME_NAME" ,
961
+ "SLURM_NODEID" : "0" ,
962
+ "LOCAL_RANK" : "0" ,
963
+ "SLURM_PROCID" : "0" ,
964
+ "SLURM_LOCALID" : "0" ,
965
+ },
966
+ )
967
+ @mock .patch ("torch.cuda.device_count" , return_value = 0 )
968
+ @mock .patch ("pytorch_lightning.plugins.DDPPlugin.setup_distributed" , autospec = True )
969
+ def test_strategy_choice_ddp_cpu_slurm (device_count_mock , setup_distributed_mock ):
970
+ class CB (Callback ):
971
+ def on_fit_start (self , trainer , pl_module ):
972
+ assert trainer .accelerator_connector .is_slurm_managing_tasks
973
+ assert isinstance (trainer .accelerator , CPUAccelerator )
974
+ assert isinstance (trainer .training_type_plugin , DDPPlugin )
975
+ assert isinstance (trainer .training_type_plugin .cluster_environment , SLURMEnvironment )
976
+ assert trainer .training_type_plugin .task_idx == 0
977
+ raise SystemExit ()
978
+
979
+ model = BoringModel ()
980
+ trainer = Trainer (fast_dev_run = True , strategy = "ddp_spawn" , num_processes = 2 , callbacks = [CB ()])
981
+
982
+ with pytest .raises (SystemExit ):
983
+ trainer .fit (model )