From 5725ec93aecf08b0e439637a69fc37bfad62e464 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 15 Oct 2021 03:10:47 +0200 Subject: [PATCH 1/7] Update accelerator connector messages after the addition of strategy --- .../connectors/accelerator_connector.py | 69 +++++++++---------- .../test_accelerator_connector.py | 8 +-- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7b86aab64130a..3c394bf20fce9 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -215,17 +215,19 @@ def select_accelerator_type(self) -> None: self._accelerator_type = DeviceType.CPU elif self.distributed_backend == DeviceType.TPU: if not self.has_tpu: - msg = "TPUs are not available" if not _TPU_AVAILABLE else "you didn't pass `tpu_cores` to `Trainer`" + msg = "TPUs are not available" if not _TPU_AVAILABLE else "you didn't pass `devices` to `Trainer`" raise MisconfigurationException(f"You passed `accelerator='tpu'`, but {msg}.") self._accelerator_type = DeviceType.TPU elif self.distributed_backend == DeviceType.IPU: if not self.has_ipu: - msg = "IPUs are not available" if not _IPU_AVAILABLE else "you didn't pass `ipus` to `Trainer`" + msg = "IPUs are not available" if not _IPU_AVAILABLE else "you didn't pass `devices` to `Trainer`" raise MisconfigurationException(f"You passed `accelerator='ipu'`, but {msg}.") self._accelerator_type = DeviceType.IPU elif self.distributed_backend == DeviceType.GPU: if not self.has_gpu: - msg = "you didn't pass `gpus` to `Trainer`" if torch.cuda.is_available() else "GPUs are not available" + msg = ( + "you didn't pass `devices` to `Trainer`" if torch.cuda.is_available() else "GPUs are not available" + ) raise MisconfigurationException(f"You passed `accelerator='gpu'`, but {msg}.") self._accelerator_type = DeviceType.GPU elif self.distributed_backend == DeviceType.CPU: @@ -240,14 +242,14 @@ def _validate_accelerator_and_devices(self) -> None: raise MisconfigurationException( f"You passed `devices={self.devices}` but haven't specified" " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping," - f" got `accelerator={self.distributed_backend}`." + f" got `accelerator={self.distributed_backend!r}`." ) def _validate_accelerator_type(self) -> None: if self._accelerator_type and self._accelerator_type != self._device_type: raise MisconfigurationException( - f"Mismatch between the requested accelerator type ({self._accelerator_type})" - f" and assigned device type ({self._device_type})." + f"Mismatch between the requested `strategy={self._accelerator_type.value!r}`" + f" and `accelerator={self._device_type.value!r}`." 
) self._accelerator_type = self._device_type @@ -255,25 +257,16 @@ def _warn_if_devices_flag_ignored(self) -> None: if self.devices is None: return devices_warning = f"The flag `devices={self.devices}` will be ignored, as you have set" - if self.distributed_backend == "auto": + if self.distributed_backend in ("auto", DeviceType.TPU): if self.tpu_cores is not None: rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`") - elif self.ipus is not None: - rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`") - elif self.gpus is not None: - rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") - elif self.num_processes != 1: - rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") - elif self.distributed_backend == DeviceType.TPU: - if self.tpu_cores is not None: - rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`") - elif self.distributed_backend == DeviceType.IPU: + elif self.distributed_backend in ("auto", DeviceType.IPU): if self.ipus is not None: rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`") - elif self.distributed_backend == DeviceType.GPU: + elif self.distributed_backend in ("auto", DeviceType.GPU): if self.gpus is not None: rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") - elif self.distributed_backend == DeviceType.CPU: + elif self.distributed_backend in ("auto", DeviceType.CPU): if self.num_processes != 1: rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") @@ -294,26 +287,27 @@ def _handle_accelerator_and_distributed_backend( ) -> None: if distributed_backend is not None: rank_zero_deprecation( - f"`Trainer(distributed_backend={distributed_backend})` has been deprecated and will be removed in v1.5." - f" Use `Trainer(strategy={distributed_backend})` instead." + f"`Trainer(distributed_backend={distributed_backend!r})` " + "has been deprecated and will be removed in v1.5." + f" Use `Trainer(strategy={distributed_backend!r})` instead." ) if self.strategy is not None: raise MisconfigurationException( - f"You have passed `Trainer(strategy={self.strategy})` but have" - f" also passed `Trainer(distributed_backend={distributed_backend})`." - f"HINT: Use just `Trainer(strategy={self.strategy})` instead." + f"You have passed `Trainer(strategy={self.strategy!r})` but have" + f" also passed `Trainer(distributed_backend={distributed_backend!r})`." + f" HINT: Use just `Trainer(strategy={self.strategy!r})` instead." ) if accelerator is not None and accelerator in list(DistributedType): rank_zero_deprecation( - f"Passing {accelerator} `strategy` to the `accelerator` flag in Trainer has been deprecated" - f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator})` instead." + f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated" + f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead." ) if self.strategy is not None: raise MisconfigurationException( - f"You have passed `Trainer(strategy={self.strategy})` but have" - f" also passed `Trainer(accelerator={accelerator})`." - f"HINT: Use just `Trainer(strategy={self.strategy})` instead." + f"You have passed `Trainer(strategy={self.strategy!r})` but have" + f" also passed `Trainer(accelerator={accelerator!r})`." + f" HINT: Use just `Trainer(strategy={self.strategy!r})` instead." 
) def _set_training_type_plugin(self) -> None: @@ -329,7 +323,7 @@ def handle_given_plugins(self) -> None: for plug in self.plugins: if self.strategy is not None and self._is_plugin_training_type(plug): raise MisconfigurationException( - f"You have passed `Trainer(strategy={self.strategy})`" + f"You have passed `Trainer(strategy={self.strategy!r})`" f" and you can only specify one training type plugin, but you have passed {plug} as a plugin." ) if self._is_plugin_training_type(plug): @@ -503,7 +497,7 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool: if accelerator == DeviceType.CPU: if not isinstance(self.devices, int): raise MisconfigurationException( - "The flag `devices` only supports integer for `accelerator='cpu'`," + "The flag `devices` must be an int with `accelerator='cpu'`," f" got `devices={self.devices}` instead." ) self.num_processes = self.devices @@ -816,7 +810,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): elif self.num_gpus > 1 and not _use_cpu: rank_zero_warn( "You requested multiple GPUs but did not specify a backend, e.g." - ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.' + ' `Trainer(strategy="dp"|"ddp"|"ddp2")`. Setting `strategy="ddp_spawn"` for you.' ) self.distributed_backend = DistributedType.DDP_SPAWN @@ -824,7 +818,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): if self.distributed_backend == DistributedType.DDP_CPU: if _TPU_AVAILABLE: raise MisconfigurationException( - "`accelerator='ddp_cpu'` is not supported on TPU machines. " + "`strategy='ddp_cpu'` is not supported on TPU machines. " "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) if self.num_processes == 1 and self.num_nodes > 1: @@ -833,7 +827,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): self._distrib_type = DistributedType.DDP_SPAWN if self.num_gpus > 0: rank_zero_warn( - "You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs." + "You requested one or more GPUs, but set `strategy='ddp_cpu'`. Training will not use GPUs." ) self.parallel_device_ids = None if self.num_processes is None: @@ -859,7 +853,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1): if self._distrib_type in (DistributedType.DP, DistributedType.DDP2): rank_zero_warn( - f"{self._distrib_type} is not supported on CPUs, hence setting the distributed type to `ddp`." + f"{self._distrib_type.value!r} is not supported on CPUs, hence setting `strategy='ddp'`." ) self._distrib_type = DistributedType.DDP else: @@ -887,8 +881,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): if self.num_nodes > 1 and not using_valid_distributed: # throw error to force user to choose a supported distributed type such as ddp or ddp2 raise MisconfigurationException( - "Your chosen distributed type does not support num_nodes > 1. " - "Please set accelerator=ddp or accelerator=ddp2." + "Your chosen strategy does not support `num_nodes > 1`. " "Please set `strategy=('ddp'|'ddp2')`." 
) def _set_horovod_backend(self): @@ -910,7 +903,7 @@ def check_interactive_compatibility(self): if _IS_INTERACTIVE and self._distrib_type is not None and not self._distrib_type.is_interactive_compatible(): raise MisconfigurationException( - f"Selected distributed backend {self._distrib_type} is not compatible with an interactive" + f"`Trainer(strategy={self._distrib_type.value!r})` is not compatible with an interactive" " environment. Run your code as a script, or choose one of the compatible backends:" f" {', '.join(DistributedType.interactive_compatible_types())}." " In case you are spawning processes yourself, make sure to include the Trainer" diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index aa165feb89e79..84524f364b2a7 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -447,10 +447,10 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.utilities._IS_INTERACTIVE", return_value=True) @mock.patch("torch.cuda.device_count", return_value=2) def test_ipython_incompatible_backend_error(*_): - with pytest.raises(MisconfigurationException, match="backend ddp is not compatible"): + with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)` is not compatible"): Trainer(accelerator="ddp", gpus=2) - with pytest.raises(MisconfigurationException, match="backend ddp2 is not compatible"): + with pytest.raises(MisconfigurationException, match=r"strategy='ddp2'\)` is not compatible"): Trainer(accelerator="ddp2", gpus=2) @@ -615,14 +615,14 @@ def test_set_devices_if_none_gpu(): def test_devices_with_cpu_only_supports_integer(): - with pytest.raises(MisconfigurationException, match="The flag `devices` only supports integer"): + with pytest.raises(MisconfigurationException, match="The flag `devices` must be an int"): Trainer(accelerator="cpu", devices="1,3") @pytest.mark.parametrize("training_type", ["ddp2", "dp"]) def test_unsupported_distrib_types_on_cpu(training_type): - with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting the distributed type to `ddp`."): + with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"): trainer = Trainer(accelerator=training_type, num_processes=2) assert trainer._distrib_type == DistributedType.DDP From 04663bc5425a63b54f92e1eb8f9ebbe0af800da8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 15 Oct 2021 03:12:57 +0200 Subject: [PATCH 2/7] Whitespace --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 3c394bf20fce9..4aba4380fc7ff 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -881,7 +881,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): if self.num_nodes > 1 and not using_valid_distributed: # throw error to force user to choose a supported distributed type such as ddp or ddp2 raise MisconfigurationException( - "Your chosen strategy does not support `num_nodes > 1`. " "Please set `strategy=('ddp'|'ddp2')`." + "Your chosen strategy does not support `num_nodes > 1`. Please set `strategy=('ddp'|'ddp2')`." 
) def _set_horovod_backend(self): From 9f5a9fda7de4c39cb54c9e7c98427c33f07ff969 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 15 Oct 2021 22:32:46 +0200 Subject: [PATCH 3/7] Undo change --- .../trainer/connectors/accelerator_connector.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4aba4380fc7ff..c4a377e23a6c3 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -215,19 +215,17 @@ def select_accelerator_type(self) -> None: self._accelerator_type = DeviceType.CPU elif self.distributed_backend == DeviceType.TPU: if not self.has_tpu: - msg = "TPUs are not available" if not _TPU_AVAILABLE else "you didn't pass `devices` to `Trainer`" + msg = "TPUs are not available" if not _TPU_AVAILABLE else "you didn't pass `tpu_cores` to `Trainer`" raise MisconfigurationException(f"You passed `accelerator='tpu'`, but {msg}.") self._accelerator_type = DeviceType.TPU elif self.distributed_backend == DeviceType.IPU: if not self.has_ipu: - msg = "IPUs are not available" if not _IPU_AVAILABLE else "you didn't pass `devices` to `Trainer`" + msg = "IPUs are not available" if not _IPU_AVAILABLE else "you didn't pass `ipus` to `Trainer`" raise MisconfigurationException(f"You passed `accelerator='ipu'`, but {msg}.") self._accelerator_type = DeviceType.IPU elif self.distributed_backend == DeviceType.GPU: if not self.has_gpu: - msg = ( - "you didn't pass `devices` to `Trainer`" if torch.cuda.is_available() else "GPUs are not available" - ) + msg = "you didn't pass `gpus` to `Trainer`" if torch.cuda.is_available() else "GPUs are not available" raise MisconfigurationException(f"You passed `accelerator='gpu'`, but {msg}.") self._accelerator_type = DeviceType.GPU elif self.distributed_backend == DeviceType.CPU: From f46d05d5cefd1dc9b5d55f5f8e50830c26da7d32 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sun, 17 Oct 2021 16:41:00 +0200 Subject: [PATCH 4/7] Address review --- .../trainer/connectors/accelerator_connector.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c4a377e23a6c3..e6be66ba8df12 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -245,9 +245,10 @@ def _validate_accelerator_and_devices(self) -> None: def _validate_accelerator_type(self) -> None: if self._accelerator_type and self._accelerator_type != self._device_type: - raise MisconfigurationException( - f"Mismatch between the requested `strategy={self._accelerator_type.value!r}`" - f" and `accelerator={self._device_type.value!r}`." + # internal error: should not happen. + raise ValueError( + f"Mismatch between the requested accelerator type ({self._accelerator_type})" + f" and assigned device type ({self._device_type})." 
) self._accelerator_type = self._device_type From 1cd2da8750467feaf28c24194dd0d4a270b2223c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sun, 17 Oct 2021 16:42:59 +0200 Subject: [PATCH 5/7] Address review --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index e6be66ba8df12..fddcb49173c32 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -902,7 +902,8 @@ def check_interactive_compatibility(self): if _IS_INTERACTIVE and self._distrib_type is not None and not self._distrib_type.is_interactive_compatible(): raise MisconfigurationException( - f"`Trainer(strategy={self._distrib_type.value!r})` is not compatible with an interactive" + f"`Trainer(strategy={self._distrib_type.value!r})` or" + f" `Trainer(accelerator={self._distrib_type.value!r})` is not compatible with an interactive" " environment. Run your code as a script, or choose one of the compatible backends:" f" {', '.join(DistributedType.interactive_compatible_types())}." " In case you are spawning processes yourself, make sure to include the Trainer" From aa2a08ae7acbfdc02451e8e353bec469b44ca41e Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sun, 17 Oct 2021 17:03:53 +0200 Subject: [PATCH 6/7] Update test --- tests/accelerators/test_accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 84524f364b2a7..dcda2f9779c6d 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -447,10 +447,10 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("pytorch_lightning.utilities._IS_INTERACTIVE", return_value=True) @mock.patch("torch.cuda.device_count", return_value=2) def test_ipython_incompatible_backend_error(*_): - with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)` is not compatible"): + with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): Trainer(accelerator="ddp", gpus=2) - with pytest.raises(MisconfigurationException, match=r"strategy='ddp2'\)` is not compatible"): + with pytest.raises(MisconfigurationException, match=r"strategy='ddp2'\)`.*is not compatible"): Trainer(accelerator="ddp2", gpus=2) From 8edb3b3653806bd162c243962b7caab20c2cd388 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 18 Oct 2021 02:39:06 +0200 Subject: [PATCH 7/7] Fix message --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 4 ++-- tests/accelerators/test_tpu.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index fddcb49173c32..a94f3a59399ae 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -817,7 +817,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): if self.distributed_backend == DistributedType.DDP_CPU: if _TPU_AVAILABLE: raise MisconfigurationException( - "`strategy='ddp_cpu'` is not supported on TPU machines. " + "`accelerator='ddp_cpu'` is not supported on TPU machines. 
" "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) if self.num_processes == 1 and self.num_nodes > 1: @@ -826,7 +826,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None): self._distrib_type = DistributedType.DDP_SPAWN if self.num_gpus > 0: rank_zero_warn( - "You requested one or more GPUs, but set `strategy='ddp_cpu'`. Training will not use GPUs." + "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." ) self.parallel_device_ids = None if self.num_processes is None: diff --git a/tests/accelerators/test_tpu.py b/tests/accelerators/test_tpu.py index df5444ac776a6..622ead614b1c7 100644 --- a/tests/accelerators/test_tpu.py +++ b/tests/accelerators/test_tpu.py @@ -222,7 +222,6 @@ def on_train_end(self, trainer, pl_module): @RunIf(tpu=True) def test_ddp_cpu_not_supported_on_tpus(): - with pytest.raises(MisconfigurationException, match="`accelerator='ddp_cpu'` is not supported on TPU machines"): Trainer(accelerator="ddp_cpu")