From 0a6237d297c892e03243b00be1ee607c55313bc7 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Wed, 23 Feb 2022 12:06:47 -0800
Subject: [PATCH 01/13] remove Accelerator.parallel_device_ids and deprecate
 Trainer.data_parallel_device_ids

---
 .../trainer/connectors/accelerator_connector.py     |  4 ----
 .../connectors/logger_connector/logger_connector.py |  8 +++++++-
 pytorch_lightning/trainer/trainer.py                | 12 ++++++++----
 tests/models/test_gpu.py                            |  7 ++++++-
 tests/trainer/test_trainer_cli.py                   |  3 ++-
 tests/utilities/test_cli.py                         |  3 ++-
 6 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index aa9b25c64e28c..a28f7cfd8939c 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -819,10 +819,6 @@ def num_ipus(self) -> int:
     def gpus(self) -> Optional[Union[List[int], str, int]]:
         return self._gpus
 
-    @property
-    def parallel_device_ids(self) -> List[int]:
-        return [i for i in range(len(self.parallel_devices))] if isinstance(self.accelerator, GPUAccelerator) else []
-
     @property
     def is_distributed(self) -> bool:
         # Used for custom plugins.
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 0de5a5f882925..8800df116b963 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -221,7 +221,13 @@ def _log_gpus_metrics(self) -> None:
                 self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
             else:
                 gpu_id = int(key.split("/")[0].split(":")[1])
-                if gpu_id in self.trainer._accelerator_connector.parallel_device_ids:
+                parallel_device_ids = []
+                if isinstance(self.trainer.accelerator, GPUAccelerator):
+                    if isinstance(self.trainer.strategy, ParallelStrategy):
+                        parallel_device_ids = [i for i in range(len(self.trainer.strategy.parallel_devices))]
+                    elif isinstance(self.strategy, SingleDeviceStrategy):
+                        parallel_device_ids = [0]
+                if gpu_id in parallel_device_ids:
                     self.trainer.lightning_module.log(
                         key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
                     )
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index f718e0f64ee43..da9af88824c48 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -58,7 +58,7 @@
     SimpleProfiler,
     XLAProfiler,
 )
-from pytorch_lightning.strategies import ParallelStrategy, Strategy
+from pytorch_lightning.strategies import ParallelStrategy, SingleDeviceStrategy, Strategy
 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy
 from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
 from pytorch_lightning.trainer.configuration_validator import verify_loop_configurations
@@ -2087,9 +2087,13 @@ def devices(self) -> int:
 
     @property
     def data_parallel_device_ids(self) -> Optional[List[int]]:
-        return (
-            self._accelerator_connector.parallel_device_ids if self._accelerator_connector.parallel_device_ids else None
-        )
+        rank_zero_deprecation("`Trainer.data_parallel_device_ids` was deprecated in v1.6.")
+        if isinstance(self.accelerator, GPUAccelerator):
+            if isinstance(self.strategy, ParallelStrategy):
+                return [i for i in range(len(self.strategy.parallel_devices))]
+            elif isinstance(self.strategy, SingleDeviceStrategy):
+                return [0]
+        return None
 
     @property
     def lightning_module(self) -> "pl.LightningModule":
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 719cfb43024b1..511322d9c0698 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -27,6 +27,7 @@
 from pytorch_lightning.utilities import device_parser
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY
+from pytorch_lightning.utilities.rank_zero import rank_zero_only
 from tests.helpers import BoringModel
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.imports import Batch, Dataset, Example, Field, LabelField
@@ -190,8 +191,12 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus
     sanitizing the gpus as only one of the GPUs is visible."""
     trainer = Trainer(gpus=gpus)
     assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment)
-    assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus)
     assert trainer.gpus == gpus
+    if rank_zero_only.rank == 0:
+        with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+            assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus)
+    else:
+        assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus)
 
 
 @RunIf(min_gpus=1)
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index 11b904870294b..fdaea98e7727d 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -178,7 +178,8 @@ def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_devic
     assert args.devices == expected_parsed
 
     trainer = Trainer.from_argparse_args(args)
-    assert trainer.data_parallel_device_ids == expected_device_ids
+    with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+        assert trainer.data_parallel_device_ids == expected_device_ids
 
 
 @pytest.mark.parametrize(
diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py
index 5da16737fc2d7..485836e9f7c32 100644
--- a/tests/utilities/test_cli.py
+++ b/tests/utilities/test_cli.py
@@ -192,7 +192,8 @@ def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu):
         args = parser.parse_args()
 
     trainer = Trainer.from_argparse_args(args)
-    assert trainer.data_parallel_device_ids == expected_gpu
+    with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+        assert trainer.data_parallel_device_ids == expected_gpu
 
 
 @pytest.mark.skipif(

From 71c5e0150fd4cfdde23e30d65ce248d6d80daa34 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Wed, 23 Feb 2022 14:41:03 -0800
Subject: [PATCH 02/13] rebase

---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 541b3dcc1d261..192bd5c517fd5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -533,6 +533,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `Trainer.num_gpus` in favor of `Trainer.num_devices` when GPU is used ([#12384](https://github.com/PyTorchLightning/pytorch-lightning/pull/12384))
 
 
+- Deprecated `Trainer.data_parallel_device_ids` in favor of `Trainer.device_ids` ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
+
+
 ### Removed
 
 - Removed deprecated parameter `method` in `pytorch_lightning.utilities.model_helpers.is_overridden` ([#10507](https://github.com/PyTorchLightning/pytorch-lightning/pull/10507))
@@ -720,7 +723,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed public attribute `sync_batchnorm` from strategies ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754))
 
+<<<<<<< HEAD
 - Removed `AcceleratorConnector.root_gpu` property ([#12262](https://github.com/PyTorchLightning/pytorch-lightning/pull/12262))
+=======
+- Removed deprecated `AcceleratorConnector.parallel_device_ids` property ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
+>>>>>>> c3e41378b (rebase)
 
 
 - Removed `AcceleratorConnector.num_gpus` property ([#12384](https://github.com/PyTorchLightning/pytorch-lightning/pull/12384))

From 0391920b273f10ca05776c4b359e0bcc9b86aa5f Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Wed, 23 Feb 2022 12:52:24 -0800
Subject: [PATCH 03/13] fix pre-commit

---
 .../trainer/connectors/logger_connector/logger_connector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 8800df116b963..b8d80be94f2df 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -19,6 +19,7 @@
 from pytorch_lightning.accelerators import GPUAccelerator
 from pytorch_lightning.loggers import LightningLoggerBase, TensorBoardLogger
 from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment
+from pytorch_lightning.strategies import ParallelStrategy, SingleDeviceStrategy
 from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.utilities import memory

From c1c527b3467f5d1a6b15668697681076f1aadc90 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Wed, 23 Feb 2022 13:02:59 -0800
Subject: [PATCH 04/13] use trainer.data_parallel_device_ids in logger_connector

---
 .../connectors/logger_connector/logger_connector.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index b8d80be94f2df..59b5f36564c03 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -19,7 +19,6 @@
 from pytorch_lightning.accelerators import GPUAccelerator
 from pytorch_lightning.loggers import LightningLoggerBase, TensorBoardLogger
 from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment
-from pytorch_lightning.strategies import ParallelStrategy, SingleDeviceStrategy
 from pytorch_lightning.trainer.connectors.logger_connector.result import _METRICS, _OUT_DICT, _PBAR_DICT
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.utilities import memory
@@ -222,13 +221,7 @@ def _log_gpus_metrics(self) -> None:
                 self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
             else:
                 gpu_id = int(key.split("/")[0].split(":")[1])
-                parallel_device_ids = []
-                if isinstance(self.trainer.accelerator, GPUAccelerator):
-                    if isinstance(self.trainer.strategy, ParallelStrategy):
-                        parallel_device_ids = [i for i in range(len(self.trainer.strategy.parallel_devices))]
-                    elif isinstance(self.strategy, SingleDeviceStrategy):
-                        parallel_device_ids = [0]
-                if gpu_id in parallel_device_ids:
+                if gpu_id in self.trainer.data_parallel_device_ids:
                     self.trainer.lightning_module.log(
                         key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
                     )

From f47990a0cac67210c39ce5005c0f66d68d103356 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Wed, 23 Feb 2022 13:20:52 -0800
Subject: [PATCH 05/13] fix tests

---
 .../connectors/logger_connector/logger_connector.py | 2 +-
 tests/callbacks/test_gpu_stats_monitor.py           | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 59b5f36564c03..69b5b158a4b4f 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -221,7 +221,7 @@ def _log_gpus_metrics(self) -> None:
                 self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
             else:
                 gpu_id = int(key.split("/")[0].split(":")[1])
-                if gpu_id in self.trainer.data_parallel_device_ids:
+                if self.trainer.data_parallel_device_ids and gpu_id in self.trainer.data_parallel_device_ids:
                     self.trainer.lightning_module.log(
                         key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
                     )
diff --git a/tests/callbacks/test_gpu_stats_monitor.py b/tests/callbacks/test_gpu_stats_monitor.py
index 1e3a9953bf137..5af24000e06ed 100644
--- a/tests/callbacks/test_gpu_stats_monitor.py
+++ b/tests/callbacks/test_gpu_stats_monitor.py
@@ -47,7 +47,8 @@ def test_gpu_stats_monitor(tmpdir):
         logger=logger,
     )
 
-    trainer.fit(model)
+    with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+        trainer.fit(model)
     assert trainer.state.finished, f"Training failed with {trainer.state}"
 
     path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
@@ -84,7 +85,9 @@ def test_gpu_stats_monitor_no_queries(tmpdir):
         devices=1,
         callbacks=[gpu_stats],
     )
-    with mock.patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics") as log_metrics_mock:
+    with mock.patch(
+        "pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics"
+    ) as log_metrics_mock, pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
         trainer.fit(model)
 
     assert log_metrics_mock.mock_calls[1:] == [

From e82870ec4e806b46126c913a7abf7156b903e5d6 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Thu, 24 Feb 2022 12:00:15 -0800
Subject: [PATCH 06/13] address comments

---
 CHANGELOG.md                                        | 2 ++
 pytorch_lightning/callbacks/gpu_stats_monitor.py    | 3 +--
 .../connectors/logger_connector/logger_connector.py | 2 +-
 pytorch_lightning/trainer/trainer.py                | 8 ++++----
 tests/models/test_gpu.py                            | 4 ++--
 tests/trainer/test_trainer_cli.py                   | 2 +-
 6 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 192bd5c517fd5..845afac58d71b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -368,6 +368,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - The strategies that support `sync_batchnorm` now only apply it when fitting ([#11919](https://github.com/PyTorchLightning/pytorch-lightning/pull/11919))
 
+- Changed `Trainer.data_parallel_device_ids` to return a list of GPU device indexes ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
+
 
 ### Deprecated
 
diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py
index 8fb92006708f7..78fe2f70a8e1a 100644
--- a/pytorch_lightning/callbacks/gpu_stats_monitor.py
+++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py
@@ -133,8 +133,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O
         )
 
         # The logical device IDs for selected devices
-        # ignoring mypy check because `trainer.data_parallel_device_ids` is None when using CPU
-        self._device_ids = sorted(set(trainer.data_parallel_device_ids))  # type: ignore
+        self._device_ids = sorted(set(trainer.data_parallel_device_ids))
 
         # The unmasked real GPU IDs
         self._gpu_ids = self._get_gpu_ids(self._device_ids)
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 69b5b158a4b4f..59b5f36564c03 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -221,7 +221,7 @@ def _log_gpus_metrics(self) -> None:
                 self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
             else:
                 gpu_id = int(key.split("/")[0].split(":")[1])
-                if self.trainer.data_parallel_device_ids and gpu_id in self.trainer.data_parallel_device_ids:
+                if gpu_id in self.trainer.data_parallel_device_ids:
                     self.trainer.lightning_module.log(
                         key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
                     )
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index da9af88824c48..92024ccc29167 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -2086,14 +2086,14 @@ def devices(self) -> int:
         return self.num_devices
 
     @property
-    def data_parallel_device_ids(self) -> Optional[List[int]]:
-        rank_zero_deprecation("`Trainer.data_parallel_device_ids` was deprecated in v1.6.")
+    def data_parallel_device_ids(self) -> List[int]:
+        rank_zero_deprecation("`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8.")
         if isinstance(self.accelerator, GPUAccelerator):
             if isinstance(self.strategy, ParallelStrategy):
-                return [i for i in range(len(self.strategy.parallel_devices))]
+                return [device.index for device in self.strategy.parallel_devices]
             elif isinstance(self.strategy, SingleDeviceStrategy):
                 return [0]
-        return None
+        return []
 
     @property
     def lightning_module(self) -> "pl.LightningModule":
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 511322d9c0698..6252b601b2870 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -185,7 +185,7 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun
 )
 @mock.patch("torch.cuda.device_count", return_value=1)
 @mock.patch("torch.cuda.is_available", return_value=True)
-@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"])
+@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0", [0, 2]])
 def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus):
     """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit
     sanitizing the gpus as only one of the GPUs is visible."""
@@ -194,7 +194,7 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus
     assert trainer.gpus == gpus
     if rank_zero_only.rank == 0:
         with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
-            assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus)
+            assert (trainer.data_parallel_device_ids or None) == device_parser.parse_gpu_ids(gpus)
     else:
         assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus)
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index fdaea98e7727d..b84fb4424dcb7 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -162,7 +162,7 @@ def test_argparse_args_parsing_fast_dev_run(cli_args, expected):
 
 @pytest.mark.parametrize(
     ["cli_args", "expected_parsed", "expected_device_ids"],
-    [("", None, None), ("--accelerator gpu --devices 1", "1", [0]), ("--accelerator gpu --devices 0,", "0,", [0])],
+    [("", None, []), ("--accelerator gpu --devices 1", "1", [0]), ("--accelerator gpu --devices 0,", "0,", [])],
 )
 def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_device_ids, monkeypatch):
     """Test multi type argument with bool."""

From ac12427693df82b6ce0cfc1d9c3e076acba62cbd Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Thu, 24 Feb 2022 17:56:59 -0800
Subject: [PATCH 07/13] update changelog

---
 CHANGELOG.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 845afac58d71b..3d28e20402a3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -725,16 +725,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed public attribute `sync_batchnorm` from strategies ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754))
 
-<<<<<<< HEAD
 - Removed `AcceleratorConnector.root_gpu` property ([#12262](https://github.com/PyTorchLightning/pytorch-lightning/pull/12262))
-=======
-- Removed deprecated `AcceleratorConnector.parallel_device_ids` property ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
->>>>>>> c3e41378b (rebase)
 
 
 - Removed `AcceleratorConnector.num_gpus` property ([#12384](https://github.com/PyTorchLightning/pytorch-lightning/pull/12384))
 
 
+- Removed `AcceleratorConnector.parallel_device_ids` property ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
+
+
 ### Fixed
 
 - Fixed an issue where `ModelCheckpoint` could delete older checkpoints when `dirpath` has changed during resumed training ([#12045](https://github.com/PyTorchLightning/pytorch-lightning/pull/12045))

From 09613094148eee42c5d9d3868d3c728b25159f43 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Thu, 24 Feb 2022 21:06:49 -0800
Subject: [PATCH 08/13] rebase

---
 tests/callbacks/test_gpu_stats_monitor.py | 8 ++++++--
 tests/models/test_gpu.py                  | 6 ++++--
 tests/trainer/test_trainer_cli.py         | 4 +++-
 tests/utilities/test_cli.py               | 4 +++-
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/tests/callbacks/test_gpu_stats_monitor.py b/tests/callbacks/test_gpu_stats_monitor.py
index 5af24000e06ed..a905b9da48ff0 100644
--- a/tests/callbacks/test_gpu_stats_monitor.py
+++ b/tests/callbacks/test_gpu_stats_monitor.py
@@ -47,7 +47,9 @@ def test_gpu_stats_monitor(tmpdir):
         logger=logger,
     )
 
-    with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+    with pytest.deprecated_call(
+        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+    ):
         trainer.fit(model)
     assert trainer.state.finished, f"Training failed with {trainer.state}"
 
@@ -87,7 +89,9 @@ def test_gpu_stats_monitor_no_queries(tmpdir):
     )
     with mock.patch(
         "pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics"
-    ) as log_metrics_mock, pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+    ) as log_metrics_mock, pytest.deprecated_call(
+        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+    ):
         trainer.fit(model)
 
     assert log_metrics_mock.mock_calls[1:] == [
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 6252b601b2870..74535d7a569a2 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -193,10 +193,12 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus
     assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment)
     assert trainer.gpus == gpus
     if rank_zero_only.rank == 0:
-        with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+        with pytest.deprecated_call(
+            match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+        ):
             assert (trainer.data_parallel_device_ids or None) == device_parser.parse_gpu_ids(gpus)
     else:
-        assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus)
+        assert (trainer.data_parallel_device_ids or None) == device_parser.parse_gpu_ids(gpus)
 
 
 @RunIf(min_gpus=1)
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index b84fb4424dcb7..00f223c689b3e 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -178,7 +178,9 @@ def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_devic
     assert args.devices == expected_parsed
 
     trainer = Trainer.from_argparse_args(args)
-    with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+    with pytest.deprecated_call(
+        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+    ):
         assert trainer.data_parallel_device_ids == expected_device_ids
diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py
index 485836e9f7c32..46bbbbb6253fb 100644
--- a/tests/utilities/test_cli.py
+++ b/tests/utilities/test_cli.py
@@ -192,7 +192,9 @@ def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu):
         args = parser.parse_args()
 
     trainer = Trainer.from_argparse_args(args)
-    with pytest.deprecated_call(match="`Trainer.data_parallel_device_ids` was deprecated in v1.6."):
+    with pytest.deprecated_call(
+        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+    ):
         assert trainer.data_parallel_device_ids == expected_gpu

From 25cc8fc0073d340374ab7a4de5fc2832cf7df1bc Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Fri, 25 Feb 2022 12:08:43 -0800
Subject: [PATCH 09/13] rebase and use Trainer.device_ids

---
 .../callbacks/gpu_stats_monitor.py                  |  2 +-
 .../logger_connector/logger_connector.py            |  2 +-
 pytorch_lightning/trainer/trainer.py                | 14 +++++-----
 tests/callbacks/test_gpu_stats_monitor.py           | 11 ++------
 tests/deprecated_api/test_remove_1-8.py             | 27 +++++++++++++++++++
 tests/models/test_gpu.py                            | 11 +++-----
 tests/trainer/test_trainer_cli.py                   | 12 +++------
 tests/utilities/test_cli.py                         |  9 +++----
 8 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py
index 78fe2f70a8e1a..607dc4cf0efbd 100644
--- a/pytorch_lightning/callbacks/gpu_stats_monitor.py
+++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py
@@ -133,7 +133,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O
         )
 
         # The logical device IDs for selected devices
-        self._device_ids = sorted(set(trainer.data_parallel_device_ids))
+        self._device_ids = sorted(set(trainer.device_ids))
 
         # The unmasked real GPU IDs
         self._gpu_ids = self._get_gpu_ids(self._device_ids)
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 59b5f36564c03..9cc91c5adf171 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -221,7 +221,7 @@ def _log_gpus_metrics(self) -> None:
                 self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
             else:
                 gpu_id = int(key.split("/")[0].split(":")[1])
-                if gpu_id in self.trainer.data_parallel_device_ids:
+                if gpu_id in self.trainer.device_ids:
                     self.trainer.lightning_module.log(
                         key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
                     )
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 92024ccc29167..c32a5a2b2ec68 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -58,7 +58,7 @@
     SimpleProfiler,
     XLAProfiler,
 )
-from pytorch_lightning.strategies import ParallelStrategy, SingleDeviceStrategy, Strategy
+from pytorch_lightning.strategies import ParallelStrategy, Strategy
 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy
 from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
 from pytorch_lightning.trainer.configuration_validator import verify_loop_configurations
@@ -2087,13 +2087,11 @@ def devices(self) -> int:
 
     @property
     def data_parallel_device_ids(self) -> List[int]:
-        rank_zero_deprecation("`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8.")
-        if isinstance(self.accelerator, GPUAccelerator):
-            if isinstance(self.strategy, ParallelStrategy):
-                return [device.index for device in self.strategy.parallel_devices]
-            elif isinstance(self.strategy, SingleDeviceStrategy):
-                return [0]
-        return []
+        rank_zero_deprecation(
+            "`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+            " Please use `Trainer.device_ids` instead."
+        )
+        return self.device_ids if isinstance(self.accelerator, GPUAccelerator) else []
 
     @property
     def lightning_module(self) -> "pl.LightningModule":
diff --git a/tests/callbacks/test_gpu_stats_monitor.py b/tests/callbacks/test_gpu_stats_monitor.py
index a905b9da48ff0..1e3a9953bf137 100644
--- a/tests/callbacks/test_gpu_stats_monitor.py
+++ b/tests/callbacks/test_gpu_stats_monitor.py
@@ -47,10 +47,7 @@ def test_gpu_stats_monitor(tmpdir):
         logger=logger,
     )
 
-    with pytest.deprecated_call(
-        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
-    ):
-        trainer.fit(model)
+    trainer.fit(model)
     assert trainer.state.finished, f"Training failed with {trainer.state}"
 
     path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
@@ -84,11 +84,7 @@ def test_gpu_stats_monitor_no_queries(tmpdir):
         devices=1,
         callbacks=[gpu_stats],
     )
-    with mock.patch(
-        "pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics"
-    ) as log_metrics_mock, pytest.deprecated_call(
-        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
-    ):
+    with mock.patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics") as log_metrics_mock:
         trainer.fit(model)
 
     assert log_metrics_mock.mock_calls[1:] == [
diff --git a/tests/deprecated_api/test_remove_1-8.py b/tests/deprecated_api/test_remove_1-8.py
index 035f76a748ee1..f556c241e6c82 100644
--- a/tests/deprecated_api/test_remove_1-8.py
+++ b/tests/deprecated_api/test_remove_1-8.py
@@ -962,3 +962,30 @@ def test_trainer_num_gpu_0(monkeypatch, gpus, expected_num_gpus, strategy):
         " Please use `Trainer.num_devices` instead."
     ):
         assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus
+
+
+@pytest.mark.parametrize(
+    ["trainer_kwargs", "expected_data_parallel_device_ids"],
+    [
+        ({}, []),
+        ({"devices": 1}, []),
+        ({"devices": "1"}, []),
+        ({"accelerator": "gpu", "devices": 1}, [0]),
+        ({"accelerator": "gpu", "devices": 2}, [0, 1]),
+        ({"accelerator": "gpu", "devices": [1]}, [1]),
+        ({"accelerator": "gpu", "devices": "0"}, []),
+        ({"accelerator": "gpu", "devices": "0,"}, [0]),
+    ],
+)
+def test_trainer_data_parallel_device_ids(monkeypatch, trainer_kwargs, expected_data_parallel_device_ids):
+    """Test multi type argument with bool."""
+
+    monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+    monkeypatch.setattr(torch.cuda, "device_count", lambda: 2)
+
+    trainer = Trainer(**trainer_kwargs)
+    with pytest.deprecated_call(
+        match="`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
+        " Please use `Trainer.device_ids` instead."
+    ):
+        assert trainer.data_parallel_device_ids == expected_data_parallel_device_ids
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 74535d7a569a2..306604737db53 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -27,7 +27,6 @@
 from pytorch_lightning.utilities import device_parser
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY
-from pytorch_lightning.utilities.rank_zero import rank_zero_only
 from tests.helpers import BoringModel
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.imports import Batch, Dataset, Example, Field, LabelField
@@ -192,13 +191,9 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus
     trainer = Trainer(gpus=gpus)
     assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment)
     assert trainer.gpus == gpus
-    if rank_zero_only.rank == 0:
-        with pytest.deprecated_call(
-            match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
-        ):
-            assert (trainer.data_parallel_device_ids or None) == device_parser.parse_gpu_ids(gpus)
-    else:
-        assert (trainer.data_parallel_device_ids or None) == device_parser.parse_gpu_ids(gpus)
+    # when use gpu
+    if device_parser.parse_gpu_ids(gpus) is not None:
+        assert trainer.device_ids == device_parser.parse_gpu_ids(gpus)
 
 
 @RunIf(min_gpus=1)
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index 00f223c689b3e..37a02e895d560 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -161,10 +161,10 @@ def test_argparse_args_parsing_fast_dev_run(cli_args, expected):
 
 
 @pytest.mark.parametrize(
-    ["cli_args", "expected_parsed", "expected_device_ids"],
-    [("", None, []), ("--accelerator gpu --devices 1", "1", [0]), ("--accelerator gpu --devices 0,", "0,", [])],
+    ["cli_args", "expected_parsed"],
+    [("", None), ("--accelerator gpu --devices 1", "1"), ("--accelerator gpu --devices 0,", "0,")],
 )
-def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_device_ids, monkeypatch):
+def test_argparse_args_parsing_devices(cli_args, expected_parsed, monkeypatch):
     """Test multi type argument with bool."""
@@ -177,11 +177,7 @@ def test_argparse_args_parsing_devices(cli_args, expected_parsed, expected_devic
     args = Trainer.parse_argparser(parser)
     assert args.devices == expected_parsed
 
-    trainer = Trainer.from_argparse_args(args)
-    with pytest.deprecated_call(
-        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
-    ):
-        assert trainer.data_parallel_device_ids == expected_device_ids
+    assert Trainer.from_argparse_args(args)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py
index 46bbbbb6253fb..8afd831901ba7 100644
--- a/tests/utilities/test_cli.py
+++ b/tests/utilities/test_cli.py
@@ -180,7 +180,9 @@ def test_parse_args_parsing_complex_types(cli_args, expected, instantiate):
     assert Trainer.from_argparse_args(args)
 
 
-@pytest.mark.parametrize(["cli_args", "expected_gpu"], [("--gpus 1", [0]), ("--gpus 0,", [0]), ("--gpus 0,1", [0, 1])])
+@pytest.mark.parametrize(
+    ["cli_args", "expected_gpu"], [("--gpus 1", [0]), ("--gpus 0,", [0]), ("--gpus 1,", [1]), ("--gpus 0,1", [0, 1])]
+)
 def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu):
     """Test parsing of gpus and instantiation of Trainer."""
     monkeypatch.setattr("torch.cuda.device_count", lambda: 2)
@@ -192,10 +194,7 @@ def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu):
         args = parser.parse_args()
 
     trainer = Trainer.from_argparse_args(args)
-    with pytest.deprecated_call(
-        match="Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
-    ):
-        assert trainer.data_parallel_device_ids == expected_gpu
+    assert trainer.device_ids == expected_gpu
 
 
 @pytest.mark.skipif(

From 0a268f6ee636f51a46294b0b0d5a28ea4e20a3ea Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Sun, 20 Mar 2022 11:42:38 -0700
Subject: [PATCH 10/13] keep returning None when GPU not used

---
 pytorch_lightning/trainer/trainer.py    |  4 ++--
 tests/deprecated_api/test_remove_1-8.py | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index c32a5a2b2ec68..6365c82eca61a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -2086,12 +2086,12 @@ def devices(self) -> int:
         return self.num_devices
 
     @property
-    def data_parallel_device_ids(self) -> List[int]:
+    def data_parallel_device_ids(self) -> Optional[List[int]]:
         rank_zero_deprecation(
             "`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8."
             " Please use `Trainer.device_ids` instead."
         )
-        return self.device_ids if isinstance(self.accelerator, GPUAccelerator) else []
+        return self.device_ids if isinstance(self.accelerator, GPUAccelerator) else None
 
     @property
     def lightning_module(self) -> "pl.LightningModule":
diff --git a/tests/deprecated_api/test_remove_1-8.py b/tests/deprecated_api/test_remove_1-8.py
index f556c241e6c82..c5ad12b11a205 100644
--- a/tests/deprecated_api/test_remove_1-8.py
+++ b/tests/deprecated_api/test_remove_1-8.py
@@ -967,21 +967,21 @@ def test_trainer_num_gpu_0(monkeypatch, gpus, expected_num_gpus, strategy):
 @pytest.mark.parametrize(
     ["trainer_kwargs", "expected_data_parallel_device_ids"],
     [
-        ({}, []),
-        ({"devices": 1}, []),
-        ({"devices": "1"}, []),
+        ({}, None),
+        ({"devices": 1}, None),
+        ({"devices": "1"}, None),
         ({"accelerator": "gpu", "devices": 1}, [0]),
         ({"accelerator": "gpu", "devices": 2}, [0, 1]),
         ({"accelerator": "gpu", "devices": [1]}, [1]),
-        ({"accelerator": "gpu", "devices": "0"}, []),
+        ({"accelerator": "gpu", "devices": "0"}, None),
         ({"accelerator": "gpu", "devices": "0,"}, [0]),
     ],
 )
 def test_trainer_data_parallel_device_ids(monkeypatch, trainer_kwargs, expected_data_parallel_device_ids):
     """Test multi type argument with bool."""
-
-    monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
-    monkeypatch.setattr(torch.cuda, "device_count", lambda: 2)
+    if trainer_kwargs.get("accelerator") == "gpu":
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+        monkeypatch.setattr(torch.cuda, "device_count", lambda: 2)
 
     trainer = Trainer(**trainer_kwargs)
     with pytest.deprecated_call(

From ab6469186a2f776926a93135479d82bf209d01e0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 21 Mar 2022 21:32:31 +0000
Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/deprecated_api/test_remove_1-8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/deprecated_api/test_remove_1-8.py b/tests/deprecated_api/test_remove_1-8.py
index 7f9c84de36548..553eb6f8f860e 100644
--- a/tests/deprecated_api/test_remove_1-8.py
+++ b/tests/deprecated_api/test_remove_1-8.py
@@ -985,7 +985,7 @@ def stop(self, action_name: str) -> None:
     # No deprecation message
     CustomProfiler2()
-    
+
 
 @pytest.mark.parametrize(
     ["trainer_kwargs", "expected_data_parallel_device_ids"],
     [

From 07b7dc5143b1db268c30908e1829eadab60f68f0 Mon Sep 17 00:00:00 2001
From: DuYicong515
Date: Mon, 21 Mar 2022 14:35:49 -0700
Subject: [PATCH 12/13] update changelog

---
 CHANGELOG.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 944250f9d44f8..86fb00b23784d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -368,9 +368,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - The strategies that support `sync_batchnorm` now only apply it when fitting ([#11919](https://github.com/PyTorchLightning/pytorch-lightning/pull/11919))
 
-- Changed `Trainer.data_parallel_device_ids` to return a list of GPU device indexes ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
-
-
 ### Deprecated
 
 - Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/PyTorchLightning/pytorch-lightning/pull/11141))

From 26bcf3bb3ab0c72785ec8f89c570f09c7c3c45e2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 22 Mar 2022 16:31:41 +0000
Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab21e6125e5ed..cda7110dc7429 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -538,7 +538,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Deprecated `Trainer.num_processes` in favor of `Trainer.num_devices` ([#12388](https://github.com/PyTorchLightning/pytorch-lightning/pull/12388))
 
-- Deprecated `Trainer.data_parallel_device_ids` in favor of `Trainer.device_ids` ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072)) 
+- Deprecated `Trainer.data_parallel_device_ids` in favor of `Trainer.device_ids` ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072))
 
 
 ### Removed
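
Migration note: downstream code that still reads `Trainer.data_parallel_device_ids` can
switch to `Trainer.device_ids`, the replacement named in the deprecation message added by
this series. A minimal before/after sketch, assuming PyTorch Lightning >= 1.6; the
accelerator/devices arguments are illustrative and need a 2-GPU machine at runtime:

    from pytorch_lightning import Trainer

    trainer = Trainer(accelerator="gpu", devices=2)

    # Deprecated in v1.6, removal planned for v1.8: emits a deprecation
    # warning, and returns None when no GPU accelerator is configured.
    device_ids = trainer.data_parallel_device_ids  # [0, 1]

    # Replacement: returns the indices of the devices the Trainer will
    # run on, without a warning and regardless of accelerator type.
    device_ids = trainer.device_ids  # [0, 1]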