From e3eb9353302c6301707bca0ceedf6583dddac49e Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 4 Nov 2021 18:45:58 +0100 Subject: [PATCH 01/16] Drop torch 1.6 support --- .github/workflows/ci_dockers.yml | 2 +- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 5 ----- .github/workflows/events-nightly.yml | 2 +- .github/workflows/release-docker.yml | 2 +- dockers/base-cuda/Dockerfile | 2 +- environment.yml | 5 +++-- requirements.txt | 2 +- requirements/adjust_versions.py | 1 - requirements/examples.txt | 2 +- requirements/extra.txt | 2 +- 11 files changed, 11 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 701223c795a3b..02426529574f6 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -93,7 +93,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index edae03db7936b..e0808a79fd384 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full - pytorch-version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 8be8fd1146864..57aedf68dcb84 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -28,11 +28,6 @@ jobs: - {os: macOS-10.15, python-version: "3.6", requires: "oldest", release: "stable"} # nightly: add when there's a release candidate #- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} - exclude: - # PyTorch 1.6 is not available with Python 3.9: https://github.com/pytorch/pytorch/issues/46205 - - {os: ubuntu-18.04, python-version: "3.9", requires: "oldest", release: "stable"} - - {os: windows-2019, python-version: "3.9", requires: "oldest", release: "stable"} - - {os: macOS-10.15, python-version: "3.9", requires: "oldest", release: "stable"} # Timeout: https://stackoverflow.com/a/59076067/4521646 # TODO: the macOS is taking too long, probably caching did not work... 
diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index ce2072e5f45aa..f450e98380f10 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -123,7 +123,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 92bf62d3c1ead..f7017d35d9e88 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python_version: ["3.6", "3.7", "3.8", "3.9"] - pytorch_version: ["1.6", "1.7", "1.8", "1.9"] + pytorch_version: ["1.7", "1.8", "1.9"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index ab26af6c7accf..4d2979e32034a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=10.2 +ARG CUDA_VERSION=11.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 diff --git a/environment.yml b/environment.yml index fb21d21c97730..e4fde41c3770a 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.17.2 - - pytorch>=1.6 + - pytorch>=1.7 - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,9 +41,10 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 + - torchtext>=0.8 # Examples - - torchvision>=0.6 + - torchvision>=0.8 - pip: - test-tube>=0.7.5 diff --git a/requirements.txt b/requirements.txt index 69074cbfb249c..2638022a4733a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.17.2 -torch>=1.6 +torch>=1.7 future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1 diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 3ebb3c28835b3..634e5731e265e 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -14,7 +14,6 @@ dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), dict(torch="1.7.0", torchvision="0.8.1", torchtext="0.8.0"), - dict(torch="1.6.0", torchvision="0.7.0", torchtext="0.7"), ] diff --git a/requirements/examples.txt b/requirements/examples.txt index e38f1f92bcb83..cd8073c9304fa 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.7 +torchvision>=0.8 gym>=0.17.0 ipython[all] diff --git a/requirements/extra.txt b/requirements/extra.txt index e3763fcae487b..c397e8350f179 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -2,7 +2,7 @@ matplotlib>3.1 horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already installed -torchtext>=0.7 +torchtext>=0.8 omegaconf>=2.0.5 hydra-core>=1.0.5 jsonargparse[signatures]>=3.19.3 From a8ced05e1988c983578bca7cc1c59fb4cd7774b9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 05:16:35 +0100 Subject: [PATCH 02/16] Drop 1.6 support --- docs/source/common/trainer.rst | 36 +------ pytorch_lightning/callbacks/quantization.py | 2 +- pytorch_lightning/distributed/dist.py | 5 +- .../overrides/torch_distributed.py | 99 
------------------- .../plugins/training_type/ddp.py | 15 +-- .../plugins/training_type/ddp_spawn.py | 21 ++-- .../connectors/accelerator_connector.py | 5 +- pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/auto_restart.py | 14 +-- pytorch_lightning/utilities/cloud_io.py | 9 +- pytorch_lightning/utilities/imports.py | 3 +- pytorch_lightning/utilities/seed.py | 6 +- tests/callbacks/test_quantization.py | 1 - tests/conftest.py | 6 +- tests/core/test_metric_result_integration.py | 5 +- tests/helpers/datamodules.py | 1 - .../loops/optimization/test_optimizer_loop.py | 2 - tests/loops/test_loops.py | 5 - tests/plugins/test_double_plugin.py | 6 +- tests/profiler/test_profiler.py | 4 +- .../connectors/test_checkpoint_connector.py | 2 - .../connectors/test_signal_connector.py | 2 +- tests/trainer/test_data_loading.py | 2 +- tests/trainer/test_supporters.py | 2 - tests/utilities/test_auto_restart.py | 6 +- 25 files changed, 36 insertions(+), 224 deletions(-) delete mode 100644 pytorch_lightning/overrides/torch_distributed.py diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 8f8cb5ef288e4..26605941fe5e4 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -282,7 +282,7 @@ amp_backend | -Use PyTorch AMP ('native') (available PyTorch 1.6+), or NVIDIA apex ('apex'). +Use PyTorch AMP ('native'), or NVIDIA apex ('apex'). .. testcode:: @@ -1183,35 +1183,13 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin .. note:: 16-bit precision is not supported on CPUs. +.. admonition:: If you are interested in using Apex 16-bit training + :class: dropdown -.. admonition:: When using PyTorch 1.6+, Lightning uses the native AMP implementation to support 16-bit precision. 16-bit precision with PyTorch < 1.6 is supported by NVIDIA Apex library. - :class: dropdown, warning - - NVIDIA Apex and DDP have instability problems. We recommend upgrading to PyTorch 1.6+ in order to use the native AMP 16-bit precision with multiple GPUs. - - If you are using an earlier version of PyTorch (before 1.6), Lightning uses `Apex `_ to support 16-bit training. - + NVIDIA Apex and DDP have instability problems. We recommend using the native AMP for 16-bit precision with multiple GPUs. To use Apex 16-bit training: - 1. Install Apex - - .. code-block:: bash - - # ------------------------ - # OPTIONAL: on your cluster you might need to load CUDA 10 or 9 - # depending on how you installed PyTorch - - # see available modules - module avail - - # load correct CUDA before install - module load cuda-10.0 - # ------------------------ - - # make sure you've loaded a GCC version > 4.0 and < 7.0 - module load gcc-6.1.0 - - pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex + 1. `Install apex. `__ 2. Set the ``precision`` trainer flag to 16. You can customize the `Apex optimization level `_ by setting the `amp_level` flag. @@ -1221,10 +1199,6 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin # turn on 16-bit trainer = Trainer(amp_backend="apex", amp_level="O2", precision=16) - If you need to configure the apex init for your particular use case, or want to customize the - 16-bit training behaviour, override :meth:`pytorch_lightning.core.LightningModule.configure_apex`. 
- - process_position ^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index ca82a574f71d1..42f0d575ffb6f 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -28,7 +28,7 @@ if _TORCH_GREATER_EQUAL_1_8: from torch.quantization import FakeQuantizeBase else: - # For torch 1.6 and 1.7. + # For torch 1.7. from torch.quantization import FakeQuantize as FakeQuantizeBase import pytorch_lightning as pl diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 082e0c617a5f7..a0054d17936b0 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,7 +13,8 @@ # limitations under the License. from typing import Any -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list +import torch.distributed + from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.distributed import group as _group @@ -40,6 +41,6 @@ def broadcast(self, obj: Any, group=_group.WORLD): if self.rank != 0: obj = [None] * len(obj) - broadcast_object_list(obj, 0, group=group or _group.WORLD) + torch.distributed.broadcast_object_list(obj, 0, group=group or _group.WORLD) return obj[0] diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py deleted file mode 100644 index 3cbbe5ea760ff..0000000000000 --- a/pytorch_lightning/overrides/torch_distributed.py +++ /dev/null @@ -1,99 +0,0 @@ -import logging -import pickle - -import torch - -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 - -log = logging.getLogger(__name__) - -if torch.distributed.is_available(): - from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember - -# The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py` -# and enable broadcasting for PyTorch 1.6 and lower. - - -# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160 -def _rank_not_in_group(group): - """Helper that checks if the current process's rank is not in a given group.""" - if group is None: - return False - return group == GroupMember.NON_GROUP_MEMBER - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 -def _object_to_tensor(obj): - buffer = pickle.dumps(obj) - byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] - byte_tensor = torch.ByteTensor(byte_storage) - local_size = torch.LongTensor([byte_tensor.numel()]) - return byte_tensor, local_size - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py -def _tensor_to_object(tensor, tensor_size): - buf = tensor.numpy().tobytes()[:tensor_size] - out = pickle.loads(buf) - return out - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 -def _broadcast_object_list(object_list, src=0, group=None): - if _rank_not_in_group(group): - return - - my_rank = get_rank() - # Serialize object_list elements to tensors on src rank. 
- if my_rank == src: - tensor_list, size_list = zip(*(_object_to_tensor(obj) for obj in object_list)) - object_sizes_tensor = torch.cat(size_list) - else: - object_sizes_tensor = torch.LongTensor(len(object_list)) - - group_backend = get_backend(group) - is_nccl_backend = group_backend == Backend.NCCL - current_device = torch.device("cpu") - if is_nccl_backend: - # See note about using torch.cuda.current_device() here in docstring. - # We cannot simply use my_rank since rank == device is not necessarily - # true. - current_device = torch.device("cuda", torch.cuda.current_device()) - object_sizes_tensor = object_sizes_tensor.to(current_device) - object_sizes_tensor = object_sizes_tensor.to(current_device) - - # Broadcast object sizes - broadcast(object_sizes_tensor, src=src, group=group) - - # Concatenate and broadcast serialized object tensors - if my_rank == src: - object_tensor = torch.cat(tensor_list) - else: - object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) - - if is_nccl_backend: - object_tensor = object_tensor.to(current_device) - - broadcast(object_tensor, src=src, group=group) - - # Deserialize objects using their stored sizes. - offset = 0 - if my_rank != src: - for i, obj_size in enumerate(object_sizes_tensor): - obj_view = object_tensor[offset : offset + obj_size] - obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] - offset += obj_size - object_list[i] = _tensor_to_object(obj_view, obj_size) - - -if not torch.distributed.is_available(): - # avoid failures on early PyTorch versions for Windows where - # not all functions used in `broadcast_object_list` are available. - def _broadcast_noop(obj, *_, **__): - return obj - - broadcast_object_list = _broadcast_noop -elif _TORCH_GREATER_EQUAL_1_8: - from torch.distributed.distributed_c10d import broadcast_object_list -else: - broadcast_object_list = _broadcast_object_list diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ea4820f61ec7c..ad4fbc875eb11 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -34,7 +34,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -43,7 +42,6 @@ _FAIRSCALE_AVAILABLE, _HYDRA_AVAILABLE, _IS_WINDOWS, - _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, @@ -299,15 +297,12 @@ def pre_configure_ddp(self): # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. 
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization - if ( - _TORCH_GREATER_EQUAL_1_7 - and not self.lightning_module.automatic_optimization - and not self._ddp_kwargs.get("find_unused_parameters", False) + if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False ): + # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization rank_zero_warn( - "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " - "to properly work with DDP." + "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP." ) self._ddp_kwargs["find_unused_parameters"] = True @@ -410,7 +405,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - broadcast_object_list(obj, src, group=_group.WORLD) + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def pre_backward(self, closure_loss: torch.Tensor) -> None: diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index ff5159f739cdc..3dcdc05f42ee3 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -27,17 +27,11 @@ import pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import ( - _TORCH_GREATER_EQUAL_1_7, - _TORCH_GREATER_EQUAL_1_8, - rank_zero_deprecation, - rank_zero_warn, -) +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -263,15 +257,12 @@ def pre_configure_ddp(self): # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization - if ( - _TORCH_GREATER_EQUAL_1_7 - and not self.lightning_module.automatic_optimization - and not self._ddp_kwargs.get("find_unused_parameters", False) + if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False ): + # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization rank_zero_warn( - "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " - "to properly work with DDP." 
+ "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP." ) self._ddp_kwargs["find_unused_parameters"] = True @@ -348,7 +339,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - broadcast_object_list(obj, src, group=_group.WORLD) + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def model_to_device(self): diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 5895c1c6a141e..26c6365a14bfd 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -74,7 +74,6 @@ from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, _IPU_AVAILABLE, - _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, ) @@ -190,10 +189,8 @@ def _init_deterministic(self, deterministic: bool) -> None: self.deterministic = deterministic if _TORCH_GREATER_EQUAL_1_8: torch.use_deterministic_algorithms(deterministic) - elif _TORCH_GREATER_EQUAL_1_7: + else: torch.set_deterministic(deterministic) - else: # the minimum version Lightning supports is PyTorch 1.6 - torch._set_deterministic(deterministic) if deterministic: # fixing non-deterministic part of horovod # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 158d7356c91ce..7343e28d6d811 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -44,7 +44,6 @@ _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, _RICH_AVAILABLE, - _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/utilities/auto_restart.py b/pytorch_lightning/utilities/auto_restart.py index f0b50103cf2f2..7a18eb8f0107c 100644 --- a/pytorch_lightning/utilities/auto_restart.py +++ b/pytorch_lightning/utilities/auto_restart.py @@ -318,19 +318,11 @@ def _wrap_generator_samplers(self) -> None: if isinstance(generator, Sampler): continue - # used to handle a weird behaviour from PyTorch 1.6 - # where the sampler is converted to a list_iterator - is_legacy = False - - if isinstance(generator, Generator): - # Generator name have the the form `SamplerName.__iter__` - generator_name = generator.__qualname__.split(".")[0] - else: - # assume the retrieved iterator is coming from sampler. - is_legacy = True + # Generator name have the the form `SamplerName.__iter__` + generator_name = generator.__qualname__.split(".")[0] # validate the base generator name matches a sampler name. 
- if is_legacy or any(sampler_name == generator_name for sampler_name in samplers_names): + if any(sampler_name == generator_name for sampler_name in samplers_names): # wrap the generator into a `FastForwardSampler` sampler = FastForwardSampler(generator, attr_name=generator_attr_name) diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py index 9b40f6d69cfad..2c9eb1f768d3c 100644 --- a/pytorch_lightning/utilities/cloud_io.py +++ b/pytorch_lightning/utilities/cloud_io.py @@ -19,7 +19,6 @@ import fsspec import torch from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem -from packaging.version import Version def load( @@ -59,12 +58,6 @@ def atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None: """ bytesbuffer = io.BytesIO() - # Can't use the new zipfile serialization for 1.6.0 because there's a bug in - # torch.hub.load_state_dict_from_url() that prevents it from loading the new files. - # More details can be found here: https://github.com/pytorch/pytorch/issues/42239 - if Version(torch.__version__).release[:3] == (1, 6, 0): - torch.save(checkpoint, bytesbuffer, _use_new_zipfile_serialization=False) - else: - torch.save(checkpoint, bytesbuffer) + torch.save(checkpoint, bytesbuffer) with fsspec.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index edf5f75aee6a9..5db24fe0f5cff 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -70,7 +70,6 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 -_TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0") _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") @@ -112,4 +111,4 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: # experimental feature within PyTorch Lightning. 
def _fault_tolerant_training() -> bool: - return _TORCH_GREATER_EQUAL_1_7 and int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0)) + return bool(int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0))) diff --git a/pytorch_lightning/utilities/seed.py b/pytorch_lightning/utilities/seed.py index 3b20c53353411..e8fc243f484f8 100644 --- a/pytorch_lightning/utilities/seed.py +++ b/pytorch_lightning/utilities/seed.py @@ -21,7 +21,7 @@ import numpy as np import torch -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.distributed import rank_zero_only log = logging.getLogger(__name__) @@ -113,9 +113,7 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: np.random.seed(ss.generate_state(4)) # Spawn distinct SeedSequences for the PyTorch PRNG and the stdlib random module torch_ss, stdlib_ss = ss.spawn(2) - # PyTorch 1.7 and above takes a 64-bit seed - dtype = np.uint64 if _TORCH_GREATER_EQUAL_1_7 else np.uint32 - torch.manual_seed(torch_ss.generate_state(1, dtype=dtype)[0]) + torch.manual_seed(torch_ss.generate_state(1, dtype=np.uint64)[0]) # use 128 bits expressed as an integer stdlib_seed = (stdlib_ss.generate_state(2, dtype=np.uint64).astype(object) * [1 << 64, 1]).sum() random.seed(stdlib_seed) diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py index fa2ee767bdc8c..24312ce0c3bab 100644 --- a/tests/callbacks/test_quantization.py +++ b/tests/callbacks/test_quantization.py @@ -31,7 +31,6 @@ if _TORCH_GREATER_EQUAL_1_8: from torch.quantization import FakeQuantizeBase else: - # For torch 1.6 and 1.7. from torch.quantization import FakeQuantize as FakeQuantizeBase diff --git a/tests/conftest.py b/tests/conftest.py index 860f9357e4636..3d5548b7bd0ae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,7 @@ import torch.distributed from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 from tests import _PATH_DATASETS @@ -95,10 +95,8 @@ def reset_deterministic_algorithm(): yield if _TORCH_GREATER_EQUAL_1_8: torch.use_deterministic_algorithms(False) - elif _TORCH_GREATER_EQUAL_1_7: + else: torch.set_deterministic(False) - else: # the minimum version Lightning supports is PyTorch 1.6 - torch._set_deterministic(False) @pytest.fixture diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 12fe7f2fb4652..9ec2f150ac5d4 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -33,7 +33,7 @@ ResultCollection, ResultMetric, ) -from pytorch_lightning.utilities.imports import _fault_tolerant_training, _TORCH_GREATER_EQUAL_1_7 +from pytorch_lightning.utilities.imports import _fault_tolerant_training from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -470,21 +470,18 @@ def on_epoch_end(self) -> None: @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload(tmpdir): result_collection_reload(default_root_dir=tmpdir) @RunIf(min_gpus=1) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, 
reason="Requires at least PyTorch 1.7") def test_result_collection_reload_1_gpu_ddp(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=1) @RunIf(min_gpus=2, special=True) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload_2_gpus(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=2) diff --git a/tests/helpers/datamodules.py b/tests/helpers/datamodules.py index 0cb178a749a09..78e806b37937e 100644 --- a/tests/helpers/datamodules.py +++ b/tests/helpers/datamodules.py @@ -46,7 +46,6 @@ def prepare_data(self): self.dataset_cls(self.data_dir, train=False, download=True) def setup(self, stage: Optional[str] = None): - # TODO: need to split using random_split once updated to torch >= 1.6 if stage == "fit" or stage is None: self.mnist_train = self.dataset_cls(self.data_dir, train=True) if stage == "test" or stage is None: diff --git a/tests/loops/optimization/test_optimizer_loop.py b/tests/loops/optimization/test_optimizer_loop.py index 7e17cbbd56645..ae77c4387a398 100644 --- a/tests/loops/optimization/test_optimizer_loop.py +++ b/tests/loops/optimization/test_optimizer_loop.py @@ -24,7 +24,6 @@ from pytorch_lightning.loops.optimization.optimizer_loop import ClosureResult from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel -from tests.helpers.runif import RunIf def test_closure_result_deepcopy(): @@ -140,7 +139,6 @@ class CustomException(Exception): pass -@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("stop_epoch", (0, 1)) @pytest.mark.parametrize("stop_batch", (0, 1, 2)) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index dd390ab4939d5..f6097864d65f3 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -253,7 +253,6 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: assert state_dict == {"state_dict": {"a": 1}, "progress": {"increment": 1}} -@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("stop_epoch", (1, 2)) @pytest.mark.parametrize("stop_batch", (1, 2)) @@ -323,7 +322,6 @@ def val_dataloader(self): assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected -@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3)) @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) @@ -526,7 +524,6 @@ def configure_optimizers_multiple(self): assert state_dict["epoch_progress"]["current"]["started"] == stop_epoch -@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) def test_loop_state_on_complete_run(n_optimizers, tmpdir): @@ -662,7 +659,6 @@ def train_dataloader(self): assert checkpoint["loops"]["fit_loop"] == expected -@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) def test_fit_loop_reset(tmpdir): """Test that the reset logic in fit- and epoch loop is aware of whether the loop is restarting from a completed @@ -752,7 +748,6 @@ def test_fit_loop_reset(tmpdir): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( ["train_datasets", 
"val_datasets"], [([RandomDataset], [RandomDataset]), ([RandomDataset], [RandomDataset, RandomDataset])], diff --git a/tests/plugins/test_double_plugin.py b/tests/plugins/test_double_plugin.py index cadd02c692af5..b3fdf87428522 100644 --- a/tests/plugins/test_double_plugin.py +++ b/tests/plugins/test_double_plugin.py @@ -20,7 +20,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins import DoublePrecisionPlugin -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from tests.helpers.boring_model import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -137,10 +136,7 @@ def on_fit_start(self): [ DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward, - pytest.param( - DoublePrecisionBoringModelComplexBuffer, - marks=pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="torch.complex not available"), - ), + DoublePrecisionBoringModelComplexBuffer, ], ) def test_double_precision(tmpdir, boring_model): diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 7369ab9a4a140..faf00f8890783 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -26,7 +26,6 @@ from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.profiler import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler from pytorch_lightning.profiler.pytorch import RegisterRecordFunction -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from tests.helpers import BoringModel, ManualOptimBoringModel @@ -394,8 +393,7 @@ def test_pytorch_profiler_nested(tmpdir): names = {"a", "b", "c"} ops = {"add", "empty", "fill_", "ones", "zero_", "zeros"} - if _TORCH_GREATER_EQUAL_1_7: - ops = {"aten::" + op for op in ops} + ops = {"aten::" + op for op in ops} expected = names.union(ops) assert events_name == expected, (events_name, torch.__version__, platform.system()) diff --git a/tests/trainer/connectors/test_checkpoint_connector.py b/tests/trainer/connectors/test_checkpoint_connector.py index 6b408845ed879..4a42265eb21b0 100644 --- a/tests/trainer/connectors/test_checkpoint_connector.py +++ b/tests/trainer/connectors/test_checkpoint_connector.py @@ -21,7 +21,6 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerFn from tests.helpers import BoringModel -from tests.helpers.runif import RunIf class HPCHookdedModel(BoringModel): @@ -133,7 +132,6 @@ def test_hpc_max_ckpt_version(tmpdir): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(min_torch="1.7.0") def test_loops_restore(tmpdir): """Test that required loop state_dict is loaded correctly by checkpoint connector.""" model = BoringModel() diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index 3da8c100fe40c..aa5407e2f1228 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -26,7 +26,7 @@ @pytest.mark.parametrize("register_handler", [False, True]) @pytest.mark.parametrize("terminate_gracefully", [False, True]) -@RunIf(min_torch="1.7.0", skip_windows=True) +@RunIf(skip_windows=True) def test_fault_tolerant_sig_handler(register_handler, terminate_gracefully, tmpdir): # hack to reset the signal diff --git a/tests/trainer/test_data_loading.py 
b/tests/trainer/test_data_loading.py index 0f6abd38e6836..97097b2074ca1 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -26,7 +26,7 @@ from tests.helpers.runif import RunIf -@RunIf(skip_windows=True, min_torch="1.7.0") +@RunIf(skip_windows=True) @pytest.mark.parametrize("mode", (1, 2, 3)) def test_replace_distributed_sampler(tmpdir, mode): class IndexedRandomDataset(RandomDataset): diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 204f3079f544b..1518779bcc7ef 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -34,7 +34,6 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForwardSampler from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 def test_tensor_running_accum_reset(): @@ -310,7 +309,6 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): assert calculated_length == expected_length -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_GPUS": "2"}) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("torch.cuda.is_available", return_value=True) diff --git a/tests/utilities/test_auto_restart.py b/tests/utilities/test_auto_restart.py index 4e3385cebecbc..b36a9d1d76941 100644 --- a/tests/utilities/test_auto_restart.py +++ b/tests/utilities/test_auto_restart.py @@ -690,7 +690,6 @@ def create_dataloader(): } -@RunIf(min_torch="1.7.0") @pytest.mark.parametrize("use_fault_tolerant", ["0", "1"]) def test_data_loading_wraps_dataset_and_samplers(use_fault_tolerant, tmpdir): """This test ensures the dataset and sampler are properly wrapped when fault tolerant is enabled.""" @@ -785,7 +784,6 @@ def __len__(self): # TODO: test with `RandomGeneratorGetItemDataset` @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( "dataset_class", [ @@ -921,7 +919,6 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_ @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( "dataset_classes", [ @@ -975,7 +972,6 @@ def test_dataset_rng_states_restart_with_lightning(tmpdir, dataset_classes, mult @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( ["train_datasets", "val_datasets"], [ @@ -1139,7 +1135,7 @@ def _fit_model( @pytest.mark.parametrize("failure_on_training", [False, True]) @pytest.mark.parametrize("failure_on_step", [False, True]) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(min_torch="1.7.0", skip_windows=True) +@RunIf(skip_windows=True) def test_auto_restart_under_signal(on_last_batch, val_check_interval, failure_on_training, failure_on_step, tmpdir): """This test asserts that if a signal is being sent during the training / validation phase, the model should restart in a reproducible way.""" From 33c5d51f9d4383ab2b55a35ae92c97c07fe471d1 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 05:20:23 +0100 Subject: [PATCH 03/16] Update CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
a616d97678a1f..a2149e4e6bfe0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,7 +73,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - -- +- Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367)) ### Fixed From aa6d9095b3d41446a84ea26c87a1db06c9e2a368 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 05:42:40 +0100 Subject: [PATCH 04/16] Fixes --- docs/source/common/trainer.rst | 17 +++++++------ pytorch_lightning/utilities/auto_restart.py | 27 +++++++-------------- tests/callbacks/test_quantization.py | 1 + 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 26605941fe5e4..bb19c3b914082 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1162,7 +1162,7 @@ precision | -Lightning supports either double precision (64), full precision (32), or half precision (16) training. +Lightning supports either double (64), float (32), bfloat16 (bf16), or half (16) precision training. Half precision, or mixed precision, is the combined use of 32 and 16 bit floating points to reduce memory footprint during model training. This can result in improved performance, achieving +3X speedups on modern GPUs. @@ -1170,20 +1170,21 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin :skipif: not torch.cuda.is_available() # default used by the Trainer - trainer = Trainer(precision=32, gpus=1) + trainer = Trainer(precision=32) # 16-bit precision - trainer = Trainer(precision=16, gpus=1) + trainer = Trainer(precision=16, gpus=1) # works only on CUDA - # 64-bit precision - trainer = Trainer(precision=64, gpus=1) + # bfloat16 precision + trainer = Trainer(precision="bf16") + # 64-bit precision + trainer = Trainer(precision=64) -.. note:: When running on TPUs, torch.float16 will be used but tensor printing will still show torch.float32. -.. note:: 16-bit precision is not supported on CPUs. +.. note:: When running on TPUs, torch.bfloat16 will be used but tensor printing will still show torch.float32. -.. admonition:: If you are interested in using Apex 16-bit training +.. admonition:: If you are interested in using Apex 16-bit training: :class: dropdown NVIDIA Apex and DDP have instability problems. We recommend using the native AMP for 16-bit precision with multiple GPUs. diff --git a/pytorch_lightning/utilities/auto_restart.py b/pytorch_lightning/utilities/auto_restart.py index 7a18eb8f0107c..ef52717636d90 100644 --- a/pytorch_lightning/utilities/auto_restart.py +++ b/pytorch_lightning/utilities/auto_restart.py @@ -305,9 +305,6 @@ def _wrap_generator_samplers(self) -> None: # access wrapped dataset attributes dataset_dict = self.dataset.__dict__ - # create a tuple of sampler names - samplers_names = tuple(v.__class__.__name__ for k, v in dataset_dict.items() if isinstance(v, Sampler)) - # create a dictionary of generator present within the dataset attributes dataset_sampler_generators = {k: v for k, v in dataset_dict.items() if isinstance(v, (Generator, Iterator))} @@ -318,23 +315,17 @@ def _wrap_generator_samplers(self) -> None: if isinstance(generator, Sampler): continue - # Generator name have the the form `SamplerName.__iter__` - generator_name = generator.__qualname__.split(".")[0] - - # validate the base generator name matches a sampler name. 
- if any(sampler_name == generator_name for sampler_name in samplers_names): - - # wrap the generator into a `FastForwardSampler` - sampler = FastForwardSampler(generator, attr_name=generator_attr_name) + # wrap the generator into a `FastForwardSampler` + sampler = FastForwardSampler(generator, attr_name=generator_attr_name) - # if `CaptureIterableDataset` was available, the sampler should reload its own state. - if self._state_dict is not None: - sampler.load_state_dict(self._state_dict[generator_attr_name]) - # store the samplers - self.samplers[generator_attr_name] = sampler + # if `CaptureIterableDataset` was available, the sampler should reload its own state. + if self._state_dict is not None: + sampler.load_state_dict(self._state_dict[generator_attr_name]) + # store the samplers + self.samplers[generator_attr_name] = sampler - # replace generator with the generator from the `FastForwardSampler`. - dataset_dict[generator_attr_name] = iter(sampler) + # replace generator with the generator from the `FastForwardSampler`. + dataset_dict[generator_attr_name] = iter(sampler) self.reset_on_epoch() diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py index 24312ce0c3bab..e3dfb9b6a7edf 100644 --- a/tests/callbacks/test_quantization.py +++ b/tests/callbacks/test_quantization.py @@ -31,6 +31,7 @@ if _TORCH_GREATER_EQUAL_1_8: from torch.quantization import FakeQuantizeBase else: + # For torch 1.7. from torch.quantization import FakeQuantize as FakeQuantizeBase From ab4ba8cd507f74f91a8241b1ead8a0bee14af00f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 05:47:00 +0100 Subject: [PATCH 05/16] Split change --- docs/source/common/trainer.rst | 51 +++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index bb19c3b914082..8f8cb5ef288e4 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -282,7 +282,7 @@ amp_backend | -Use PyTorch AMP ('native'), or NVIDIA apex ('apex'). +Use PyTorch AMP ('native') (available PyTorch 1.6+), or NVIDIA apex ('apex'). .. testcode:: @@ -1162,7 +1162,7 @@ precision | -Lightning supports either double (64), float (32), bfloat16 (bf16), or half (16) precision training. +Lightning supports either double precision (64), full precision (32), or half precision (16) training. Half precision, or mixed precision, is the combined use of 32 and 16 bit floating points to reduce memory footprint during model training. This can result in improved performance, achieving +3X speedups on modern GPUs. @@ -1170,27 +1170,48 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin :skipif: not torch.cuda.is_available() # default used by the Trainer - trainer = Trainer(precision=32) + trainer = Trainer(precision=32, gpus=1) # 16-bit precision - trainer = Trainer(precision=16, gpus=1) # works only on CUDA - - # bfloat16 precision - trainer = Trainer(precision="bf16") + trainer = Trainer(precision=16, gpus=1) # 64-bit precision - trainer = Trainer(precision=64) + trainer = Trainer(precision=64, gpus=1) + + +.. note:: When running on TPUs, torch.float16 will be used but tensor printing will still show torch.float32. + +.. note:: 16-bit precision is not supported on CPUs. -.. note:: When running on TPUs, torch.bfloat16 will be used but tensor printing will still show torch.float32. +.. 
admonition:: When using PyTorch 1.6+, Lightning uses the native AMP implementation to support 16-bit precision. 16-bit precision with PyTorch < 1.6 is supported by NVIDIA Apex library. + :class: dropdown, warning -.. admonition:: If you are interested in using Apex 16-bit training: - :class: dropdown + NVIDIA Apex and DDP have instability problems. We recommend upgrading to PyTorch 1.6+ in order to use the native AMP 16-bit precision with multiple GPUs. + + If you are using an earlier version of PyTorch (before 1.6), Lightning uses `Apex `_ to support 16-bit training. - NVIDIA Apex and DDP have instability problems. We recommend using the native AMP for 16-bit precision with multiple GPUs. To use Apex 16-bit training: - 1. `Install apex. `__ + 1. Install Apex + + .. code-block:: bash + + # ------------------------ + # OPTIONAL: on your cluster you might need to load CUDA 10 or 9 + # depending on how you installed PyTorch + + # see available modules + module avail + + # load correct CUDA before install + module load cuda-10.0 + # ------------------------ + + # make sure you've loaded a GCC version > 4.0 and < 7.0 + module load gcc-6.1.0 + + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex 2. Set the ``precision`` trainer flag to 16. You can customize the `Apex optimization level `_ by setting the `amp_level` flag. @@ -1200,6 +1221,10 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin # turn on 16-bit trainer = Trainer(amp_backend="apex", amp_level="O2", precision=16) + If you need to configure the apex init for your particular use case, or want to customize the + 16-bit training behaviour, override :meth:`pytorch_lightning.core.LightningModule.configure_apex`. + + process_position ^^^^^^^^^^^^^^^^ From 90094eacb63f317c935557f8158cd0db6ed73a43 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 06:03:31 +0100 Subject: [PATCH 06/16] Undo change --- dockers/base-cuda/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 4d2979e32034a..ab26af6c7accf 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARG CUDA_VERSION=11.1 +ARG CUDA_VERSION=10.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 From b9b4ccbfb5e22ecbec884b051a097b01aa246d59 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 06:53:56 +0100 Subject: [PATCH 07/16] 1.7 -> 1.7.1 https://github.com/pytorch/pytorch/issues/47354 --- .github/workflows/ci_dockers.yml | 6 +++--- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/events-nightly.yml | 6 +++--- .github/workflows/release-docker.yml | 2 +- dockers/ipu-ci-runner/Dockerfile | 2 +- environment.yml | 6 +++--- requirements.txt | 2 +- requirements/examples.txt | 2 +- requirements/extra.txt | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 02426529574f6..4fffd172690c2 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -93,12 +93,12 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7.1", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 - run: | - cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7') else 10.2)" 2>&1) + cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7.1') else 10.2)" 2>&1) echo "::set-output name=CUDA::$cuda" id: extend - name: Build Conda Docker @@ -122,7 +122,7 @@ jobs: python_version: ["3.9"] # latest # TODO: upgrade - PopTorch 2.2 uses torch 1.9, see: # https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html#version-compatibility - pytorch_version: ["1.7"] + pytorch_version: ["1.7.1"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index e0808a79fd384..7f3fb1e89e837 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full - pytorch-version: ["1.7", "1.8", "1.9", "1.10"] + pytorch-version: ["1.7.1", "1.8", "1.9", "1.10"] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index f450e98380f10..d803bc5288966 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -123,7 +123,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.7", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7.1", "1.8", "1.9", "1.10"] steps: - name: Checkout @@ -138,7 +138,7 @@ jobs: # see: https://pytorch.org/get-started/previous-versions/ - run: | - cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7') else 10.2)" 2>&1) + cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7.1') else 10.2)" 2>&1) echo "::set-output name=CUDA::$cuda" id: extend @@ -164,7 +164,7 @@ jobs: # the config used in 'dockers/ipu-ci-runner/Dockerfile' include: - python_version: "3.9" - pytorch_version: "1.7" + pytorch_version: "1.7.1" steps: - name: Checkout diff --git 
a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index f7017d35d9e88..ae25e52cecc25 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python_version: ["3.6", "3.7", "3.8", "3.9"] - pytorch_version: ["1.7", "1.8", "1.9"] + pytorch_version: ["1.7.1", "1.8", "1.9"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/ipu-ci-runner/Dockerfile b/dockers/ipu-ci-runner/Dockerfile index 98f769f78fe8f..54990feda318c 100644 --- a/dockers/ipu-ci-runner/Dockerfile +++ b/dockers/ipu-ci-runner/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.7 +ARG PYTORCH_VERSION=1.7.1 FROM pytorchlightning/pytorch_lightning:base-ipu-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/environment.yml b/environment.yml index e4fde41c3770a..c97fef987d060 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.17.2 - - pytorch>=1.7 + - pytorch>=1.7.1 - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,10 +41,10 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 - - torchtext>=0.8 + - torchtext>=0.8.1 # Examples - - torchvision>=0.8 + - torchvision>=0.8.2 - pip: - test-tube>=0.7.5 diff --git a/requirements.txt b/requirements.txt index 2638022a4733a..942fd0f81f6dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.17.2 -torch>=1.7 +torch>=1.7.1 future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1 diff --git a/requirements/examples.txt b/requirements/examples.txt index cd8073c9304fa..8c0d96bd1e6cf 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.8 +torchvision>=0.8.2 gym>=0.17.0 ipython[all] diff --git a/requirements/extra.txt b/requirements/extra.txt index c397e8350f179..d274713969030 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -2,7 +2,7 @@ matplotlib>3.1 horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already installed -torchtext>=0.8 +torchtext>=0.8.1 omegaconf>=2.0.5 hydra-core>=1.0.5 jsonargparse[signatures]>=3.19.3 From 313c72c36a62258a7c5067e3001abec4b0787c55 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 06:57:24 +0100 Subject: [PATCH 08/16] Force trigger nightly --- .github/workflows/events-nightly.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index d803bc5288966..3cbcd0d9039d5 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -6,6 +6,8 @@ on: schedule: # At the end of every day - cron: "0 0 * * *" + # FIXME + push: {} env: PUSH_TO_HUB: true From 4a4831598ccfeb0ed84f7926ae37e5806e39f61a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 5 Nov 2021 14:53:38 +0100 Subject: [PATCH 09/16] Update .github/workflows/events-nightly.yml Co-authored-by: Aki Nitta --- .github/workflows/events-nightly.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 3cbcd0d9039d5..d803bc5288966 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -6,8 +6,6 @@ on: schedule: # At the end of every day - cron: "0 0 * * *" - # FIXME - push: {} env: PUSH_TO_HUB: true From 
c498a3b0c21699de8818ca18ae2d4c67547d4caa Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 5 Nov 2021 15:26:24 +0100 Subject: [PATCH 10/16] Revert 1.7.1 change - try wildcard --- .github/workflows/ci_dockers.yml | 6 +++--- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/events-nightly.yml | 6 +++--- .github/workflows/release-docker.yml | 2 +- dockers/ipu-ci-runner/Dockerfile | 2 +- environment.yml | 6 +++--- requirements.txt | 2 +- requirements/examples.txt | 2 +- requirements/extra.txt | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 4fffd172690c2..02426529574f6 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -93,12 +93,12 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.7.1", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout uses: actions/checkout@v2 - run: | - cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7.1') else 10.2)" 2>&1) + cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7') else 10.2)" 2>&1) echo "::set-output name=CUDA::$cuda" id: extend - name: Build Conda Docker @@ -122,7 +122,7 @@ jobs: python_version: ["3.9"] # latest # TODO: upgrade - PopTorch 2.2 uses torch 1.9, see: # https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html#version-compatibility - pytorch_version: ["1.7.1"] + pytorch_version: ["1.7"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 7f3fb1e89e837..e0808a79fd384 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8"] # previous to last Python version as that one is already used in test-full - pytorch-version: ["1.7.1", "1.8", "1.9", "1.10"] + pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index d803bc5288966..f450e98380f10 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -123,7 +123,7 @@ jobs: matrix: # the config used in '.github/workflows/ci_test-conda.yml' python_version: ["3.8"] - pytorch_version: ["1.7.1", "1.8", "1.9", "1.10"] + pytorch_version: ["1.7", "1.8", "1.9", "1.10"] steps: - name: Checkout @@ -138,7 +138,7 @@ jobs: # see: https://pytorch.org/get-started/previous-versions/ - run: | - cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7.1') else 10.2)" 2>&1) + cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7') else 10.2)" 2>&1) echo "::set-output name=CUDA::$cuda" id: extend @@ -164,7 +164,7 @@ jobs: # the config used in 'dockers/ipu-ci-runner/Dockerfile' include: - python_version: "3.9" - pytorch_version: "1.7.1" + pytorch_version: "1.7" steps: - name: Checkout diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index ae25e52cecc25..f7017d35d9e88 100644 --- 
a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python_version: ["3.6", "3.7", "3.8", "3.9"] - pytorch_version: ["1.7.1", "1.8", "1.9"] + pytorch_version: ["1.7", "1.8", "1.9"] steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/ipu-ci-runner/Dockerfile b/dockers/ipu-ci-runner/Dockerfile index 54990feda318c..98f769f78fe8f 100644 --- a/dockers/ipu-ci-runner/Dockerfile +++ b/dockers/ipu-ci-runner/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.7.1 +ARG PYTORCH_VERSION=1.7 FROM pytorchlightning/pytorch_lightning:base-ipu-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} diff --git a/environment.yml b/environment.yml index c97fef987d060..d7d34c387af15 100644 --- a/environment.yml +++ b/environment.yml @@ -29,7 +29,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.17.2 - - pytorch>=1.7.1 + - pytorch>=1.7.* - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,10 +41,10 @@ dependencies: - scikit-learn>=0.20.0 - matplotlib>=3.1.1 - omegaconf>=2.0.5 - - torchtext>=0.8.1 + - torchtext>=0.8.* # Examples - - torchvision>=0.8.2 + - torchvision>=0.8.* - pip: - test-tube>=0.7.5 diff --git a/requirements.txt b/requirements.txt index 942fd0f81f6dc..34879d9290acb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.17.2 -torch>=1.7.1 +torch>=1.7.* future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1 diff --git a/requirements/examples.txt b/requirements/examples.txt index 8c0d96bd1e6cf..8591f9bd509c2 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.8.2 +torchvision>=0.8.* gym>=0.17.0 ipython[all] diff --git a/requirements/extra.txt b/requirements/extra.txt index d274713969030..4aea9dad9cfad 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -2,7 +2,7 @@ matplotlib>3.1 horovod>=0.21.2 # no need to install with [pytorch] as pytorch is already installed -torchtext>=0.8.1 +torchtext>=0.8.* omegaconf>=2.0.5 hydra-core>=1.0.5 jsonargparse[signatures]>=3.19.3 From dce9a9cf169af1f11beabee9f637572df5351c86 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sat, 6 Nov 2021 19:28:06 +0100 Subject: [PATCH 11/16] Update adjust versions and test it --- requirements/adjust_versions.py | 55 ++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index 634e5731e265e..8295a726e7873 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -32,28 +32,59 @@ def find_latest(ver: str) -> Dict[str, str]: raise ValueError(f"Missing {ver} in {VERSIONS}") -def main(path_req: str, torch_version: Optional[str] = None) -> None: +def main(req: str, torch_version: Optional[str] = None) -> str: if not torch_version: import torch torch_version = torch.__version__ assert torch_version, f"invalid torch: {torch_version}" - with open(path_req) as fp: - req = fp.read() - # remove comments - req = re.sub(rf"\s*#.*{os.linesep}", os.linesep, req) + # remove comments and strip whitespace + req = re.sub(rf"\s*#.*{os.linesep}", os.linesep, req).strip() latest = find_latest(torch_version) for lib, version in latest.items(): - replace = f"{lib}=={version}" if version else lib - replace += os.linesep - req = re.sub(rf"{lib}[>=]*[\d\.]*{os.linesep}", replace, req) + replace = f"{lib}=={version}" if version else 
"" + req = re.sub(rf"\b{lib}(?!\w).*", replace, req) - print(req) # on purpose - to debug - with open(path_req, "w") as fp: - fp.write(req) + return req + + +def test(): + requirements = """ + torch>=1.2.* + torch==1.2.3 + torch==1.4 + torch + future>=0.17.1 + pytorch==1.5.6+123dev0 + torchvision + torchmetrics>=0.4.1 + """ + expected = """ + torch==1.9.1 + torch==1.9.1 + torch==1.9.1 + torch==1.9.1 + future>=0.17.1 + pytorch==1.5.6+123dev0 + torchvision==0.10.1 + torchmetrics>=0.4.1 + """.strip() + actual = main(requirements, "1.9") + assert actual == expected, (actual, expected) if __name__ == "__main__": - main(*sys.argv[1:]) + test() # sanity check + + if len(sys.argv) == 3: + requirements_path, torch_version = sys.argv[1:] + else: + requirements_path, torch_version = sys.argv[1], None + + with open(requirements_path, "r+") as fp: + requirements = fp.read() + requirements = main(requirements, torch_version) + print(requirements) # on purpose - to debug + fp.write(requirements) From 3a6acadd115e86f02d83a788f1978372ab6764f3 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sat, 6 Nov 2021 19:41:34 +0100 Subject: [PATCH 12/16] Undo test changes --- pytorch_lightning/callbacks/quantization.py | 2 +- pytorch_lightning/distributed/dist.py | 5 +- .../overrides/torch_distributed.py | 99 +++++++++++++++++++ .../plugins/training_type/ddp.py | 15 ++- .../plugins/training_type/ddp_spawn.py | 16 +-- .../connectors/accelerator_connector.py | 5 +- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/auto_restart.py | 35 +++++-- pytorch_lightning/utilities/cloud_io.py | 9 +- pytorch_lightning/utilities/imports.py | 3 +- pytorch_lightning/utilities/seed.py | 6 +- tests/callbacks/test_quantization.py | 2 +- tests/conftest.py | 6 +- tests/core/test_metric_result_integration.py | 5 +- tests/helpers/datamodules.py | 1 + .../loops/optimization/test_optimizer_loop.py | 2 + tests/loops/test_loops.py | 5 + tests/plugins/test_double_plugin.py | 6 +- tests/profiler/test_profiler.py | 4 +- .../connectors/test_checkpoint_connector.py | 2 + .../connectors/test_signal_connector.py | 2 +- tests/trainer/test_data_loading.py | 2 +- tests/trainer/test_supporters.py | 2 + tests/utilities/test_auto_restart.py | 6 +- 24 files changed, 203 insertions(+), 38 deletions(-) create mode 100644 pytorch_lightning/overrides/torch_distributed.py diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index 42f0d575ffb6f..ca82a574f71d1 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -28,7 +28,7 @@ if _TORCH_GREATER_EQUAL_1_8: from torch.quantization import FakeQuantizeBase else: - # For torch 1.7. + # For torch 1.6 and 1.7. from torch.quantization import FakeQuantize as FakeQuantizeBase import pytorch_lightning as pl diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index a0054d17936b0..082e0c617a5f7 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,8 +13,7 @@ # limitations under the License. 
from typing import Any -import torch.distributed - +from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.distributed import group as _group @@ -41,6 +40,6 @@ def broadcast(self, obj: Any, group=_group.WORLD): if self.rank != 0: obj = [None] * len(obj) - torch.distributed.broadcast_object_list(obj, 0, group=group or _group.WORLD) + broadcast_object_list(obj, 0, group=group or _group.WORLD) return obj[0] diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py new file mode 100644 index 0000000000000..3cbbe5ea760ff --- /dev/null +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -0,0 +1,99 @@ +import logging +import pickle + +import torch + +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 + +log = logging.getLogger(__name__) + +if torch.distributed.is_available(): + from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember + +# The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py` +# and enable broadcasting for PyTorch 1.6 and lower. + + +# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160 +def _rank_not_in_group(group): + """Helper that checks if the current process's rank is not in a given group.""" + if group is None: + return False + return group == GroupMember.NON_GROUP_MEMBER + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 +def _object_to_tensor(obj): + buffer = pickle.dumps(obj) + byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py +def _tensor_to_object(tensor, tensor_size): + buf = tensor.numpy().tobytes()[:tensor_size] + out = pickle.loads(buf) + return out + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 +def _broadcast_object_list(object_list, src=0, group=None): + if _rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip(*(_object_to_tensor(obj) for obj in object_list)) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.LongTensor(len(object_list)) + + group_backend = get_backend(group) + is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. 
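# --- Illustrative sketch, not part of the module above: the backported broadcast
# moves arbitrary Python objects as byte tensors using the same
# `_object_to_tensor` / `_tensor_to_object` pair shown here. A minimal round trip:
import pickle
import torch

obj = {"epoch": 3, "step": 42}
byte_tensor = torch.ByteTensor(torch.ByteStorage.from_buffer(pickle.dumps(obj)))
size = torch.LongTensor([byte_tensor.numel()])
assert pickle.loads(byte_tensor.numpy().tobytes()[: size.item()]) == obj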
+ current_device = torch.device("cuda", torch.cuda.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) + + if is_nccl_backend: + object_tensor = object_tensor.to(current_device) + + broadcast(object_tensor, src=src, group=group) + + # Deserialize objects using their stored sizes. + offset = 0 + if my_rank != src: + for i, obj_size in enumerate(object_sizes_tensor): + obj_view = object_tensor[offset : offset + obj_size] + obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] + offset += obj_size + object_list[i] = _tensor_to_object(obj_view, obj_size) + + +if not torch.distributed.is_available(): + # avoid failures on early PyTorch versions for Windows where + # not all functions used in `broadcast_object_list` are available. + def _broadcast_noop(obj, *_, **__): + return obj + + broadcast_object_list = _broadcast_noop +elif _TORCH_GREATER_EQUAL_1_8: + from torch.distributed.distributed_c10d import broadcast_object_list +else: + broadcast_object_list = _broadcast_object_list diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 8cddac2021ae7..53aff5128ff37 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -34,6 +34,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward +from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -42,6 +43,7 @@ _FAIRSCALE_AVAILABLE, _HYDRA_AVAILABLE, _IS_WINDOWS, + _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, @@ -285,12 +287,15 @@ def pre_configure_ddp(self): # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( - "find_unused_parameters", False + # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization + if ( + _TORCH_GREATER_EQUAL_1_7 + and not self.lightning_module.automatic_optimization + and not self._ddp_kwargs.get("find_unused_parameters", False) ): - # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization rank_zero_warn( - "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP." + "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " + "to properly work with DDP." 
) self._ddp_kwargs["find_unused_parameters"] = True @@ -393,7 +398,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def pre_backward(self, closure_loss: torch.Tensor) -> None: diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 8609b5614ba9c..5e04ca95743eb 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -27,11 +27,12 @@ import pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward +from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, rank_zero_warn +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -245,12 +246,15 @@ def pre_configure_ddp(self): # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( - "find_unused_parameters", False + # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization + if ( + _TORCH_GREATER_EQUAL_1_7 + and not self.lightning_module.automatic_optimization + and not self._ddp_kwargs.get("find_unused_parameters", False) ): - # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization rank_zero_warn( - "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP." + "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " + "to properly work with DDP." 
) self._ddp_kwargs["find_unused_parameters"] = True @@ -327,7 +331,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def model_to_device(self): diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 43eb65ce21a22..e15f7bb853db8 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -74,6 +74,7 @@ from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, _IPU_AVAILABLE, + _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, ) @@ -189,8 +190,10 @@ def _init_deterministic(self, deterministic: bool) -> None: self.deterministic = deterministic if _TORCH_GREATER_EQUAL_1_8: torch.use_deterministic_algorithms(deterministic) - else: + elif _TORCH_GREATER_EQUAL_1_7: torch.set_deterministic(deterministic) + else: # the minimum version Lightning supports is PyTorch 1.6 + torch._set_deterministic(deterministic) if deterministic: # fixing non-deterministic part of horovod # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 7343e28d6d811..158d7356c91ce 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -44,6 +44,7 @@ _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, _RICH_AVAILABLE, + _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/utilities/auto_restart.py b/pytorch_lightning/utilities/auto_restart.py index ef52717636d90..f0b50103cf2f2 100644 --- a/pytorch_lightning/utilities/auto_restart.py +++ b/pytorch_lightning/utilities/auto_restart.py @@ -305,6 +305,9 @@ def _wrap_generator_samplers(self) -> None: # access wrapped dataset attributes dataset_dict = self.dataset.__dict__ + # create a tuple of sampler names + samplers_names = tuple(v.__class__.__name__ for k, v in dataset_dict.items() if isinstance(v, Sampler)) + # create a dictionary of generator present within the dataset attributes dataset_sampler_generators = {k: v for k, v in dataset_dict.items() if isinstance(v, (Generator, Iterator))} @@ -315,17 +318,31 @@ def _wrap_generator_samplers(self) -> None: if isinstance(generator, Sampler): continue - # wrap the generator into a `FastForwardSampler` - sampler = FastForwardSampler(generator, attr_name=generator_attr_name) + # used to handle a weird behaviour from PyTorch 1.6 + # where the sampler is converted to a list_iterator + is_legacy = False + + if isinstance(generator, Generator): + # Generator name have the the form `SamplerName.__iter__` + generator_name = generator.__qualname__.split(".")[0] + else: + # assume the retrieved iterator is coming from sampler. + is_legacy = True + + # validate the base generator name matches a sampler name. + if is_legacy or any(sampler_name == generator_name for sampler_name in samplers_names): + + # wrap the generator into a `FastForwardSampler` + sampler = FastForwardSampler(generator, attr_name=generator_attr_name) - # if `CaptureIterableDataset` was available, the sampler should reload its own state. 
- if self._state_dict is not None: - sampler.load_state_dict(self._state_dict[generator_attr_name]) - # store the samplers - self.samplers[generator_attr_name] = sampler + # if `CaptureIterableDataset` was available, the sampler should reload its own state. + if self._state_dict is not None: + sampler.load_state_dict(self._state_dict[generator_attr_name]) + # store the samplers + self.samplers[generator_attr_name] = sampler - # replace generator with the generator from the `FastForwardSampler`. - dataset_dict[generator_attr_name] = iter(sampler) + # replace generator with the generator from the `FastForwardSampler`. + dataset_dict[generator_attr_name] = iter(sampler) self.reset_on_epoch() diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py index 2c9eb1f768d3c..9b40f6d69cfad 100644 --- a/pytorch_lightning/utilities/cloud_io.py +++ b/pytorch_lightning/utilities/cloud_io.py @@ -19,6 +19,7 @@ import fsspec import torch from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem +from packaging.version import Version def load( @@ -58,6 +59,12 @@ def atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None: """ bytesbuffer = io.BytesIO() - torch.save(checkpoint, bytesbuffer) + # Can't use the new zipfile serialization for 1.6.0 because there's a bug in + # torch.hub.load_state_dict_from_url() that prevents it from loading the new files. + # More details can be found here: https://github.com/pytorch/pytorch/issues/42239 + if Version(torch.__version__).release[:3] == (1, 6, 0): + torch.save(checkpoint, bytesbuffer, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, bytesbuffer) with fsspec.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 5db24fe0f5cff..edf5f75aee6a9 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -70,6 +70,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 +_TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0") _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") @@ -111,4 +112,4 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: # experimental feature within PyTorch Lightning. 
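# --- Illustrative sketch, not part of the hunk above: the `atomic_save` guard in
# `cloud_io.py` compares `Version(torch.__version__).release[:3]` so that local
# build tags such as "+cu101" cannot defeat the 1.6.0 check:
from packaging.version import Version

assert Version("1.6.0+cu101").release[:3] == (1, 6, 0)  # still detected as 1.6.0
assert Version("1.7.1").release[:3] != (1, 6, 0)        # newer torch keeps the zipfile format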
def _fault_tolerant_training() -> bool: - return bool(int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0))) + return _TORCH_GREATER_EQUAL_1_7 and int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0)) diff --git a/pytorch_lightning/utilities/seed.py b/pytorch_lightning/utilities/seed.py index e8fc243f484f8..3b20c53353411 100644 --- a/pytorch_lightning/utilities/seed.py +++ b/pytorch_lightning/utilities/seed.py @@ -21,7 +21,7 @@ import numpy as np import torch -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, rank_zero_warn from pytorch_lightning.utilities.distributed import rank_zero_only log = logging.getLogger(__name__) @@ -113,7 +113,9 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: np.random.seed(ss.generate_state(4)) # Spawn distinct SeedSequences for the PyTorch PRNG and the stdlib random module torch_ss, stdlib_ss = ss.spawn(2) - torch.manual_seed(torch_ss.generate_state(1, dtype=np.uint64)[0]) + # PyTorch 1.7 and above takes a 64-bit seed + dtype = np.uint64 if _TORCH_GREATER_EQUAL_1_7 else np.uint32 + torch.manual_seed(torch_ss.generate_state(1, dtype=dtype)[0]) # use 128 bits expressed as an integer stdlib_seed = (stdlib_ss.generate_state(2, dtype=np.uint64).astype(object) * [1 << 64, 1]).sum() random.seed(stdlib_seed) diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py index e3dfb9b6a7edf..fa2ee767bdc8c 100644 --- a/tests/callbacks/test_quantization.py +++ b/tests/callbacks/test_quantization.py @@ -31,7 +31,7 @@ if _TORCH_GREATER_EQUAL_1_8: from torch.quantization import FakeQuantizeBase else: - # For torch 1.7. + # For torch 1.6 and 1.7. from torch.quantization import FakeQuantize as FakeQuantizeBase diff --git a/tests/conftest.py b/tests/conftest.py index 3d5548b7bd0ae..860f9357e4636 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,7 @@ import torch.distributed from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8 from tests import _PATH_DATASETS @@ -95,8 +95,10 @@ def reset_deterministic_algorithm(): yield if _TORCH_GREATER_EQUAL_1_8: torch.use_deterministic_algorithms(False) - else: + elif _TORCH_GREATER_EQUAL_1_7: torch.set_deterministic(False) + else: # the minimum version Lightning supports is PyTorch 1.6 + torch._set_deterministic(False) @pytest.fixture diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 9ec2f150ac5d4..12fe7f2fb4652 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -33,7 +33,7 @@ ResultCollection, ResultMetric, ) -from pytorch_lightning.utilities.imports import _fault_tolerant_training +from pytorch_lightning.utilities.imports import _fault_tolerant_training, _TORCH_GREATER_EQUAL_1_7 from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -470,18 +470,21 @@ def on_epoch_end(self) -> None: @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload(tmpdir): result_collection_reload(default_root_dir=tmpdir) @RunIf(min_gpus=1) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@pytest.mark.skipif(not 
_TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload_1_gpu_ddp(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=1) @RunIf(min_gpus=2, special=True) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") def test_result_collection_reload_2_gpus(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=2) diff --git a/tests/helpers/datamodules.py b/tests/helpers/datamodules.py index 78e806b37937e..0cb178a749a09 100644 --- a/tests/helpers/datamodules.py +++ b/tests/helpers/datamodules.py @@ -46,6 +46,7 @@ def prepare_data(self): self.dataset_cls(self.data_dir, train=False, download=True) def setup(self, stage: Optional[str] = None): + # TODO: need to split using random_split once updated to torch >= 1.6 if stage == "fit" or stage is None: self.mnist_train = self.dataset_cls(self.data_dir, train=True) if stage == "test" or stage is None: diff --git a/tests/loops/optimization/test_optimizer_loop.py b/tests/loops/optimization/test_optimizer_loop.py index ae77c4387a398..7e17cbbd56645 100644 --- a/tests/loops/optimization/test_optimizer_loop.py +++ b/tests/loops/optimization/test_optimizer_loop.py @@ -24,6 +24,7 @@ from pytorch_lightning.loops.optimization.optimizer_loop import ClosureResult from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel +from tests.helpers.runif import RunIf def test_closure_result_deepcopy(): @@ -139,6 +140,7 @@ class CustomException(Exception): pass +@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("stop_epoch", (0, 1)) @pytest.mark.parametrize("stop_batch", (0, 1, 2)) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index f6097864d65f3..dd390ab4939d5 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -253,6 +253,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: assert state_dict == {"state_dict": {"a": 1}, "progress": {"increment": 1}} +@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("stop_epoch", (1, 2)) @pytest.mark.parametrize("stop_batch", (1, 2)) @@ -322,6 +323,7 @@ def val_dataloader(self): assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected +@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3)) @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) @@ -524,6 +526,7 @@ def configure_optimizers_multiple(self): assert state_dict["epoch_progress"]["current"]["started"] == stop_epoch +@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) def test_loop_state_on_complete_run(n_optimizers, tmpdir): @@ -659,6 +662,7 @@ def train_dataloader(self): assert checkpoint["loops"]["fit_loop"] == expected +@RunIf(min_torch="1.7.0") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) def test_fit_loop_reset(tmpdir): """Test that the reset logic in fit- and epoch loop is aware of whether the loop is restarting from a completed @@ -748,6 +752,7 @@ def test_fit_loop_reset(tmpdir): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( 
["train_datasets", "val_datasets"], [([RandomDataset], [RandomDataset]), ([RandomDataset], [RandomDataset, RandomDataset])], diff --git a/tests/plugins/test_double_plugin.py b/tests/plugins/test_double_plugin.py index b3fdf87428522..cadd02c692af5 100644 --- a/tests/plugins/test_double_plugin.py +++ b/tests/plugins/test_double_plugin.py @@ -20,6 +20,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.plugins import DoublePrecisionPlugin +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from tests.helpers.boring_model import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -136,7 +137,10 @@ def on_fit_start(self): [ DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward, - DoublePrecisionBoringModelComplexBuffer, + pytest.param( + DoublePrecisionBoringModelComplexBuffer, + marks=pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="torch.complex not available"), + ), ], ) def test_double_precision(tmpdir, boring_model): diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index faf00f8890783..7369ab9a4a140 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -26,6 +26,7 @@ from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.profiler import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler from pytorch_lightning.profiler.pytorch import RegisterRecordFunction +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from tests.helpers import BoringModel, ManualOptimBoringModel @@ -393,7 +394,8 @@ def test_pytorch_profiler_nested(tmpdir): names = {"a", "b", "c"} ops = {"add", "empty", "fill_", "ones", "zero_", "zeros"} - ops = {"aten::" + op for op in ops} + if _TORCH_GREATER_EQUAL_1_7: + ops = {"aten::" + op for op in ops} expected = names.union(ops) assert events_name == expected, (events_name, torch.__version__, platform.system()) diff --git a/tests/trainer/connectors/test_checkpoint_connector.py b/tests/trainer/connectors/test_checkpoint_connector.py index 4a42265eb21b0..6b408845ed879 100644 --- a/tests/trainer/connectors/test_checkpoint_connector.py +++ b/tests/trainer/connectors/test_checkpoint_connector.py @@ -21,6 +21,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerFn from tests.helpers import BoringModel +from tests.helpers.runif import RunIf class HPCHookdedModel(BoringModel): @@ -132,6 +133,7 @@ def test_hpc_max_ckpt_version(tmpdir): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@RunIf(min_torch="1.7.0") def test_loops_restore(tmpdir): """Test that required loop state_dict is loaded correctly by checkpoint connector.""" model = BoringModel() diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py index aa5407e2f1228..3da8c100fe40c 100644 --- a/tests/trainer/connectors/test_signal_connector.py +++ b/tests/trainer/connectors/test_signal_connector.py @@ -26,7 +26,7 @@ @pytest.mark.parametrize("register_handler", [False, True]) @pytest.mark.parametrize("terminate_gracefully", [False, True]) -@RunIf(skip_windows=True) +@RunIf(min_torch="1.7.0", skip_windows=True) def test_fault_tolerant_sig_handler(register_handler, terminate_gracefully, tmpdir): # hack to reset the signal diff --git a/tests/trainer/test_data_loading.py 
b/tests/trainer/test_data_loading.py index 97097b2074ca1..0f6abd38e6836 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -26,7 +26,7 @@ from tests.helpers.runif import RunIf -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, min_torch="1.7.0") @pytest.mark.parametrize("mode", (1, 2, 3)) def test_replace_distributed_sampler(tmpdir, mode): class IndexedRandomDataset(RandomDataset): diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py index 1518779bcc7ef..204f3079f544b 100644 --- a/tests/trainer/test_supporters.py +++ b/tests/trainer/test_supporters.py @@ -34,6 +34,7 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForwardSampler from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 def test_tensor_running_accum_reset(): @@ -309,6 +310,7 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): assert calculated_length == expected_length +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7") @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_GPUS": "2"}) @mock.patch("torch.cuda.device_count", return_value=2) @mock.patch("torch.cuda.is_available", return_value=True) diff --git a/tests/utilities/test_auto_restart.py b/tests/utilities/test_auto_restart.py index b36a9d1d76941..4e3385cebecbc 100644 --- a/tests/utilities/test_auto_restart.py +++ b/tests/utilities/test_auto_restart.py @@ -690,6 +690,7 @@ def create_dataloader(): } +@RunIf(min_torch="1.7.0") @pytest.mark.parametrize("use_fault_tolerant", ["0", "1"]) def test_data_loading_wraps_dataset_and_samplers(use_fault_tolerant, tmpdir): """This test ensures the dataset and sampler are properly wrapped when fault tolerant is enabled.""" @@ -784,6 +785,7 @@ def __len__(self): # TODO: test with `RandomGeneratorGetItemDataset` @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( "dataset_class", [ @@ -919,6 +921,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_ @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( "dataset_classes", [ @@ -972,6 +975,7 @@ def test_dataset_rng_states_restart_with_lightning(tmpdir, dataset_classes, mult @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@RunIf(min_torch="1.7.0") @pytest.mark.parametrize( ["train_datasets", "val_datasets"], [ @@ -1135,7 +1139,7 @@ def _fit_model( @pytest.mark.parametrize("failure_on_training", [False, True]) @pytest.mark.parametrize("failure_on_step", [False, True]) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@RunIf(skip_windows=True) +@RunIf(min_torch="1.7.0", skip_windows=True) def test_auto_restart_under_signal(on_last_batch, val_check_interval, failure_on_training, failure_on_step, tmpdir): """This test asserts that if a signal is being sent during the training / validation phase, the model should restart in a reproducible way.""" From 9d90e555a1de91889e86d4f55e233191c150b614 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sat, 6 Nov 2021 19:42:13 +0100 Subject: [PATCH 13/16] Revert "Undo test changes" This reverts commit 3a6acadd115e86f02d83a788f1978372ab6764f3. 
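The `@RunIf(min_torch="1.7.0")` and `skipif(not _TORCH_GREATER_EQUAL_1_7)` markers added to the tests above all reduce to a version-gated pytest skip. A minimal, self-contained sketch of that gate — an assumption about intent, not the project's actual `RunIf` implementation:

import pytest
import torch
from packaging.version import Version

def requires_torch(minimum: str):
    # Skip the decorated test when the installed torch is older than `minimum`.
    current = Version(Version(torch.__version__).base_version)
    return pytest.mark.skipif(current < Version(minimum), reason=f"Requires torch >= {minimum}")

@requires_torch("1.7.0")
def test_needs_recent_torch():
    assert Version(Version(torch.__version__).base_version) >= Version("1.7.0")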
--- pytorch_lightning/callbacks/quantization.py | 2 +- pytorch_lightning/distributed/dist.py | 5 +- .../overrides/torch_distributed.py | 99 ------------------- .../plugins/training_type/ddp.py | 15 +-- .../plugins/training_type/ddp_spawn.py | 16 ++- .../connectors/accelerator_connector.py | 5 +- pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/auto_restart.py | 35 ++----- pytorch_lightning/utilities/cloud_io.py | 9 +- pytorch_lightning/utilities/imports.py | 3 +- pytorch_lightning/utilities/seed.py | 6 +- tests/callbacks/test_quantization.py | 2 +- tests/conftest.py | 6 +- tests/core/test_metric_result_integration.py | 5 +- tests/helpers/datamodules.py | 1 - .../loops/optimization/test_optimizer_loop.py | 2 - tests/loops/test_loops.py | 5 - tests/plugins/test_double_plugin.py | 6 +- tests/profiler/test_profiler.py | 4 +- .../connectors/test_checkpoint_connector.py | 2 - .../connectors/test_signal_connector.py | 2 +- tests/trainer/test_data_loading.py | 2 +- tests/trainer/test_supporters.py | 2 - tests/utilities/test_auto_restart.py | 6 +- 24 files changed, 38 insertions(+), 203 deletions(-) delete mode 100644 pytorch_lightning/overrides/torch_distributed.py diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index ca82a574f71d1..42f0d575ffb6f 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -28,7 +28,7 @@ if _TORCH_GREATER_EQUAL_1_8: from torch.quantization import FakeQuantizeBase else: - # For torch 1.6 and 1.7. + # For torch 1.7. from torch.quantization import FakeQuantize as FakeQuantizeBase import pytorch_lightning as pl diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 082e0c617a5f7..a0054d17936b0 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,7 +13,8 @@ # limitations under the License. from typing import Any -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list +import torch.distributed + from pytorch_lightning.utilities import rank_zero_deprecation from pytorch_lightning.utilities.distributed import group as _group @@ -40,6 +41,6 @@ def broadcast(self, obj: Any, group=_group.WORLD): if self.rank != 0: obj = [None] * len(obj) - broadcast_object_list(obj, 0, group=group or _group.WORLD) + torch.distributed.broadcast_object_list(obj, 0, group=group or _group.WORLD) return obj[0] diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py deleted file mode 100644 index 3cbbe5ea760ff..0000000000000 --- a/pytorch_lightning/overrides/torch_distributed.py +++ /dev/null @@ -1,99 +0,0 @@ -import logging -import pickle - -import torch - -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 - -log = logging.getLogger(__name__) - -if torch.distributed.is_available(): - from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember - -# The code underneath is taken from PyTorch `torch/distributed/distributed_c10d.py` -# and enable broadcasting for PyTorch 1.6 and lower. 
- - -# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160 -def _rank_not_in_group(group): - """Helper that checks if the current process's rank is not in a given group.""" - if group is None: - return False - return group == GroupMember.NON_GROUP_MEMBER - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 -def _object_to_tensor(obj): - buffer = pickle.dumps(obj) - byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] - byte_tensor = torch.ByteTensor(byte_storage) - local_size = torch.LongTensor([byte_tensor.numel()]) - return byte_tensor, local_size - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py -def _tensor_to_object(tensor, tensor_size): - buf = tensor.numpy().tobytes()[:tensor_size] - out = pickle.loads(buf) - return out - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 -def _broadcast_object_list(object_list, src=0, group=None): - if _rank_not_in_group(group): - return - - my_rank = get_rank() - # Serialize object_list elements to tensors on src rank. - if my_rank == src: - tensor_list, size_list = zip(*(_object_to_tensor(obj) for obj in object_list)) - object_sizes_tensor = torch.cat(size_list) - else: - object_sizes_tensor = torch.LongTensor(len(object_list)) - - group_backend = get_backend(group) - is_nccl_backend = group_backend == Backend.NCCL - current_device = torch.device("cpu") - if is_nccl_backend: - # See note about using torch.cuda.current_device() here in docstring. - # We cannot simply use my_rank since rank == device is not necessarily - # true. - current_device = torch.device("cuda", torch.cuda.current_device()) - object_sizes_tensor = object_sizes_tensor.to(current_device) - object_sizes_tensor = object_sizes_tensor.to(current_device) - - # Broadcast object sizes - broadcast(object_sizes_tensor, src=src, group=group) - - # Concatenate and broadcast serialized object tensors - if my_rank == src: - object_tensor = torch.cat(tensor_list) - else: - object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) - - if is_nccl_backend: - object_tensor = object_tensor.to(current_device) - - broadcast(object_tensor, src=src, group=group) - - # Deserialize objects using their stored sizes. - offset = 0 - if my_rank != src: - for i, obj_size in enumerate(object_sizes_tensor): - obj_view = object_tensor[offset : offset + obj_size] - obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] - offset += obj_size - object_list[i] = _tensor_to_object(obj_view, obj_size) - - -if not torch.distributed.is_available(): - # avoid failures on early PyTorch versions for Windows where - # not all functions used in `broadcast_object_list` are available. 
- def _broadcast_noop(obj, *_, **__): - return obj - - broadcast_object_list = _broadcast_noop -elif _TORCH_GREATER_EQUAL_1_8: - from torch.distributed.distributed_c10d import broadcast_object_list -else: - broadcast_object_list = _broadcast_object_list diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 53aff5128ff37..8cddac2021ae7 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -34,7 +34,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -43,7 +42,6 @@ _FAIRSCALE_AVAILABLE, _HYDRA_AVAILABLE, _IS_WINDOWS, - _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, @@ -287,15 +285,12 @@ def pre_configure_ddp(self): # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization - if ( - _TORCH_GREATER_EQUAL_1_7 - and not self.lightning_module.automatic_optimization - and not self._ddp_kwargs.get("find_unused_parameters", False) + if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False ): + # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization rank_zero_warn( - "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " - "to properly work with DDP." + "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP." 
) self._ddp_kwargs["find_unused_parameters"] = True @@ -398,7 +393,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - broadcast_object_list(obj, src, group=_group.WORLD) + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def pre_backward(self, closure_loss: torch.Tensor) -> None: diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 5e04ca95743eb..8609b5614ba9c 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -27,12 +27,11 @@ import pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, rank_zero_warn +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -246,15 +245,12 @@ def pre_configure_ddp(self): # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible. self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True) - # todo: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization - if ( - _TORCH_GREATER_EQUAL_1_7 - and not self.lightning_module.automatic_optimization - and not self._ddp_kwargs.get("find_unused_parameters", False) + if not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False ): + # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization rank_zero_warn( - "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " - "to properly work with DDP." + "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP." 
) self._ddp_kwargs["find_unused_parameters"] = True @@ -331,7 +327,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = [obj] if self.global_rank != src: obj = [None] - broadcast_object_list(obj, src, group=_group.WORLD) + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def model_to_device(self): diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index e15f7bb853db8..43eb65ce21a22 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -74,7 +74,6 @@ from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, _IPU_AVAILABLE, - _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TPU_AVAILABLE, ) @@ -190,10 +189,8 @@ def _init_deterministic(self, deterministic: bool) -> None: self.deterministic = deterministic if _TORCH_GREATER_EQUAL_1_8: torch.use_deterministic_algorithms(deterministic) - elif _TORCH_GREATER_EQUAL_1_7: + else: torch.set_deterministic(deterministic) - else: # the minimum version Lightning supports is PyTorch 1.6 - torch._set_deterministic(deterministic) if deterministic: # fixing non-deterministic part of horovod # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 158d7356c91ce..7343e28d6d811 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -44,7 +44,6 @@ _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, _RICH_AVAILABLE, - _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/utilities/auto_restart.py b/pytorch_lightning/utilities/auto_restart.py index f0b50103cf2f2..ef52717636d90 100644 --- a/pytorch_lightning/utilities/auto_restart.py +++ b/pytorch_lightning/utilities/auto_restart.py @@ -305,9 +305,6 @@ def _wrap_generator_samplers(self) -> None: # access wrapped dataset attributes dataset_dict = self.dataset.__dict__ - # create a tuple of sampler names - samplers_names = tuple(v.__class__.__name__ for k, v in dataset_dict.items() if isinstance(v, Sampler)) - # create a dictionary of generator present within the dataset attributes dataset_sampler_generators = {k: v for k, v in dataset_dict.items() if isinstance(v, (Generator, Iterator))} @@ -318,31 +315,17 @@ def _wrap_generator_samplers(self) -> None: if isinstance(generator, Sampler): continue - # used to handle a weird behaviour from PyTorch 1.6 - # where the sampler is converted to a list_iterator - is_legacy = False - - if isinstance(generator, Generator): - # Generator name have the the form `SamplerName.__iter__` - generator_name = generator.__qualname__.split(".")[0] - else: - # assume the retrieved iterator is coming from sampler. - is_legacy = True - - # validate the base generator name matches a sampler name. - if is_legacy or any(sampler_name == generator_name for sampler_name in samplers_names): - - # wrap the generator into a `FastForwardSampler` - sampler = FastForwardSampler(generator, attr_name=generator_attr_name) + # wrap the generator into a `FastForwardSampler` + sampler = FastForwardSampler(generator, attr_name=generator_attr_name) - # if `CaptureIterableDataset` was available, the sampler should reload its own state. 
- if self._state_dict is not None: - sampler.load_state_dict(self._state_dict[generator_attr_name]) - # store the samplers - self.samplers[generator_attr_name] = sampler + # if `CaptureIterableDataset` was available, the sampler should reload its own state. + if self._state_dict is not None: + sampler.load_state_dict(self._state_dict[generator_attr_name]) + # store the samplers + self.samplers[generator_attr_name] = sampler - # replace generator with the generator from the `FastForwardSampler`. - dataset_dict[generator_attr_name] = iter(sampler) + # replace generator with the generator from the `FastForwardSampler`. + dataset_dict[generator_attr_name] = iter(sampler) self.reset_on_epoch() diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py index 9b40f6d69cfad..2c9eb1f768d3c 100644 --- a/pytorch_lightning/utilities/cloud_io.py +++ b/pytorch_lightning/utilities/cloud_io.py @@ -19,7 +19,6 @@ import fsspec import torch from fsspec.implementations.local import AbstractFileSystem, LocalFileSystem -from packaging.version import Version def load( @@ -59,12 +58,6 @@ def atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None: """ bytesbuffer = io.BytesIO() - # Can't use the new zipfile serialization for 1.6.0 because there's a bug in - # torch.hub.load_state_dict_from_url() that prevents it from loading the new files. - # More details can be found here: https://github.com/pytorch/pytorch/issues/42239 - if Version(torch.__version__).release[:3] == (1, 6, 0): - torch.save(checkpoint, bytesbuffer, _use_new_zipfile_serialization=False) - else: - torch.save(checkpoint, bytesbuffer) + torch.save(checkpoint, bytesbuffer) with fsspec.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index edf5f75aee6a9..5db24fe0f5cff 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -70,7 +70,6 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 -_TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0") _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") @@ -112,4 +111,4 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: # experimental feature within PyTorch Lightning. 
 def _fault_tolerant_training() -> bool:
-    return _TORCH_GREATER_EQUAL_1_7 and int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0))
+    return bool(int(os.getenv("PL_FAULT_TOLERANT_TRAINING", 0)))
diff --git a/pytorch_lightning/utilities/seed.py b/pytorch_lightning/utilities/seed.py
index 3b20c53353411..e8fc243f484f8 100644
--- a/pytorch_lightning/utilities/seed.py
+++ b/pytorch_lightning/utilities/seed.py
@@ -21,7 +21,7 @@
 import numpy as np
 import torch
 
-from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, rank_zero_warn
+from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
 log = logging.getLogger(__name__)
@@ -113,9 +113,7 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:
     np.random.seed(ss.generate_state(4))
     # Spawn distinct SeedSequences for the PyTorch PRNG and the stdlib random module
     torch_ss, stdlib_ss = ss.spawn(2)
-    # PyTorch 1.7 and above takes a 64-bit seed
-    dtype = np.uint64 if _TORCH_GREATER_EQUAL_1_7 else np.uint32
-    torch.manual_seed(torch_ss.generate_state(1, dtype=dtype)[0])
+    torch.manual_seed(torch_ss.generate_state(1, dtype=np.uint64)[0])
     # use 128 bits expressed as an integer
     stdlib_seed = (stdlib_ss.generate_state(2, dtype=np.uint64).astype(object) * [1 << 64, 1]).sum()
     random.seed(stdlib_seed)
diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py
index fa2ee767bdc8c..e3dfb9b6a7edf 100644
--- a/tests/callbacks/test_quantization.py
+++ b/tests/callbacks/test_quantization.py
@@ -31,7 +31,7 @@
 if _TORCH_GREATER_EQUAL_1_8:
     from torch.quantization import FakeQuantizeBase
 else:
-    # For torch 1.6 and 1.7.
+    # For torch 1.7.
     from torch.quantization import FakeQuantize as FakeQuantizeBase
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 860f9357e4636..3d5548b7bd0ae 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -22,7 +22,7 @@
 import torch.distributed
 
 from pytorch_lightning.plugins.environments.lightning_environment import find_free_network_port
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
 from tests import _PATH_DATASETS
 
 
@@ -95,10 +95,8 @@ def reset_deterministic_algorithm():
     yield
     if _TORCH_GREATER_EQUAL_1_8:
         torch.use_deterministic_algorithms(False)
-    elif _TORCH_GREATER_EQUAL_1_7:
+    else:
         torch.set_deterministic(False)
-    else:  # the minimum version Lightning supports is PyTorch 1.6
-        torch._set_deterministic(False)
 
 
 @pytest.fixture
diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py
index 12fe7f2fb4652..9ec2f150ac5d4 100644
--- a/tests/core/test_metric_result_integration.py
+++ b/tests/core/test_metric_result_integration.py
@@ -33,7 +33,7 @@
     ResultCollection,
     ResultMetric,
 )
-from pytorch_lightning.utilities.imports import _fault_tolerant_training, _TORCH_GREATER_EQUAL_1_7
+from pytorch_lightning.utilities.imports import _fault_tolerant_training
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -470,21 +470,18 @@ def on_epoch_end(self) -> None:
 
 
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7")
 def test_result_collection_reload(tmpdir):
     result_collection_reload(default_root_dir=tmpdir)
 
 
 @RunIf(min_gpus=1)
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7")
 def test_result_collection_reload_1_gpu_ddp(tmpdir):
     result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=1)
 
 
 @RunIf(min_gpus=2, special=True)
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7")
 def test_result_collection_reload_2_gpus(tmpdir):
     result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=2)
 
diff --git a/tests/helpers/datamodules.py b/tests/helpers/datamodules.py
index 0cb178a749a09..78e806b37937e 100644
--- a/tests/helpers/datamodules.py
+++ b/tests/helpers/datamodules.py
@@ -46,7 +46,6 @@ def prepare_data(self):
         self.dataset_cls(self.data_dir, train=False, download=True)
 
     def setup(self, stage: Optional[str] = None):
-        # TODO: need to split using random_split once updated to torch >= 1.6
         if stage == "fit" or stage is None:
             self.mnist_train = self.dataset_cls(self.data_dir, train=True)
         if stage == "test" or stage is None:
diff --git a/tests/loops/optimization/test_optimizer_loop.py b/tests/loops/optimization/test_optimizer_loop.py
index 7e17cbbd56645..ae77c4387a398 100644
--- a/tests/loops/optimization/test_optimizer_loop.py
+++ b/tests/loops/optimization/test_optimizer_loop.py
@@ -24,7 +24,6 @@
 from pytorch_lightning.loops.optimization.optimizer_loop import ClosureResult
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
-from tests.helpers.runif import RunIf
 
 
 def test_closure_result_deepcopy():
@@ -140,7 +139,6 @@ class CustomException(Exception):
     pass
 
 
-@RunIf(min_torch="1.7.0")
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
 @pytest.mark.parametrize("stop_epoch", (0, 1))
 @pytest.mark.parametrize("stop_batch", (0, 1, 2))
diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py
index dd390ab4939d5..f6097864d65f3 100644
--- a/tests/loops/test_loops.py
+++ b/tests/loops/test_loops.py
@@ -253,7 +253,6 @@ def on_load_checkpoint(self, state_dict: Dict) -> None:
     assert state_dict == {"state_dict": {"a": 1}, "progress": {"increment": 1}}
 
 
-@RunIf(min_torch="1.7.0")
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
 @pytest.mark.parametrize("stop_epoch", (1, 2))
 @pytest.mark.parametrize("stop_batch", (1, 2))
@@ -323,7 +322,6 @@ def val_dataloader(self):
     assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected
 
 
-@RunIf(min_torch="1.7.0")
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
 @pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3))
 @pytest.mark.parametrize("n_optimizers", (1, 3, 5))
@@ -526,7 +524,6 @@ def configure_optimizers_multiple(self):
     assert state_dict["epoch_progress"]["current"]["started"] == stop_epoch
 
 
-@RunIf(min_torch="1.7.0")
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
 @pytest.mark.parametrize("n_optimizers", (1, 3, 5))
 def test_loop_state_on_complete_run(n_optimizers, tmpdir):
@@ -662,7 +659,6 @@ def train_dataloader(self):
     assert checkpoint["loops"]["fit_loop"] == expected
 
 
-@RunIf(min_torch="1.7.0")
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
 def test_fit_loop_reset(tmpdir):
     """Test that the reset logic in fit- and epoch loop is aware of whether the loop is restarting from a completed
@@ -752,7 +748,6 @@ def test_fit_loop_reset(tmpdir):
 
 
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@RunIf(min_torch="1.7.0")
 @pytest.mark.parametrize(
     ["train_datasets", "val_datasets"],
     [([RandomDataset], [RandomDataset]), ([RandomDataset], [RandomDataset, RandomDataset])],
diff --git a/tests/plugins/test_double_plugin.py b/tests/plugins/test_double_plugin.py
index cadd02c692af5..b3fdf87428522 100644
--- a/tests/plugins/test_double_plugin.py
+++ b/tests/plugins/test_double_plugin.py
@@ -20,7 +20,6 @@
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import DoublePrecisionPlugin
-from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7
 from tests.helpers.boring_model import BoringModel, RandomDataset
 from tests.helpers.runif import RunIf
 
@@ -137,10 +136,7 @@ def on_fit_start(self):
     [
         DoublePrecisionBoringModel,
         DoublePrecisionBoringModelNoForward,
-        pytest.param(
-            DoublePrecisionBoringModelComplexBuffer,
-            marks=pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="torch.complex not available"),
-        ),
+        DoublePrecisionBoringModelComplexBuffer,
     ],
 )
 def test_double_precision(tmpdir, boring_model):
diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py
index 7369ab9a4a140..faf00f8890783 100644
--- a/tests/profiler/test_profiler.py
+++ b/tests/profiler/test_profiler.py
@@ -26,7 +26,6 @@
 from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
 from pytorch_lightning.profiler import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler
 from pytorch_lightning.profiler.pytorch import RegisterRecordFunction
-from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE
 from tests.helpers import BoringModel, ManualOptimBoringModel
@@ -394,8 +393,7 @@ def test_pytorch_profiler_nested(tmpdir):
 
     names = {"a", "b", "c"}
     ops = {"add", "empty", "fill_", "ones", "zero_", "zeros"}
-    if _TORCH_GREATER_EQUAL_1_7:
-        ops = {"aten::" + op for op in ops}
+    ops = {"aten::" + op for op in ops}
 
     expected = names.union(ops)
     assert events_name == expected, (events_name, torch.__version__, platform.system())
diff --git a/tests/trainer/connectors/test_checkpoint_connector.py b/tests/trainer/connectors/test_checkpoint_connector.py
index 6b408845ed879..4a42265eb21b0 100644
--- a/tests/trainer/connectors/test_checkpoint_connector.py
+++ b/tests/trainer/connectors/test_checkpoint_connector.py
@@ -21,7 +21,6 @@
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.trainer.states import TrainerFn
 from tests.helpers import BoringModel
-from tests.helpers.runif import RunIf
 
 
 class HPCHookdedModel(BoringModel):
@@ -133,7 +132,6 @@ def test_hpc_max_ckpt_version(tmpdir):
 
 
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@RunIf(min_torch="1.7.0")
 def test_loops_restore(tmpdir):
     """Test that required loop state_dict is loaded correctly by checkpoint connector."""
     model = BoringModel()
diff --git a/tests/trainer/connectors/test_signal_connector.py b/tests/trainer/connectors/test_signal_connector.py
index 3da8c100fe40c..aa5407e2f1228 100644
--- a/tests/trainer/connectors/test_signal_connector.py
+++ b/tests/trainer/connectors/test_signal_connector.py
@@ -26,7 +26,7 @@
 
 @pytest.mark.parametrize("register_handler", [False, True])
 @pytest.mark.parametrize("terminate_gracefully", [False, True])
-@RunIf(min_torch="1.7.0", skip_windows=True)
+@RunIf(skip_windows=True)
 def test_fault_tolerant_sig_handler(register_handler, terminate_gracefully, tmpdir):
 
     # hack to reset the signal
diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py
index 0f6abd38e6836..97097b2074ca1 100644
--- a/tests/trainer/test_data_loading.py
+++ b/tests/trainer/test_data_loading.py
@@ -26,7 +26,7 @@
 from tests.helpers.runif import RunIf
 
 
-@RunIf(skip_windows=True, min_torch="1.7.0")
+@RunIf(skip_windows=True)
 @pytest.mark.parametrize("mode", (1, 2, 3))
 def test_replace_distributed_sampler(tmpdir, mode):
     class IndexedRandomDataset(RandomDataset):
diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py
index 204f3079f544b..1518779bcc7ef 100644
--- a/tests/trainer/test_supporters.py
+++ b/tests/trainer/test_supporters.py
@@ -34,7 +34,6 @@
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.auto_restart import CaptureMapDataset, FastForwardSampler
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7
 
 
 def test_tensor_running_accum_reset():
@@ -310,7 +309,6 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length):
     assert calculated_length == expected_length
 
 
-@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_7, reason="Requires at least PyTorch 1.7")
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_GPUS": "2"})
 @mock.patch("torch.cuda.device_count", return_value=2)
 @mock.patch("torch.cuda.is_available", return_value=True)
diff --git a/tests/utilities/test_auto_restart.py b/tests/utilities/test_auto_restart.py
index 4e3385cebecbc..b36a9d1d76941 100644
--- a/tests/utilities/test_auto_restart.py
+++ b/tests/utilities/test_auto_restart.py
@@ -690,7 +690,6 @@ def create_dataloader():
     }
 
 
-@RunIf(min_torch="1.7.0")
 @pytest.mark.parametrize("use_fault_tolerant", ["0", "1"])
 def test_data_loading_wraps_dataset_and_samplers(use_fault_tolerant, tmpdir):
     """This test ensures the dataset and sampler are properly wrapped when fault tolerant is enabled."""
@@ -785,7 +784,6 @@ def __len__(self):
 
 # TODO: test with `RandomGeneratorGetItemDataset`
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@RunIf(min_torch="1.7.0")
 @pytest.mark.parametrize(
     "dataset_class",
     [
@@ -921,7 +919,6 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_
 
 
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@RunIf(min_torch="1.7.0")
 @pytest.mark.parametrize(
     "dataset_classes",
     [
@@ -975,7 +972,6 @@ def test_dataset_rng_states_restart_with_lightning(tmpdir, dataset_classes, mult
 
 
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@RunIf(min_torch="1.7.0")
 @pytest.mark.parametrize(
     ["train_datasets", "val_datasets"],
     [
@@ -1139,7 +1135,7 @@ def _fit_model(
 @pytest.mark.parametrize("failure_on_training", [False, True])
 @pytest.mark.parametrize("failure_on_step", [False, True])
 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
-@RunIf(min_torch="1.7.0", skip_windows=True)
+@RunIf(skip_windows=True)
 def test_auto_restart_under_signal(on_last_batch, val_check_interval, failure_on_training, failure_on_step, tmpdir):
     """This test asserts that if a signal is being sent during the training / validation phase, the model should
     restart in a reproducible way."""

From 0c33decc4858bafa34a1a2d557e976d21d7e0088 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Mon, 8 Nov 2021 17:18:32 +0100
Subject: [PATCH 14/16] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 93537419d93b4..df7c34d627bcc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -89,6 +89,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed deprecated method `master_params` from PrecisionPlugin ([#10372](https://github.com/PyTorchLightning/pytorch-lightning/pull/10372))
 
+
 ### Fixed
 
 - Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/issues/10316))

From 126aedb900755188c0fe6b4da435c99306c1a6be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 11 Nov 2021 15:30:30 +0100
Subject: [PATCH 15/16] Apply suggestions from code review

Co-authored-by: Aki Nitta
---
 pytorch_lightning/plugins/training_type/ddp.py       | 1 +
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index be0e516d48f0d..36726cb977c85 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -259,6 +259,7 @@ def pre_configure_ddp(self):
             # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
             rank_zero_warn(
                 "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP."
+                " Using `find_unused_parameters=True`."
             )
             self._ddp_kwargs["find_unused_parameters"] = True
 
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 92a25c9f7c78c..8ba0d89932e62 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -243,6 +243,7 @@ def pre_configure_ddp(self):
             # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
             rank_zero_warn(
                 "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP."
+                " Using `find_unused_parameters=True`."
             )
             self._ddp_kwargs["find_unused_parameters"] = True
 

From 99cece8069ab366324c86e98226a9b3b4a1c3da8 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 11 Nov 2021 15:32:33 +0100
Subject: [PATCH 16/16] Fix message

---
 pytorch_lightning/plugins/training_type/ddp.py       | 4 ++--
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 36726cb977c85..84e9b55b9ee08 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -258,8 +258,8 @@ def pre_configure_ddp(self):
         ):
             # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
             rank_zero_warn(
-                "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP."
-                " Using `find_unused_parameters=True`."
+                "From PyTorch 1.7.0, Lightning `manual_optimization` needs to set `find_unused_parameters=True` to"
+                " properly work with DDP. Using `find_unused_parameters=True`."
             )
             self._ddp_kwargs["find_unused_parameters"] = True
 
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 8ba0d89932e62..677e031cd04af 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -242,8 +242,8 @@ def pre_configure_ddp(self):
         ):
             # TODO: PyTorch 1.7.0 DDP introduces `self.reducer._rebuild_buckets()` breaking manual_optimization
             rank_zero_warn(
-                "Lightning `manual_optimization` needs to set `find_unused_parameters=True` to properly work with DDP."
-                " Using `find_unused_parameters=True`."
+                "From PyTorch 1.7.0, Lightning `manual_optimization` needs to set `find_unused_parameters=True` to"
+                " properly work with DDP. Using `find_unused_parameters=True`."
             )
             self._ddp_kwargs["find_unused_parameters"] = True