
add accelerator.is_available() check #12104


Merged
merged 22 commits into master from acc-available-check on Mar 2, 2022
Changes from 12 commits

Commits (22)
3ffe5bf
first commit
jjenniferdai Feb 24, 2022
4c0f4a7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 24, 2022
9c642f9
update
jjenniferdai Feb 25, 2022
55b4995
doctest?
jjenniferdai Feb 25, 2022
729a8aa
codeblock instead
jjenniferdai Feb 25, 2022
15d386c
update msg
jjenniferdai Feb 25, 2022
d36d852
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 25, 2022
99b4ecf
Merge branch 'master' into acc-available-check
jjenniferdai Feb 28, 2022
c69bcf7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 28, 2022
2385855
update msg, hardware_name
jjenniferdai Feb 28, 2022
250dc2c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 28, 2022
d76b7e0
format
jjenniferdai Feb 28, 2022
cd920ac
format Update tests/trainer/properties/test_estimated_stepping_batche…
jjenniferdai Mar 2, 2022
1e031f4
format Update tests/plugins/test_cluster_integration.py
jjenniferdai Mar 2, 2022
4c57f8a
format Update tests/accelerators/test_ipu.py
jjenniferdai Mar 2, 2022
8b632b4
format Update tests/accelerators/test_accelerator_connector.py
jjenniferdai Mar 2, 2022
2193484
format Update tests/accelerators/test_accelerator_connector.py
jjenniferdai Mar 2, 2022
fadb69b
format Update tests/accelerators/test_accelerator_connector.py
jjenniferdai Mar 2, 2022
31a54eb
req base, update name, lowercase
jjenniferdai Mar 2, 2022
67301d6
update
jjenniferdai Mar 2, 2022
70e9f71
Address reviews
kaushikb11 Mar 2, 2022
65b23b6
Fix test
kaushikb11 Mar 2, 2022
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/cpu.py
@@ -62,3 +62,8 @@ def auto_device_count() -> int:
def is_available() -> bool:
"""CPU is always available for execution."""
return True

@staticmethod
def hardware_name() -> str:
"""Name of the hardware."""
return "CPU"
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/gpu.py
@@ -93,6 +93,11 @@ def auto_device_count() -> int:
def is_available() -> bool:
return torch.cuda.device_count() > 0

@staticmethod
def hardware_name() -> str:
"""Name of the hardware."""
return "GPU"


def get_nvidia_gpu_stats(device: _DEVICE) -> Dict[str, float]:
"""Get GPU stats including memory, fan speed, and temperature from nvidia-smi.
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/ipu.py
@@ -46,3 +46,8 @@ def auto_device_count() -> int:
@staticmethod
def is_available() -> bool:
return _IPU_AVAILABLE

@staticmethod
def hardware_name() -> str:
"""Name of the hardware."""
return "IPU"
5 changes: 5 additions & 0 deletions pytorch_lightning/accelerators/tpu.py
@@ -64,3 +64,8 @@ def auto_device_count() -> int:
@staticmethod
def is_available() -> bool:
return _TPU_AVAILABLE

@staticmethod
def hardware_name() -> str:
"""Name of the hardware."""
return "TPU"
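Taken together, the four hunks above give every built-in accelerator the same pair of static hooks. Below is a minimal sketch of how they compose, mirroring the registry the connector builds later in this diff; the helper name `report_available_accelerators` is illustrative and not part of this PR, and the import paths assume the module layout shown in these hunks:

from pytorch_lightning.accelerators.cpu import CPUAccelerator
from pytorch_lightning.accelerators.gpu import GPUAccelerator
from pytorch_lightning.accelerators.ipu import IPUAccelerator
from pytorch_lightning.accelerators.tpu import TPUAccelerator


def report_available_accelerators() -> dict:
    # Hypothetical helper: report which accelerators can run on this machine.
    accelerators = {
        "cpu": CPUAccelerator,
        "gpu": GPUAccelerator,
        "tpu": TPUAccelerator,
        "ipu": IPUAccelerator,
    }
    # Both hooks are @staticmethod, so no accelerator instance is needed.
    return {flag: (cls.hardware_name(), cls.is_available()) for flag, cls in accelerators.items()}

# On a CPU-only machine this would return something like
# {'cpu': ('CPU', True), 'gpu': ('GPU', False), 'tpu': ('TPU', False), 'ipu': ('IPU', False)}.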
22 changes: 15 additions & 7 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -455,16 +455,15 @@ def _choose_accelerator(self) -> str:
return "cpu"

def _set_parallel_devices_and_init_accelerator(self) -> None:
# TODO add device availability check
ACCELERATORS = {
"cpu": CPUAccelerator,
"gpu": GPUAccelerator,
"tpu": TPUAccelerator,
"ipu": IPUAccelerator,
}
if isinstance(self._accelerator_flag, Accelerator):
self.accelerator: Accelerator = self._accelerator_flag
else:
ACCELERATORS = {
"cpu": CPUAccelerator,
"gpu": GPUAccelerator,
"tpu": TPUAccelerator,
"ipu": IPUAccelerator,
}
assert self._accelerator_flag is not None
self._accelerator_flag = self._accelerator_flag.lower()
if self._accelerator_flag not in ACCELERATORS:
@@ -475,6 +474,15 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
accelerator_class = ACCELERATORS[self._accelerator_flag]
self.accelerator = accelerator_class() # type: ignore[abstract]

if not self.accelerator.is_available():
Contributor review comment: Depending on the accelerator, we might not be able to instantiate it without the hardware being available; the init should have the freedom to assume it is available or to raise additional errors. IMO the check should have been placed before the instantiation. (A sketch of that alternative ordering follows this diff.)

available_hardware = [acc_str for acc_str in list(ACCELERATORS) if ACCELERATORS[acc_str].is_available()]
raise MisconfigurationException(
f"{self.accelerator.__class__.__qualname__} can not run on this hardware"
f" since {self.accelerator.hardware_name()}s are not available."
" The following hardware is available and can be passed into"
f" `accelerator` argument of `Trainer`: {available_hardware}."
)

self._set_devices_flag_if_auto_passed()

self._gpus = self._devices_flag if not self._gpus else self._gpus
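Note that the availability check above runs only after `accelerator_class()` has already been instantiated, which is the concern raised in the review comment: an accelerator's `__init__` may itself assume the hardware exists. The following is a minimal sketch of the suggested reordering for the string-flag branch of `_set_parallel_devices_and_init_accelerator`, reusing the `ACCELERATORS` registry and `MisconfigurationException` from the diff. It illustrates the comment and is not code from this PR.

# Hypothetical reordering sketched from the review comment: validate availability
# first, so the constructor is free to assume the hardware exists.
accelerator_class = ACCELERATORS[self._accelerator_flag]
if not accelerator_class.is_available():  # static method, callable on the class
    available_hardware = [name for name, cls in ACCELERATORS.items() if cls.is_available()]
    raise MisconfigurationException(
        f"{accelerator_class.__qualname__} can not run on this hardware"
        f" since {accelerator_class.hardware_name()}s are not available."
        " The following hardware is available and can be passed into"
        f" `accelerator` argument of `Trainer`: {available_hardware}."
    )
self.accelerator = accelerator_class()  # only instantiate once availability is confirmed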
16 changes: 10 additions & 6 deletions tests/accelerators/test_accelerator_connector.py
@@ -428,8 +428,9 @@ def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch):
assert trainer.strategy.launcher is None or trainer.strategy.launcher.is_interactive_compatible


@mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True)
@mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.parse_devices", return_value=8)
def test_ipython_compatible_strategy_tpu(_, monkeypatch):
def test_ipython_compatible_strategy_tpu(mock_devices, mock_tpuacc_avail, monkeypatch):
monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True)
trainer = Trainer(accelerator="tpu")
assert trainer.strategy.launcher is None or trainer.strategy.launcher.is_interactive_compatible
@@ -479,9 +480,10 @@ def test_accelerator_cpu(_):

with pytest.raises(MisconfigurationException, match="You requested gpu:"):
trainer = Trainer(gpus=1)
# TODO enable this test when add device availability check
# with pytest.raises(MisconfigurationException, match="You requested gpu, but gpu is not available"):
# trainer = Trainer(accelerator="gpu")
with pytest.raises(
MisconfigurationException, match="GPUAccelerator can not run on this hardware since GPUs are not available."
):
trainer = Trainer(accelerator="gpu")
with pytest.raises(MisconfigurationException, match="You requested gpu:"):
trainer = Trainer(accelerator="cpu", gpus=1)

@@ -898,8 +900,9 @@ def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock
assert trainer.strategy.local_rank == 0


@mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True)
@mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.parse_devices", return_value=8)
def test_unsupported_tpu_choice(mock_devices):
def test_unsupported_tpu_choice(mock_devices, mock_tpuacc_avail):

with pytest.raises(MisconfigurationException, match=r"accelerator='tpu', precision=64\)` is not implemented"):
Trainer(accelerator="tpu", precision=64)
@@ -914,7 +917,8 @@ def test_unsupported_tpu_choice(mock_devices):
Trainer(accelerator="tpu", precision=16, amp_backend="apex", strategy="single_device")


def test_unsupported_ipu_choice(monkeypatch):
@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True)
def test_unsupported_ipu_choice(mock_ipuacc_avail, monkeypatch):
import pytorch_lightning.strategies.ipu as ipu
import pytorch_lightning.utilities.imports as imports

4 changes: 3 additions & 1 deletion tests/accelerators/test_ipu.py
@@ -13,6 +13,7 @@
# limitations under the License.
import os
from typing import Optional
from unittest import mock

import pytest
import torch
@@ -97,7 +98,8 @@ def test_epoch_end(self, outputs) -> None:


@pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine")
def test_fail_if_no_ipus(tmpdir):
@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True)
def test_fail_if_no_ipus(mock_ipuacc_avail, tmpdir):
with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"):
Trainer(default_root_dir=tmpdir, ipus=1)

3 changes: 2 additions & 1 deletion tests/plugins/test_cluster_integration.py
@@ -58,7 +58,8 @@ def environment_combinations():
"strategy_cls",
[DDPStrategy, DDPShardedStrategy, DDP2Strategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))],
)
def test_ranks_available_manual_strategy_selection(strategy_cls):
@mock.patch("pytorch_lightning.accelerators.gpu.GPUAccelerator.is_available", return_value=True)
def test_ranks_available_manual_strategy_selection(mock_gpuacc_available, strategy_cls):
"""Test that the rank information is readily available after Trainer initialization."""
num_nodes = 2
for cluster, variables, expected in environment_combinations():
@@ -13,6 +13,7 @@
# limitations under the License.

import logging
from unittest import mock

import pytest
import torch
@@ -148,7 +149,8 @@ def test_num_stepping_batches_with_tpu(devices, estimated_steps):
assert trainer.estimated_stepping_batches == estimated_steps


def test_num_stepping_batches_with_ipu(monkeypatch):
@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True)
def test_num_stepping_batches_with_ipu(mock_ipuacc_avail, monkeypatch):
"""Test stepping batches with IPU training which acts like DP."""
import pytorch_lightning.strategies.ipu as ipu

Expand Down