
Fabric: auto default #16842


Merged
merged 15 commits into from Feb 23, 2023
Changes from 14 commits
6 changes: 3 additions & 3 deletions docs/source-pytorch/fabric/fundamentals/accelerators.rst
@@ -15,12 +15,12 @@ Fabric enables you to take full advantage of the hardware on your system. It sup
- GPU (NVIDIA, AMD, Apple Silicon)
- TPU

By default, Fabric recognizes the accelerator(s) on your system
By default, Fabric tries to maximize the hardware utilization of your system

.. code-block:: python

# Default settings
fabric = Fabric(accelerator="auto", devices="auto")
fabric = Fabric(accelerator="auto", devices="auto", strategy="auto")

# Same as
fabric = Fabric()
@@ -40,7 +40,7 @@ You can also explicitly set which accelerator to use:
fabric = Fabric(accelerator="gpu", devices=8)

# GPU: Apple M1/M2 only
fabric = Fabric(accelerator="mps", devices=8)
fabric = Fabric(accelerator="mps")

# GPU: NVIDIA CUDA only
fabric = Fabric(accelerator="cuda", devices=8)
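The hunk above changes the documented defaults. As a quick illustration of what relying on them looks like in user code, here is a minimal single-process sketch; the model, data, and learning rate are placeholders, and it assumes a standard `lightning.fabric` installation:

```python
import torch
from lightning.fabric import Fabric

# With the new defaults this is the same as
# Fabric(accelerator="auto", devices="auto", strategy="auto"):
# Fabric inspects the machine and picks CPU, CUDA, MPS, or TPU on its own.
fabric = Fabric()
fabric.launch()

model = torch.nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)

batch = torch.randn(4, 32, device=fabric.device)  # placed on the chosen device
loss = model(batch).sum()
fabric.backward(loss)
optimizer.step()
```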
6 changes: 3 additions & 3 deletions docs/source-pytorch/fabric/fundamentals/launch.rst
@@ -47,10 +47,10 @@ This is essentially the same as running ``python path/to/your/script.py``, but i
itself and are expected to be parsed there.

Options:
--accelerator [cpu|gpu|cuda|mps|tpu]
--accelerator [auto|cpu|gpu|cuda|mps|tpu]
The hardware accelerator to run on.
--strategy [ddp|dp|deepspeed] Strategy for how to run across multiple
devices.
--strategy [auto|ddp|dp|deepspeed]
Strategy for how to run across multiple devices.
--devices TEXT Number of devices to run on (``int``), which
devices to run on (``list`` or ``str``), or
``'auto'``. The value applies per node.
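Since the CLI flags above mirror the `Fabric` constructor arguments one to one, the following sketch shows the in-code equivalents of the new `auto` defaults; the explicit multi-GPU example assumes CUDA hardware is present:

```python
from lightning.fabric import Fabric

# In-code equivalent of launching with
#   --accelerator=auto --strategy=auto --devices=auto
fabric = Fabric(accelerator="auto", strategy="auto", devices="auto")

# In-code equivalent of --accelerator=cuda --strategy=ddp --devices=4
# (only meaningful on a machine with at least 4 CUDA GPUs).
fabric_ddp = Fabric(accelerator="cuda", strategy="ddp", devices=4)
```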
3 changes: 3 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
@@ -20,6 +20,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Fabric now chooses `accelerator="auto", strategy="auto", devices="auto"` as defaults ([#16842](https://github.com/Lightning-AI/lightning/pull/16842))


- Checkpoint saving and loading redesign ([#16434](https://github.com/Lightning-AI/lightning/pull/16434))
* Changed the method signature of `Fabric.save` and `Fabric.load`
* Changed the method signature of `Strategy.save_checkpoint` and `Fabric.load_checkpoint`
68 changes: 28 additions & 40 deletions src/lightning/fabric/connector.py
@@ -100,18 +100,18 @@ class _Connector:

def __init__(
self,
accelerator: Optional[Union[str, Accelerator]] = None,
strategy: Optional[Union[str, Strategy]] = None,
devices: Optional[Union[List[int], str, int]] = None,
accelerator: Union[str, Accelerator] = "auto",
strategy: Union[str, Strategy] = "auto",
devices: Union[List[int], str, int] = "auto",
num_nodes: int = 1,
precision: _PRECISION_INPUT = "32-true",
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None,
) -> None:

# These arguments can be set through environment variables set by the CLI
accelerator = self._argument_from_env("accelerator", accelerator, default=None)
strategy = self._argument_from_env("strategy", strategy, default=None)
devices = self._argument_from_env("devices", devices, default=None)
accelerator = self._argument_from_env("accelerator", accelerator, default="auto")
strategy = self._argument_from_env("strategy", strategy, default="auto")
devices = self._argument_from_env("devices", devices, default="auto")
num_nodes = self._argument_from_env("num_nodes", num_nodes, default=1)
precision = self._argument_from_env("precision", precision, default="32-true")

@@ -123,8 +123,8 @@ def __init__(
# Raise an exception if there are conflicts between flags
# Set each valid flag to `self._x_flag` after validation
# For devices: Assign gpus, etc. to the accelerator flag and devices flag
self._strategy_flag: Optional[Union[Strategy, str]] = None
self._accelerator_flag: Optional[Union[Accelerator, str]] = None
self._strategy_flag: Union[Strategy, str] = "auto"
self._accelerator_flag: Union[Accelerator, str] = "auto"
self._precision_input: _PRECISION_INPUT_STR = "32-true"
self._precision_instance: Optional[Precision] = None
self._cluster_environment_flag: Optional[Union[ClusterEnvironment, str]] = None
@@ -141,7 +141,7 @@ def __init__(

# 2. Instantiate Accelerator
# handle `auto`, `None` and `gpu`
if self._accelerator_flag == "auto" or self._accelerator_flag is None:
if self._accelerator_flag == "auto":
self._accelerator_flag = self._choose_auto_accelerator()
elif self._accelerator_flag == "gpu":
self._accelerator_flag = self._choose_gpu_accelerator_backend()
@@ -152,7 +152,7 @@ def __init__(
self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment()

# 4. Instantiate Strategy - Part 1
if self._strategy_flag is None:
if self._strategy_flag == "auto":
self._strategy_flag = self._choose_strategy()
# In specific cases, ignore user selection and fall back to a different strategy
self._check_strategy_and_fallback()
@@ -166,8 +166,8 @@ def __init__(

def _check_config_and_set_final_flags(
self,
strategy: Optional[Union[str, Strategy]],
accelerator: Optional[Union[str, Accelerator]],
strategy: Union[str, Strategy],
accelerator: Union[str, Accelerator],
precision: _PRECISION_INPUT,
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]],
) -> None:
@@ -188,26 +188,24 @@ def _check_config_and_set_final_flags(
if isinstance(strategy, str):
strategy = strategy.lower()

if strategy is not None:
self._strategy_flag = strategy
self._strategy_flag = strategy

if strategy is not None and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
if strategy != "auto" and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
raise ValueError(
f"You selected an invalid strategy name: `strategy={strategy!r}`."
" It must be either a string or an instance of `lightning.fabric.strategies.Strategy`."
" Example choices: ddp, ddp_spawn, deepspeed, dp, ..."
" Example choices: auto, ddp, ddp_spawn, deepspeed, dp, ..."
" Find a complete list of options in our documentation at https://lightning.ai"
)

if (
accelerator is not None
and accelerator not in self._registered_accelerators
accelerator not in self._registered_accelerators
and accelerator not in ("auto", "gpu")
and not isinstance(accelerator, Accelerator)
):
raise ValueError(
f"You selected an invalid accelerator name: `accelerator={accelerator!r}`."
f" Available names are: {', '.join(self._registered_accelerators)}."
f" Available names are: auto, {', '.join(self._registered_accelerators)}."
)

# MPS accelerator is incompatible with DDP family of strategies. It supports single-device operation only.
@@ -256,9 +254,9 @@ def _check_config_and_set_final_flags(
# handle the case when the user passes in a strategy instance which has an accelerator, precision,
# checkpoint io or cluster env set up
# TODO: improve the error messages below
if self._strategy_flag and isinstance(self._strategy_flag, Strategy):
if isinstance(self._strategy_flag, Strategy):
if self._strategy_flag._accelerator:
if self._accelerator_flag:
if self._accelerator_flag != "auto":
raise ValueError("accelerator set through both strategy class and accelerator flag, choose one")
else:
self._accelerator_flag = self._strategy_flag._accelerator
@@ -297,9 +295,7 @@ def _check_config_and_set_final_flags(
self._accelerator_flag = "cuda"
self._parallel_devices = self._strategy_flag.parallel_devices

def _check_device_config_and_set_final_flags(
self, devices: Optional[Union[List[int], str, int]], num_nodes: int
) -> None:
def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str, int], num_nodes: int) -> None:
self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1
self._devices_flag = devices

@@ -314,21 +310,14 @@ def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str, int], num_nodes: int) -> None:
f" using {accelerator_name} accelerator."
)

if self._devices_flag == "auto" and self._accelerator_flag is None:
raise ValueError(
f"You passed `devices={devices}` but haven't specified"
" `accelerator=('auto'|'tpu'|'gpu'|'cpu'|'mps')` for the devices mapping."
)

def _choose_auto_accelerator(self) -> str:
"""Choose the accelerator type (str) based on availability when ``accelerator='auto'``."""
if self._accelerator_flag == "auto":
if TPUAccelerator.is_available():
return "tpu"
if MPSAccelerator.is_available():
return "mps"
if CUDAAccelerator.is_available():
return "cuda"
if TPUAccelerator.is_available():
return "tpu"
if MPSAccelerator.is_available():
return "mps"
if CUDAAccelerator.is_available():
return "cuda"
return "cpu"

@staticmethod
@@ -337,7 +326,6 @@ def _choose_gpu_accelerator_backend() -> str:
return "mps"
if CUDAAccelerator.is_available():
return "cuda"

raise RuntimeError("No supported gpu backend found!")

def _set_parallel_devices_and_init_accelerator(self) -> None:
@@ -368,7 +356,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
self._parallel_devices = accelerator_cls.get_parallel_devices(self._devices_flag)

def _set_devices_flag_if_auto_passed(self) -> None:
if self._devices_flag == "auto" or self._devices_flag is None:
if self._devices_flag == "auto":
self._devices_flag = self.accelerator.auto_device_count()

def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
@@ -527,7 +515,7 @@ def _lazy_init_strategy(self) -> None:
raise RuntimeError(
f"`Fabric(strategy={self._strategy_flag!r})` is not compatible with an interactive"
" environment. Run your code as a script, or choose one of the compatible strategies:"
f" `Fabric(strategy=None|'dp'|'ddp_notebook')`."
f" `Fabric(strategy='dp'|'ddp_notebook')`."
" In case you are spawning processes yourself, make sure to include the Fabric"
" creation inside the worker function."
)
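The connector changes above drop the `None` defaults and make `"auto"` the canonical sentinel. The selection order in `_choose_auto_accelerator` and `_choose_gpu_accelerator_backend` can be summarized in the standalone sketch below; it uses plain `torch` availability checks and a placeholder TPU probe instead of Lightning's accelerator classes, so treat it as an approximation of the logic rather than the implementation:

```python
import torch


def tpu_is_available() -> bool:
    # Placeholder for TPUAccelerator.is_available(); a real check would
    # probe for torch_xla and an attached TPU.
    return False


def choose_auto_accelerator() -> str:
    # Same precedence as _choose_auto_accelerator: TPU, then MPS, then CUDA,
    # falling back to CPU when nothing else is found.
    if tpu_is_available():
        return "tpu"
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def choose_gpu_backend() -> str:
    # Same idea as _choose_gpu_accelerator_backend: "gpu" resolves to MPS on
    # Apple Silicon, to CUDA elsewhere, and errors out if neither is usable.
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    raise RuntimeError("No supported gpu backend found!")
```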
6 changes: 3 additions & 3 deletions src/lightning/fabric/fabric.py
@@ -78,9 +78,9 @@ class Fabric:

def __init__(
self,
accelerator: Optional[Union[str, Accelerator]] = None,
strategy: Optional[Union[str, Strategy]] = None,
devices: Optional[Union[List[int], str, int]] = None,
accelerator: Union[str, Accelerator] = "auto",
strategy: Union[str, Strategy] = "auto",
devices: Union[List[int], str, int] = "auto",
num_nodes: int = 1,
precision: _PRECISION_INPUT = "32-true",
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None,
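With the widened defaults shown above, `Fabric()` and the fully spelled-out call are interchangeable, and `devices` keeps accepting an explicit count or a list of indices. A short sketch of the accepted forms; the CUDA lines assume GPUs are actually available:

```python
from lightning.fabric import Fabric

# These two are now equivalent.
fabric_default = Fabric()
fabric_auto = Fabric(accelerator="auto", strategy="auto", devices="auto")

# `devices` still takes an int (a count) or a list of device indices.
fabric_two = Fabric(accelerator="cuda", devices=2)        # first two GPUs
fabric_pick = Fabric(accelerator="cuda", devices=[0, 2])  # specific GPUs
```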
21 changes: 15 additions & 6 deletions tests/tests_fabric/conftest.py
@@ -75,17 +75,26 @@ def reset_deterministic_algorithm():
torch.use_deterministic_algorithms(False)


def mock_xla_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.strategies.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.strategies.launchers.xla, "_XLA_AVAILABLE", value)


@pytest.fixture(scope="function")
def xla_available(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.strategies.xla, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.strategies.launchers.xla, "_XLA_AVAILABLE", True)
mock_xla_available(monkeypatch)


def mock_tpu_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
mock_xla_available(monkeypatch, value)
monkeypatch.setattr(lightning.fabric.accelerators.tpu.TPUAccelerator, "is_available", lambda: value)


@pytest.fixture(scope="function")
def tpu_available(xla_available, monkeypatch) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu.TPUAccelerator, "is_available", lambda: True)
def tpu_available(monkeypatch: pytest.MonkeyPatch) -> None:
mock_tpu_available(monkeypatch)


@pytest.fixture
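To show how the reworked fixtures might be consumed, here is a sketch of a test module; it assumes pytest discovers the `tpu_available` fixture from this conftest, and the assertions are illustrative only:

```python
import pytest

from lightning.fabric.accelerators.tpu import TPUAccelerator


def test_sees_fake_tpu(tpu_available):
    # The fixture patches the XLA flags and TPUAccelerator.is_available,
    # so no real TPU hardware is needed for this to pass.
    assert TPUAccelerator.is_available()


def test_no_tpu(monkeypatch: pytest.MonkeyPatch):
    # The same patching can be applied inline to simulate the opposite case.
    monkeypatch.setattr(TPUAccelerator, "is_available", lambda: False)
    assert not TPUAccelerator.is_available()
```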
@@ -68,7 +68,9 @@ def after_backward(self, model):
],
)
def test_amp(accelerator, precision, expected_dtype):
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision)
# TODO: devices>1 fails with:
# DDP expects same model across all ranks, but Rank 0 has 2 params, while rank 1 has inconsistent 1 params
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision, devices=1)
fabric.expected_dtype = expected_dtype
fabric.run()
