|
24 | 24 | from lightning.fabric.utilities.apply_func import move_data_to_device
|
25 | 25 | from lightning.fabric.utilities.imports import _IS_INTERACTIVE
|
26 | 26 | from lightning.fabric.utilities.seed import _collect_rng_states, _set_rng_states
|
| 27 | +from lightning.fabric.utilities.imports import _LIGHTNING_XPU_AVAILABLE |
27 | 28 |
|
28 | 29 | if TYPE_CHECKING:
|
29 | 30 | from lightning.fabric.strategies import ParallelStrategy
|
30 |
| - |
| 31 | + |
| 32 | +if _LIGHTNING_XPU_AVAILABLE: |
| 33 | + from lightning_xpu.fabric import XPUAccelerator |
31 | 34 |
|
32 | 35 | class _MultiProcessingLauncher(_Launcher):
|
33 | 36 | r"""Launches processes that run a given function in parallel, and joins them all at the end.
|
@@ -85,6 +88,8 @@ def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any:
|
85 | 88 | """
|
86 | 89 | if self._start_method in ("fork", "forkserver"):
|
87 | 90 | _check_bad_cuda_fork()
|
| 91 | + if XPUAccelerator.is_available(): |
| 92 | + _check_bad_xpu_fork() |
88 | 93 |
|
89 | 94 | # The default cluster environment in Lightning chooses a random free port number
|
90 | 95 | # This needs to be done in the main process here before starting processes to ensure each rank will connect
|
@@ -187,3 +192,21 @@ def _check_bad_cuda_fork() -> None:
|
187 | 192 | if _IS_INTERACTIVE:
|
188 | 193 | message += " You will have to restart the Python kernel."
|
189 | 194 | raise RuntimeError(message)
|
| 195 | + |
def _check_bad_xpu_fork() -> None:
    """Raise a helpful error if forking would clash with an already-initialized XPU context.

    Mirrors ``_check_bad_cuda_fork``: instead of letting PyTorch fail later with its terse
    'Cannot re-initialize XPU in forked subprocess' error, Lightning raises early with advice
    on how to avoid touching the device before processes are spawned.
    """
    # Nothing to do while the XPU runtime has not been touched yet — forking is safe then.
    if XPUAccelerator.is_xpu_initialized():
        error = (
            "Lightning can't create new processes if XPU is already initialized. Did you manually call"
            " `torch.xpu.*` functions, have moved the model to the device, or allocated memory on the GPU any"
            " other way? Please remove any such calls, or change the selected strategy."
        )
        # In notebooks the interpreter keeps the poisoned XPU state alive, so a kernel restart is required.
        if _IS_INTERACTIVE:
            error += " You will have to restart the Python kernel."
        raise RuntimeError(error)
0 commit comments