
Commit 61c1f69

[App] Enable state broadcast with MultiNode (#15607)
1 parent 4ea44dd commit 61c1f69

10 files changed (+166 -94 lines)

examples/app_multi_node/train_lite.py (+15 -12)

@@ -6,23 +6,26 @@


 class LitePyTorchDistributed(L.LightningWork):
-    @staticmethod
-    def run():
-        # 1. Create LightningLite.
-        lite = LightningLite(strategy="ddp", precision=16)
+    def run(self):
+        # 1. Prepare the model
+        model = torch.nn.Sequential(
+            torch.nn.Linear(1, 1),
+            torch.nn.ReLU(),
+            torch.nn.Linear(1, 1),
+        )

-        # 2. Prepare distributed model and optimizer.
-        model = torch.nn.Linear(32, 2)
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-        model, optimizer = lite.setup(model, optimizer)
+        # 2. Create LightningLite.
+        lite = LightningLite(strategy="ddp", precision=16)
+        model, optimizer = lite.setup(model, torch.optim.SGD(model.parameters(), lr=0.01))
         criterion = torch.nn.MSELoss()

-        # 3. Train the model for 50 steps.
-        for step in range(50):
+        # 3. Train the model for 1000 steps.
+        for step in range(1000):
             model.zero_grad()
-            x = torch.randn(64, 32).to(lite.device)
+            x = torch.tensor([0.8]).to(lite.device)
+            target = torch.tensor([1.0]).to(lite.device)
             output = model(x)
-            loss = criterion(output, torch.ones_like(output))
+            loss = criterion(output, target)
             print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}")
             lite.backward(loss)
             optimizer.step()
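
Note: the example above only defines the work. A minimal sketch of how it would typically be wired into an app is shown below; the LiteMultiNode component name and the "gpu-fast-multi" compute preset are assumptions, not part of this diff.

# Sketch only (assumed component and compute names; adjust to your release).
import lightning as L
from lightning_app.components import LiteMultiNode

app = L.LightningApp(
    LiteMultiNode(
        LitePyTorchDistributed,  # the work defined above
        num_nodes=2,  # one replica of the work per node
        cloud_compute=L.CloudCompute("gpu-fast-multi"),
    )
)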

examples/app_multi_node/train_lt.py (+2 -3)

@@ -4,11 +4,10 @@


 class LightningTrainerDistributed(L.LightningWork):
-    @staticmethod
-    def run():
+    def run(self):
         model = BoringModel()
         trainer = L.Trainer(
-            max_epochs=10,
+            max_steps=1000,
             strategy="ddp",
         )
         trainer.fit(model)

examples/app_multi_node/train_pytorch.py (+13 -14)

@@ -18,29 +18,28 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no
         init_method=f"tcp://{main_address}:{main_port}",
     )

-    # 2. Prepare distributed model
-    model = torch.nn.Linear(32, 2)
+    # 2. Prepare the model
+    model = torch.nn.Sequential(
+        torch.nn.Linear(1, 1),
+        torch.nn.ReLU(),
+        torch.nn.Linear(1, 1),
+    )

     # 3. Setup distributed training
-    if torch.cuda.is_available():
-        device = torch.device(f"cuda:{local_rank}")
-        torch.cuda.set_device(device)
-    else:
-        device = torch.device("cpu")
-
-    model = model.to(device)
-    model = DistributedDataParallel(model, device_ids=[device.index] if torch.cuda.is_available() else None)
+    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+    model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None)

     # 4. Prepare loss and optimizer
     criterion = torch.nn.MSELoss()
     optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

-    # 5. Train the model for 50 steps.
-    for step in range(50):
+    # 5. Train the model for 1000 steps.
+    for step in range(1000):
         model.zero_grad()
-        x = torch.randn(64, 32).to(device)
+        x = torch.tensor([0.8]).to(device)
+        target = torch.tensor([1.0]).to(device)
         output = model(x)
-        loss = criterion(output, torch.ones_like(output))
+        loss = criterion(output, target)
         print(f"global_rank: {global_rank} step: {step} loss: {loss}")
         loss.backward()
         optimizer.step()

examples/app_multi_node/train_pytorch_spawn.py (+16 -17)

@@ -6,38 +6,37 @@


 class PyTorchDistributed(L.LightningWork):
-
-    # Note: Only staticmethod are support for now with `PyTorchSpawnMultiNode`
-    @staticmethod
     def run(
+        self,
         world_size: int,
         node_rank: int,
         global_rank: str,
         local_rank: int,
     ):
-        # 1. Prepare distributed model
-        model = torch.nn.Linear(32, 2)
+        # 1. Prepare the model
+        model = torch.nn.Sequential(
+            torch.nn.Linear(1, 1),
+            torch.nn.ReLU(),
+            torch.nn.Linear(1, 1),
+        )

         # 2. Setup distributed training
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{local_rank}")
-            torch.cuda.set_device(device)
-        else:
-            device = torch.device("cpu")
-
-        model = model.to(device)
-        model = DistributedDataParallel(model, device_ids=[device.index] if torch.cuda.is_available() else None)
+        device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+        model = DistributedDataParallel(
+            model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None
+        )

         # 3. Prepare loss and optimizer
         criterion = torch.nn.MSELoss()
         optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

-        # 4. Train the model for 50 steps.
-        for step in range(50):
+        # 4. Train the model for 1000 steps.
+        for step in range(1000):
             model.zero_grad()
-            x = torch.randn(64, 32).to(device)
+            x = torch.tensor([0.8]).to(device)
+            target = torch.tensor([1.0]).to(device)
             output = model(x)
-            loss = criterion(output, torch.ones_like(output))
+            loss = criterion(output, target)
             print(f"global_rank: {global_rank} step: {step} loss: {loss}")
             loss.backward()
             optimizer.step()
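
Note: with run now a bound method, a work can also publish progress through its own state attributes, which is the state broadcasting this commit enables (local rank 0 of each node runs the observer added in pytorch_spawn.py below). A hypothetical sketch with illustrative attribute names that are not part of the diff:

# Hypothetical sketch: attributes set inside run() are part of the work state
# and can be broadcast back to the flow. Names are illustrative only.
import lightning as L


class TrackedPyTorchDistributed(L.LightningWork):
    def __init__(self):
        super().__init__()
        self.step = 0
        self.last_loss = None

    def run(self, world_size: int, node_rank: int, global_rank: str, local_rank: int):
        for step in range(1000):
            # ... same training code as in the example above ...
            self.step = step  # state delta picked up by the observer
            self.last_loss = 0.0  # placeholder for float(loss)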

src/lightning_app/CHANGELOG.md (+2)

@@ -26,6 +26,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Added `bi-directional` delta updates between the flow and the works ([#15582](https://github.com/Lightning-AI/lightning/pull/15582))

+- Enabled MultiNode Components to support state broadcasting ([#15607](https://github.com/Lightning-AI/lightning/pull/15607))
+

 ### Changed

src/lightning_app/components/multi_node/lite.py (-6)

@@ -7,7 +7,6 @@
 from lightning_app.components.multi_node.base import MultiNode
 from lightning_app.components.multi_node.pytorch_spawn import _PyTorchSpawnRunExecutor
 from lightning_app.core.work import LightningWork
-from lightning_app.utilities.app_helpers import is_static_method
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
 from lightning_app.utilities.tracer import Tracer

@@ -82,11 +81,6 @@ def __init__(
         **work_kwargs: Any,
     ) -> None:
         assert issubclass(work_cls, _LiteWorkProtocol)
-        if not is_static_method(work_cls, "run"):
-            raise TypeError(
-                f"The provided {work_cls} run method needs to be static for now."
-                "HINT: Remove `self` and add staticmethod decorator."
-            )

         # Note: Private way to modify the work run executor
         # Probably exposed to the users in the future if needed.

src/lightning_app/components/multi_node/pytorch_spawn.py (+31 -11)

@@ -3,10 +3,10 @@
 from typing_extensions import Protocol, runtime_checkable

 from lightning_app.components.multi_node.base import MultiNode
+from lightning_app.core.queues import MultiProcessQueue
 from lightning_app.core.work import LightningWork
-from lightning_app.utilities.app_helpers import is_static_method
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
-from lightning_app.utilities.proxies import WorkRunExecutor
+from lightning_app.utilities.proxies import _proxy_setattr, unwrap, WorkRunExecutor, WorkStateObserver


 @runtime_checkable

@@ -22,6 +22,9 @@ def run(


 class _PyTorchSpawnRunExecutor(WorkRunExecutor):
+
+    enable_start_observer: bool = False
+
     def __call__(
         self,
         main_address: str,

@@ -31,10 +34,31 @@
     ):
         import torch

-        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
-        torch.multiprocessing.spawn(
-            self.run, args=(self.work_run, main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
-        )
+        with self.enable_spawn():
+            nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
+            queue = self.delta_queue if isinstance(self.delta_queue, MultiProcessQueue) else self.delta_queue.to_dict()
+            torch.multiprocessing.spawn(
+                self.dispatch_run,
+                args=(self.__class__, self.work, queue, main_address, main_port, num_nodes, node_rank, nprocs),
+                nprocs=nprocs,
+            )
+
+    @staticmethod
+    def dispatch_run(local_rank, cls, work, delta_queue, *args, **kwargs):
+        if local_rank == 0:
+            if isinstance(delta_queue, dict):
+                delta_queue = cls.process_queue(delta_queue)
+                work._request_queue = cls.process_queue(work._request_queue)
+                work._response_queue = cls.process_queue(work._response_queue)
+
+            state_observer = WorkStateObserver(work, delta_queue=delta_queue)
+            state_observer.start()
+            _proxy_setattr(work, delta_queue, state_observer)
+
+        cls.run(local_rank, unwrap(work.run), *args, **kwargs)
+
+        if local_rank == 0:
+            state_observer.join(0)

     @staticmethod
     def run(

@@ -46,6 +70,7 @@ def run(
         node_rank: int,
         nprocs: int,
     ):
+
         import torch

         # 1. Setting distributed environment

@@ -76,11 +101,6 @@ def __init__(
         **work_kwargs: Any,
     ) -> None:
         assert issubclass(work_cls, _PyTorchSpawnWorkProtocol)
-        if not is_static_method(work_cls, "run"):
-            raise TypeError(
-                f"The provided {work_cls} run method needs to be static for now."
-                "HINT: Remove `self` and add staticmethod decorator."
-            )

         # Note: Private way to modify the work run executor
         # Probably exposed to the users in the future if needed.
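
Note: the executor change follows a standard spawn pattern: a live queue client is generally not picklable across torch.multiprocessing.spawn, so it crosses the process boundary as a plain dict (via the new to_dict) and is rebuilt on local rank 0 by process_queue, which presumably dispatches on the "type" key. A standalone illustration of the pattern (not Lightning code):

# Generic sketch: pass only picklable state into spawn; rebuild heavyweight
# objects (Redis/HTTP clients) inside the child process.
import torch.multiprocessing as mp


def worker(local_rank: int, conn_state: dict) -> None:
    # In the executor above, this is where process_queue / from_dict would
    # reconstruct the delta queue from conn_state.
    print(f"rank {local_rank}: would reconnect using {conn_state}")


if __name__ == "__main__":
    conn_state = {"type": "redis", "name": "delta_queue", "default_timeout": 5}
    mp.spawn(worker, args=(conn_state,), nprocs=2)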

src/lightning_app/components/multi_node/trainer.py (-6)

@@ -7,7 +7,6 @@
 from lightning_app.components.multi_node.base import MultiNode
 from lightning_app.components.multi_node.pytorch_spawn import _PyTorchSpawnRunExecutor
 from lightning_app.core.work import LightningWork
-from lightning_app.utilities.app_helpers import is_static_method
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
 from lightning_app.utilities.tracer import Tracer

@@ -81,11 +80,6 @@ def __init__(
         **work_kwargs: Any,
     ) -> None:
         assert issubclass(work_cls, _LightningTrainerWorkProtocol)
-        if not is_static_method(work_cls, "run"):
-            raise TypeError(
-                f"The provided {work_cls} run method needs to be static for now."
-                "HINT: Remove `self` and add staticmethod decorator."
-            )

         # Note: Private way to modify the work run executor
         # Probably exposed to the users in the future if needed.

src/lightning_app/core/queues.py (+29 -4)

@@ -235,12 +235,12 @@ def __init__(
         """
         if name is None:
             raise ValueError("You must specify a name for the queue")
-        host = host or REDIS_HOST
-        port = port or REDIS_PORT
-        password = password or REDIS_PASSWORD
+        self.host = host or REDIS_HOST
+        self.port = port or REDIS_PORT
+        self.password = password or REDIS_PASSWORD
         self.name = name
         self.default_timeout = default_timeout
-        self.redis = redis.Redis(host=host, port=port, password=password)
+        self.redis = redis.Redis(host=self.host, port=self.port, password=self.password)

     def put(self, item: Any) -> None:
         from lightning_app import LightningWork

@@ -329,6 +329,20 @@ def is_running(self) -> bool:
         except redis.exceptions.ConnectionError:
             return False

+    def to_dict(self):
+        return {
+            "type": "redis",
+            "name": self.name,
+            "default_timeout": self.default_timeout,
+            "host": self.host,
+            "port": self.port,
+            "password": self.password,
+        }
+
+    @classmethod
+    def from_dict(cls, state):
+        return cls(**state)
+

 class HTTPQueue(BaseQueue):
     def __init__(self, name: str, default_timeout: float):

@@ -414,6 +428,17 @@ def _split_app_id_and_queue_name(queue_name):
         app_id, queue_name = queue_name.split("_", 1)
         return app_id, queue_name

+    def to_dict(self):
+        return {
+            "type": "http",
+            "name": self.name,
+            "default_timeout": self.default_timeout,
+        }
+
+    @classmethod
+    def from_dict(cls, state):
+        return cls(**state)
+

 def debug_log_callback(message: str, *args: Any, **kwargs: Any) -> None:
     if QUEUE_DEBUG_ENABLED or (Path(LIGHTNING_DIR) / "QUEUE_DEBUG_ENABLED").exists():
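
Note: the round trip these hooks enable looks roughly as follows. The Redis-backed class is assumed here to be RedisQueue with the constructor kwargs shown; because from_dict calls cls(**state), the "type" key is presumably stripped by process_queue before reconstruction, as done below.

# Sketch only: serialize a queue to a picklable dict and rebuild it elsewhere.
from lightning_app.core.queues import RedisQueue  # assumed class name

queue = RedisQueue(name="delta_queue", default_timeout=5)
state = queue.to_dict()  # {"type": "redis", "name": "delta_queue", "host": ..., ...}
state.pop("type")  # dispatch tag, not a constructor kwarg
rebuilt = RedisQueue.from_dict(state)
assert rebuilt.name == queue.name and rebuilt.default_timeout == queue.default_timeout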
