From 4fa2e2e506db01aedbb4df7a619f22a3b46c24a0 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Tue, 8 Nov 2022 14:37:15 +0000
Subject: [PATCH 1/2] remove lite

---
 examples/app_multi_node/README.md    | 10 ----------
 examples/app_multi_node/train_any.py | 22 ----------------------
 2 files changed, 32 deletions(-)
 delete mode 100644 examples/app_multi_node/train_any.py

diff --git a/examples/app_multi_node/README.md b/examples/app_multi_node/README.md
index 23e7afa23d68e..02871dc77d06b 100644
--- a/examples/app_multi_node/README.md
+++ b/examples/app_multi_node/README.md
@@ -18,16 +18,6 @@ or you can use the built-in component for it.
 lightning run app train_pytorch_spawn.py
 ```
 
-## Multi Node with raw PyTorch + Lite
-
-You can run the multi-node raw PyTorch and Lite by running the following commands.
-
-```bash
-lightning run app train_lite.py
-```
-
-Using Lite, you retain control over your loops while accessing in a minimal way all Lightning distributed strategies.
-
 ## Multi Node with PyTorch Lightning
 
 Lightning supports running PyTorch Lightning from a script or within a Lightning Work.
diff --git a/examples/app_multi_node/train_any.py b/examples/app_multi_node/train_any.py
deleted file mode 100644
index 5dfb947134dcd..0000000000000
--- a/examples/app_multi_node/train_any.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import lightning as L
-from lightning.app.components import MultiNode
-
-
-class AnyDistributedComponent(L.LightningWork):
-    def run(
-        self,
-        main_address: str,
-        main_port: int,
-        num_nodes: int,
-        node_rank: int,
-    ):
-        print(f"ADD YOUR DISTRIBUTED CODE: {main_address} {main_port} {num_nodes} {node_rank}.")
-
-
-app = L.LightningApp(
-    MultiNode(
-        AnyDistributedComponent,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu"),
-    )
-)

From 0d2db5d28e1d645e7171d0b921316baf5eec5c24 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Tue, 8 Nov 2022 14:40:57 +0000
Subject: [PATCH 2/2] update

---
 examples/app_multi_node/train_any.py  | 22 ++++++++++++++++
 examples/app_multi_node/train_lite.py | 38 ---------------------------
 2 files changed, 22 insertions(+), 38 deletions(-)
 create mode 100644 examples/app_multi_node/train_any.py
 delete mode 100644 examples/app_multi_node/train_lite.py

diff --git a/examples/app_multi_node/train_any.py b/examples/app_multi_node/train_any.py
new file mode 100644
index 0000000000000..5dfb947134dcd
--- /dev/null
+++ b/examples/app_multi_node/train_any.py
@@ -0,0 +1,22 @@
+import lightning as L
+from lightning.app.components import MultiNode
+
+
+class AnyDistributedComponent(L.LightningWork):
+    def run(
+        self,
+        main_address: str,
+        main_port: int,
+        num_nodes: int,
+        node_rank: int,
+    ):
+        print(f"ADD YOUR DISTRIBUTED CODE: {main_address} {main_port} {num_nodes} {node_rank}.")
+
+
+app = L.LightningApp(
+    MultiNode(
+        AnyDistributedComponent,
+        num_nodes=2,
+        cloud_compute=L.CloudCompute("gpu"),
+    )
+)
diff --git a/examples/app_multi_node/train_lite.py b/examples/app_multi_node/train_lite.py
deleted file mode 100644
index ed9777a1064f6..0000000000000
--- a/examples/app_multi_node/train_lite.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import torch
-
-import lightning as L
-from lightning.app.components import LiteMultiNode
-from lightning.lite import LightningLite
-
-
-class LitePyTorchDistributed(L.LightningWork):
-    @staticmethod
-    def run():
-        # 1. Create LightningLite.
-        lite = LightningLite(strategy="ddp", precision="bf16")
-
-        # 2. Prepare distributed model and optimizer.
-        model = torch.nn.Linear(32, 2)
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-        model, optimizer = lite.setup(model, optimizer)
-        criterion = torch.nn.MSELoss()
-
-        # 3. Train the model for 50 steps.
-        for step in range(50):
-            model.zero_grad()
-            x = torch.randn(64, 32).to(lite.device)
-            output = model(x)
-            loss = criterion(output, torch.ones_like(output))
-            print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}")
-            lite.backward(loss)
-            optimizer.step()
-
-
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    LiteMultiNode(
-        LitePyTorchDistributed,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
-        num_nodes=2,
-    )
-)