
Commit 8306797

[App] Update multi-node examples (#15700)

Authored by ethanwharris, with Borda and carmocca
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent 08d14ec commit 8306797

File tree: 7 files changed, +36 −49 lines

.github/workflows/ci-app-tests.yml
Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ on:
     - ".github/workflows/ci-app-tests.yml"
     - "src/lightning_app/**"
     - "tests/tests_app/**"
-    - "examples/app_*" # some tests_app tests call examples files
+    - "examples/app_*/**" # some tests_app tests call examples files
     - "requirements/app/**"
     - "setup.py"
     - ".actions/**"

docs/source-app/levels/basic/hello_components/pl_multinode.py
Lines changed: 1 addition & 2 deletions

@@ -5,8 +5,7 @@
 
 
 class LightningTrainerDistributed(L.LightningWork):
-    @staticmethod
-    def run():
+    def run(self):
        model = BoringModel()
        trainer = L.Trainer(max_epochs=10, strategy="ddp")
        trainer.fit(model)
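`LightningWork.run` is invoked by the framework as an instance method, which is why `@staticmethod` is dropped here. For reference, a sketch of the full example after this change; the import lines and the multi-node wiring are assumptions based on the sibling examples/app_multi_node/train_lt.py file, not shown in this hunk:

# pl_multinode.py (sketch; surrounding lines assumed)
import lightning as L
from lightning.app.components import LightningTrainerMultiNode
from lightning.pytorch.demos.boring_classes import BoringModel


class LightningTrainerDistributed(L.LightningWork):
    def run(self):
        model = BoringModel()
        trainer = L.Trainer(max_epochs=10, strategy="ddp")
        trainer.fit(model)


# 2 nodes of 4 x V100 each
component = LightningTrainerMultiNode(
    LightningTrainerDistributed,
    num_nodes=2,
    cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
)
app = L.LightningApp(component)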

docs/source-app/levels/basic/hello_components/pt_multinode.py
Lines changed: 1 addition & 2 deletions

@@ -22,8 +22,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
     # 2. PREPARE DISTRIBUTED MODEL
     model = torch.nn.Linear(32, 2)
     device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    device_ids = device if torch.cuda.is_available() else None
-    model = DistributedDataParallel(model, device_ids=device_ids).to(device)
+    model = DistributedDataParallel(model, device_ids=[local_rank]).to(device)
 
     # 3. SETUP LOSS AND OPTIMIZER
     criterion = torch.nn.MSELoss()
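As context for the `device_ids` change: `DistributedDataParallel` expects `device_ids` to be a one-element list naming this process's GPU, and `None` for CPU modules, so passing a `torch.device` object (the old code) was wrong. A minimal sketch of the conventional wrap; the helper name is hypothetical and not part of this commit:

import torch
from torch.nn.parallel.distributed import DistributedDataParallel

def wrap_model(model: torch.nn.Module, local_rank: int) -> DistributedDataParallel:
    # Move the parameters to this process's device first, then wrap.
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{local_rank}")
        return DistributedDataParallel(model.to(device), device_ids=[local_rank])
    # CPU training: device_ids must stay None.
    return DistributedDataParallel(model)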
Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 .. lit_tabs::
    :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo
    :code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py
-   :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24
+   :highlights: 7; 10, 11; 9-11, 16, 17; 4, 8, 12, 18-19, 26; 5, 10, 22, 27, 31, 41, 57-59; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24
    :enable_run: true
    :tab_rows: 3
    :height: 620px
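The `:highlights:` renumbering here (and in the next file) follows directly from the example edits above: pl_multinode.py and pt_multinode.py each lost one line, so every highlighted line after the deletion moves up by one.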

docs/source-app/levels/basic/real_lightning_component_implementations.rst
Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ or cloud GPUs without code changes.
 .. lit_tabs::
    :descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it works on CPU or single GPUs also); Select the number of machines (nodes). Here we choose 2.; Choose from over 15+ machine types. This one has 4 v100 GPUs.; Initialize the App object that executes the component logic.
    :code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py;
-   :highlights: 2; 4; 10-12; 15-18; 17; 18; 20
+   :highlights: 2; 4; 9-11; 14-17; 16; 17; 19
    :enable_run: true
    :tab_rows: 5
    :height: 420px

examples/app_multi_node/train_lt.py
Lines changed: 8 additions & 11 deletions

@@ -1,3 +1,4 @@
+# app.py
 import lightning as L
 from lightning.app.components import LightningTrainerMultiNode
 from lightning.pytorch.demos.boring_classes import BoringModel
@@ -6,18 +7,14 @@
 class LightningTrainerDistributed(L.LightningWork):
     def run(self):
        model = BoringModel()
-        trainer = L.Trainer(
-            max_steps=1000,
-            strategy="ddp",
-        )
+        trainer = L.Trainer(max_epochs=10, strategy="ddp")
        trainer.fit(model)
 
 
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    LightningTrainerMultiNode(
-        LightningTrainerDistributed,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100
-    )
+# 8 GPU: (2 nodes of 4 x v100)
+component = LightningTrainerMultiNode(
+    LightningTrainerDistributed,
+    num_nodes=4,
+    cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100
 )
+app = L.LightningApp(component)
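The GPU totals quoted in these comments are simply num_nodes times the GPUs per machine; "gpu-fast-multi" provides 4 x V100, so 2 nodes give the 8 GPUs the comment names, while num_nodes=4 as written gives 16. A tiny illustration of that arithmetic; the helper is hypothetical:

# total GPUs = number of nodes * GPUs per machine
def total_gpus(num_nodes: int, gpus_per_node: int = 4) -> int:
    return num_nodes * gpus_per_node

assert total_gpus(2) == 8   # "8 GPU" comment corresponds to num_nodes=2
assert total_gpus(4) == 16  # num_nodes=4 as written gives 16 GPUs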
Lines changed: 23 additions & 31 deletions

@@ -1,3 +1,5 @@
+# app.py
+# ! pip install torch
 import torch
 from torch.nn.parallel.distributed import DistributedDataParallel
 
@@ -6,7 +8,7 @@
 
 
 def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
-    # 1. Setting distributed environment
+    # 1. SET UP DISTRIBUTED ENVIRONMENT
     global_rank = local_rank + node_rank * nprocs
     world_size = num_nodes * nprocs
 
@@ -18,52 +20,42 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
        init_method=f"tcp://{main_address}:{main_port}",
     )
 
-    # 2. Prepare the model
-    model = torch.nn.Sequential(
-        torch.nn.Linear(1, 1),
-        torch.nn.ReLU(),
-        torch.nn.Linear(1, 1),
-    )
-
-    # 3. Setup distributed training
+    # 2. PREPARE DISTRIBUTED MODEL
+    model = torch.nn.Linear(32, 2)
     device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None)
+    model = DistributedDataParallel(model, device_ids=[local_rank]).to(device)
 
-    # 4. Prepare loss and optimizer
+    # 3. SETUP LOSS AND OPTIMIZER
     criterion = torch.nn.MSELoss()
     optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
 
-    # 5. Train the model for 1000 steps.
-    for step in range(1000):
+    # 4. TRAIN THE MODEL FOR 50 STEPS
+    for step in range(50):
        model.zero_grad()
-        x = torch.tensor([0.8]).to(device)
-        target = torch.tensor([1.0]).to(device)
+        x = torch.randn(64, 32).to(device)
        output = model(x)
-        loss = criterion(output, target)
+        loss = criterion(output, torch.ones_like(output))
        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
        loss.backward()
        optimizer.step()
 
+    # 5. VERIFY ALL COPIES OF THE MODEL HAVE THE SAME WEIGHTS AT END OF TRAINING
+    weight = model.module.weight.clone()
+    torch.distributed.all_reduce(weight)
+    assert torch.equal(model.module.weight, weight / world_size)
+
+    print("Multi Node Distributed Training Done!")
+
 
 class PyTorchDistributed(L.LightningWork):
-    def run(
-        self,
-        main_address: str,
-        main_port: int,
-        num_nodes: int,
-        node_rank: int,
-    ):
+    def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int):
        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
        torch.multiprocessing.spawn(
            distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
        )
 
 
-# Run over 2 nodes of 4 x V100
-app = L.LightningApp(
-    MultiNode(
-        PyTorchDistributed,
-        num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100
-    )
-)
+# 32 GPUs: (8 nodes x 4 v 100)
+compute = L.CloudCompute("gpu-fast-multi") # 4xV100
+component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
+app = L.LightningApp(component)
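The new verification step relies on DDP keeping every replica's weights identical after synchronized gradient updates: an all_reduce SUM over world_size identical tensors returns weight * world_size, so dividing by world_size recovers the original weight. A minimal single-process sketch of that identity; no process group is required and the names are illustrative:

import torch

world_size = 8
weight = torch.randn(2, 32)  # stand-in for one replica's model.module.weight
# What all_reduce(SUM) would return if every rank held this same tensor:
reduced = sum(weight.clone() for _ in range(world_size))
assert torch.allclose(reduced / world_size, weight)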
