
Commit eb0d362

feat: torchx run support
BREAKING CHANGE: this pulls in `@guidebooks/store@6`, which changes the menu structure of ml/codeflare/run so as to introduce TorchX.
1 parent dbd7048 commit eb0d362

File tree: 22 files changed (+260, −28 lines)


Diff for: .github/workflows/kind.yml

+9-8
@@ -13,15 +13,16 @@ jobs:
     strategy:
       matrix:
         profile:
-          - non-gpu1/keep-it-simple
-          - non-gpu2/keep-it-simple
-          - non-gpu3/keep-it-simple
-          - non-gpu4/keep-it-simple
-          - non-gpu5/keep-it-simple
+          - non-gpu1/keep-it-simple # ray
+          - non-gpu2/keep-it-simple # ray
+          - non-gpu3/keep-it-simple # ray
+          - non-gpu4/keep-it-simple # ray
+          - non-gpu5/keep-it-simple # ray with dashdash args
+          - non-gpu6/mcad-default # torchx
           # - non-gpu1/ray-autoscaler
-          - non-gpu1/mcad-default
-          - non-gpu1/mcad-coscheduler
-          - non-gpu1/mcad-preinstalled
+          - non-gpu1/mcad-default # ray
+          - non-gpu1/mcad-coscheduler # ray
+          - non-gpu1/mcad-preinstalled # ray
         os: [ubuntu-latest]
         node-version: [16.x]

Diff for: package-lock.json

+9-9
Some generated files are not rendered by default.

Diff for: plugins/plugin-codeflare/package.json

+1-1
@@ -30,7 +30,7 @@
     "@types/split2": "^3.2.1"
   },
   "dependencies": {
-    "@guidebooks/store": "^5.6.2",
+    "@guidebooks/store": "^6.0.8",
     "@logdna/tail-file": "^3.0.1",
     "@patternfly/react-charts": "^6.94.18",
     "@patternfly/react-core": "^4.276.6",
+15
@@ -0,0 +1,15 @@
Compute World Size Example
############################

This is a minimal "hello world" style example application that uses
PyTorch Distributed to compute the world size. It is a minimal example
in that it initializes the ``torch.distributed`` process group and
performs a single collective operation (all_reduce), which is enough to
validate the infrastructure and scheduler setup.

This example is compatible with the ``dist.ddp`` built-in component. To run it from the CLI:

.. code-block:: shell-session

    $ cd $torchx-git-repo-root/torchx/examples/apps
    $ torchx run dist.ddp --script compute_world_size/main.py -j 1x2
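For orientation, here is a minimal standalone sketch of the same check with the Hydra plumbing stripped away (the file name check_world_size.py and the torchrun invocation are illustrative assumptions, not part of this commit); the full Hydra-driven version appears in main.py and module/util.py below.

# check_world_size.py -- illustrative sketch, not part of this commit
# launch with, e.g.:  torchrun --nproc_per_node=2 check_world_size.py
import torch
import torch.distributed as dist
import torch.nn.functional as F


def main() -> None:
    # torchrun (and torchx's dist.ddp) export RANK, WORLD_SIZE, MASTER_ADDR and
    # MASTER_PORT, so the default env:// rendezvous is sufficient here
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # each rank contributes a one-hot vector with a 1 in its own slot; summing
    # those vectors across ranks (all_reduce) yields a vector whose total is
    # the number of participating ranks, i.e. the world size
    t = F.one_hot(torch.tensor(rank), num_classes=world_size)
    dist.all_reduce(t)
    computed = int(torch.sum(t).item())
    assert computed == world_size, (computed, world_size)
    print(f"rank {rank}: computed world size = {computed}")


if __name__ == "__main__":
    main()

The one-hot/all_reduce trick is the same one used by module/util.py later in this diff; there, the rank and rendezvous parameters instead come from the Hydra config with environment-variable overrides.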

Diff for: tests/kind/inputs/torchx/compute_world_size/config/__init__.py

Whitespace-only changes.
@@ -0,0 +1,10 @@
hydra:
  run:
    dir: /tmp
main:
  backend: gloo
  rank: 0
  world_size: 1
  master_addr: localhost
  master_port: 29500
  throws: False

Diff for: tests/kind/inputs/torchx/compute_world_size/main.py

+55
@@ -0,0 +1,55 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import sys  # @starpit 20230312
print(sys.argv[1:])  # @starpit 20230312

"""
Compute World Size Example
============================

This is a minimal "hello world" style example application that uses
PyTorch Distributed to compute the world size. It does not do ML training
but it does initialize process groups and performs a single collective operation (all_reduce)
which is enough to validate the infrastructure and scheduler setup.

As simple as this application is, the actual ``compute_world_size()`` function is
split into a separate submodule (``.module.util.compute_world_size``) to double
as a E2E test for workspace patching logic, which typically diff-patches a full project
directory rather than a single file. This application also uses `Hydra <https://hydra.cc/docs/intro/>`_
configs as an expository example of how to use Hydra configs in an application that launches with TorchX.

Run it with the ``dist.ddp`` builtin component to use as a validation application
to ensure that the stack has been setup properly for more serious distributed training jobs.
"""

import hydra
from omegaconf import DictConfig, OmegaConf
from torch.distributed.elastic.multiprocessing.errors import record
from module.util import compute_world_size


@record
def run(cfg: DictConfig) -> None:
    print(OmegaConf.to_yaml(cfg))

    if cfg.main.throws:
        raise RuntimeError(f"raising error because cfg.main.throws={cfg.main.throws}")
    compute_world_size(cfg)


if __name__ == "__main__":
    # use compose API to make this compatible with ipython notebooks
    # need to initialize the config directory as a module to make it
    # not depends on rel path (PWD) or abs path (torchx install dir)
    # see: https://hydra.cc/docs/advanced/jupyter_notebooks/
    with hydra.initialize_config_module(
        config_module="compute_world_size.config"
    ):
        cfg: DictConfig = hydra.compose(config_name="defaults")
        run(cfg)

    print("SUCCEEDED")  # @starpit 20230312 for testing
@@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import random
import unittest

from omegaconf import DictConfig
from torchx.examples.apps.compute_world_size.module.util import compute_world_size


class UtilTest(unittest.TestCase):
    def test_compute_world_size(self) -> None:
        cfg = DictConfig(
            content={
                "main": {
                    "rank": 0,
                    "world_size": 1,
                    "master_addr": "localhost",
                    # ephemeral port range in linux
                    "master_port": random.randint(32768, 60999),
                    "backend": "gloo",
                }
            }
        )

        self.assertEqual(1, compute_world_size(cfg))
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import os

import torch
import torch.distributed as dist
import torch.nn.functional as F
from omegaconf import DictConfig


def compute_world_size(cfg: DictConfig) -> int:

    rank = int(os.getenv("RANK", cfg.main.rank))
    world_size = int(os.getenv("WORLD_SIZE", cfg.main.world_size))
    master_addr = os.getenv("MASTER_ADDR", cfg.main.master_addr)
    master_port = int(os.getenv("MASTER_PORT", cfg.main.master_port))
    backend = cfg.main.backend

    print(f"initializing `{backend}` process group")
    dist.init_process_group(
        backend=backend,
        init_method=f"tcp://{master_addr}:{master_port}",
        rank=rank,
        world_size=world_size,
    )
    print("successfully initialized process group")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    t = F.one_hot(torch.tensor(rank), num_classes=world_size)
    dist.all_reduce(t)
    computed_world_size = int(torch.sum(t).item())
    print(
        f"rank: {rank}, actual world_size: {world_size}, computed world_size: {computed_world_size}"
    )
    return computed_world_size

Diff for: tests/kind/profiles/non-gpu1/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/mcad-coscheduler

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/mcad-default

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/mcad-preinstalled

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/ray-autoscaler

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu2/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu3/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu4/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu5/keep-it-simple

+2-2
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit-with-dashdash\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 intentionally-not-main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",
@@ -22,4 +22,4 @@
     "ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
     "ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
   }
-}
+}

Diff for: tests/kind/profiles/non-gpu6/dashdash.txt

+1
@@ -0,0 +1 @@
aaaaaaaaa bbbbbbbbbbbbbb 'ccccccccc ccccccccccccccc' dddddddddd eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee 'ffffffffffff ffffff' gggggggggggggggggggg
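The fixture above appears to exercise quoted argument passing for the "dashdash args" tests; a minimal sketch, assuming shell-style tokenization is what the consuming test expects (that test code is not part of this diff):

# illustrative only: the single-quoted groups should survive word splitting
# as single arguments
import shlex

with open("tests/kind/profiles/non-gpu6/dashdash.txt") as f:
    args = shlex.split(f.read())

print(len(args))  # 7 tokens
print(args[2])    # "ccccccccc ccccccccccccccc" (the quotes keep the spaces inside one token)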

Diff for: tests/kind/profiles/non-gpu6/keep-it-simple

+25
@@ -0,0 +1,25 @@
{
  "name": "keep-it-simple",
  "creationTime": 1664149787016,
  "lastModifiedTime": 1676828268772,
  "lastUsedTime": 1678637996635,
  "choices": {
    "madwizard/apriori/use-gpu": "don't use gpus",
    "madwizard/apriori/arch": "x64",
    "madwizard/apriori/platform": "darwin",
    "madwizard/apriori/mac-installer": "Homebrew",
    "madwizard/apriori/in-terminal": "HTML",
    "ml/codeflare": "Submit a new Run",
    "ml/codeflare/run": "Bring Your Own Torch Native Code",
    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
    "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
    "s3/choose/bucket/maybe": "My data is not stored in S3",
    "ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
    "kubernetes/context": "kind-codeflare-test",
    "kubernetes/choose/ns": "default",
    "ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
    "ml/ray/cluster/choose": "codeflare-test-ray-cluster",
    "ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
    "ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
  }
}

Diff for: tests/kind/profiles/non-gpu6/mcad-coscheduler

+27
@@ -0,0 +1,27 @@
{
  "name": "mcad-coscheduler",
  "creationTime": 1660657756574,
  "lastModifiedTime": 1678638052528,
  "lastUsedTime": 1678638134621,
  "choices": {
    "madwizard/apriori/use-gpu": "don't use gpus",
    "madwizard/apriori/arch": "x64",
    "madwizard/apriori/platform": "darwin",
    "madwizard/apriori/mac-installer": "Homebrew",
    "madwizard/apriori/in-terminal": "HTML",
    "ml/codeflare": "Submit a new Run",
    "ml/codeflare/run": "Bring Your Own Torch Native Code",
    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
    "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
    "s3/choose/bucket/maybe": "My data is not stored in S3",
    "ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
    "kubernetes/context": "kind-codeflare-test",
    "kubernetes/choose/ns": "default",
    "ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
    "ml/ray/cluster/choose": "codeflare-test-ray-cluster",
    "ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
    "ml/ray/cluster/kubernetes/choose-pod-scheduler": "Use the Multi-user Enhanced Kubernetes Scheduler",
    "kubernetes/mcad/choose/job-priority": "Default Priority",
    "kubernetes/mcad/choose/scheduler": "MCAD with the Advanced Coscheduler"
  }
}
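Note that both the byoc form answer and the new ml/torchx/run/resources answer are JSON strings embedded inside the profile JSON, so a consumer has to decode them a second time. A minimal sketch (how the guidebook actually consumes these choices is not shown in this diff):

# illustrative only: read a saved test profile and decode the nested
# "ml/torchx/run/resources" answer, which is itself a JSON string
import json

with open("tests/kind/profiles/non-gpu6/mcad-coscheduler") as f:
    profile = json.load(f)

resources = json.loads(profile["choices"]["ml/torchx/run/resources"])
print(resources["Number of Workers"])  # "1"
print(resources["CPUs per worker"])    # "500m" (Kubernetes millicpu notation)
print(resources["Memory per worker"])  # "500Mi"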
