
Commit 632837f

fix: profile import should not overwrite existing profile by default
This PR also simplifies the torchx run to use a smaller base image, one that we can pin (we had been using the torchx 0.5.0dev0 stream, which is highly unstable and was causing test stability issues).
1 parent a4eb426 commit 632837f
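As a rough illustration of the intended import semantics (hypothetical helper and names, not the actual madwizard or CodeFlare API): an imported profile should only be written when no profile of the same name exists, unless the caller explicitly opts in to overwriting.

# Hypothetical sketch only -- illustrative names, not the actual madwizard API.
import os
import shutil

def import_profile(src: str, profiles_dir: str, overwrite: bool = False) -> bool:
    """Copy a profile file into profiles_dir, refusing to clobber an existing one."""
    dst = os.path.join(profiles_dir, os.path.basename(src))
    if os.path.exists(dst) and not overwrite:
        return False  # keep the existing profile untouched
    shutil.copyfile(src, dst)
    return True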

File tree

5 files changed: +32 -47 lines


Diff for: package-lock.json

+17-17
Some generated files are not rendered by default.

Diff for: plugins/plugin-codeflare/package.json

+2-2
@@ -30,13 +30,13 @@
     "@types/split2": "^3.2.1"
   },
   "dependencies": {
-    "@guidebooks/store": "^6.1.4",
+    "@guidebooks/store": "^6.1.6",
     "@logdna/tail-file": "^3.0.1",
     "@patternfly/react-charts": "^6.94.18",
     "@patternfly/react-core": "^4.276.6",
     "asciinema-player": "^3.1.0",
     "chokidar": "^3.5.3",
-    "madwizard": "^6.3.1",
+    "madwizard": "^6.3.2",
     "needle": "^3.2.0",
     "open": "^8.4.2",
     "pretty-bytes": "^6.1.0",

Diff for: tests/kind/inputs/torchx/compute_world_size/main.py

+5-19
@@ -26,30 +26,16 @@
 to ensure that the stack has been setup properly for more serious distributed training jobs.
 """

-import hydra
-from omegaconf import DictConfig, OmegaConf
 from torch.distributed.elastic.multiprocessing.errors import record
 from module.util import compute_world_size


 @record
-def run(cfg: DictConfig) -> None:
-    print(OmegaConf.to_yaml(cfg))
-
-    if cfg.main.throws:
-        raise RuntimeError(f"raising error because cfg.main.throws={cfg.main.throws}")
-    compute_world_size(cfg)
+def run() -> None:
+    compute_world_size()


 if __name__ == "__main__":
-    # use compose API to make this compatible with ipython notebooks
-    # need to initialize the config directory as a module to make it
-    # not depends on rel path (PWD) or abs path (torchx install dir)
-    # see: https://hydra.cc/docs/advanced/jupyter_notebooks/
-    with hydra.initialize_config_module(
-        config_module="compute_world_size.config"
-    ):
-        cfg: DictConfig = hydra.compose(config_name="defaults")
-        run(cfg)
-
-    print("SUCCEEDED")  # @starpit 20230312 for testing
+    run()
+
+    print("SUCCEEDED")  # @starpit 20230312 for testing

Diff for: tests/kind/inputs/torchx/compute_world_size/module/util.py

+6-7
@@ -10,16 +10,15 @@
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
-from omegaconf import DictConfig


-def compute_world_size(cfg: DictConfig) -> int:
+def compute_world_size() -> int:

-    rank = int(os.getenv("RANK", cfg.main.rank))
-    world_size = int(os.getenv("WORLD_SIZE", cfg.main.world_size))
-    master_addr = os.getenv("MASTER_ADDR", cfg.main.master_addr)
-    master_port = int(os.getenv("MASTER_PORT", cfg.main.master_port))
-    backend = cfg.main.backend
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+    master_addr = os.getenv("MASTER_ADDR", "localhost")
+    master_port = int(os.getenv("MASTER_PORT", 29500))
+    backend = "gloo"

     print(f"initializing `{backend}` process group")
     dist.init_process_group(
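
For reference, a standalone sketch of the same environment-variable defaulting (a hypothetical demo script, not part of this commit): with nothing set in the environment it initializes a single-process "gloo" group, mirroring how the simplified util.py now behaves without Hydra.

# Standalone sketch, not part of this commit: exercises the same env-var
# defaults (RANK=0, WORLD_SIZE=1, localhost:29500, "gloo") used above.
import os
import torch.distributed as dist

rank = int(os.getenv("RANK", 0))
world_size = int(os.getenv("WORLD_SIZE", 1))
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")

dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)
print(f"world size = {dist.get_world_size()}")  # prints 1 when run with no env set
dist.destroy_process_group()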

Diff for: tests/kind/profiles/non-gpu6/mcad-default

+2-2
@@ -2,7 +2,7 @@
   "name": "mcad-default",
   "creationTime": 1660657756574,
   "lastModifiedTime": 1678638052528,
-  "lastUsedTime": 1678715380516,
+  "lastUsedTime": 1678980273687,
   "choices": {
     "madwizard/apriori/use-gpu": "don't use gpus",
     "madwizard/apriori/arch": "x64",
@@ -11,7 +11,7 @@
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
     "ml/codeflare/run": "Bring Your Own Torch Native Code",
-    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
+    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"bitnami/pytorch:1.13.1\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",
     "ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
