
Commit eb0d362

feat: torchx run support
BREAKING CHANGE: this pulls in `@guidebooks/store@6`, which changes the menu structure of ml/codeflare/run so as to introduce TorchX.
1 parent dbd7048 commit eb0d362

File tree: 22 files changed (+260, −28 lines)


Diff for: .github/workflows/kind.yml

+9-8
@@ -13,15 +13,16 @@ jobs:
     strategy:
       matrix:
         profile:
-          - non-gpu1/keep-it-simple
-          - non-gpu2/keep-it-simple
-          - non-gpu3/keep-it-simple
-          - non-gpu4/keep-it-simple
-          - non-gpu5/keep-it-simple
+          - non-gpu1/keep-it-simple # ray
+          - non-gpu2/keep-it-simple # ray
+          - non-gpu3/keep-it-simple # ray
+          - non-gpu4/keep-it-simple # ray
+          - non-gpu5/keep-it-simple # ray with dashdash args
+          - non-gpu6/mcad-default # torchx
           # - non-gpu1/ray-autoscaler
-          - non-gpu1/mcad-default
-          - non-gpu1/mcad-coscheduler
-          - non-gpu1/mcad-preinstalled
+          - non-gpu1/mcad-default # ray
+          - non-gpu1/mcad-coscheduler # ray
+          - non-gpu1/mcad-preinstalled # ray
         os: [ubuntu-latest]
         node-version: [16.x]

Diff for: package-lock.json

+9-9
Some generated files are not rendered by default.

Diff for: plugins/plugin-codeflare/package.json

+1-1
@@ -30,7 +30,7 @@
     "@types/split2": "^3.2.1"
   },
   "dependencies": {
-    "@guidebooks/store": "^5.6.2",
+    "@guidebooks/store": "^6.0.8",
     "@logdna/tail-file": "^3.0.1",
     "@patternfly/react-charts": "^6.94.18",
     "@patternfly/react-core": "^4.276.6",
+15
@@ -0,0 +1,15 @@
Compute World Size Example
############################

This is a minimal "hello world" style example application that uses
PyTorch Distributed to compute the world size. It is a minimal example
in that it initializes the ``torch.distributed`` process group and
performs a single collective operation (all_reduce), which is enough to
validate the infrastructure and scheduler setup.

This example is compatible with the ``dist.ddp`` built-in component. To run it from the CLI:

.. code-block:: shell-session

    $ cd $torchx-git-repo-root/torchx/examples/apps
    $ torchx run dist.ddp --script compute_world_size/main.py -j 1x2
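For orientation, here is a minimal standalone sketch of the same check with the Hydra plumbing stripped away (the file name check_world_size.py and the torchrun invocation are illustrative assumptions, not part of this commit); the full Hydra-driven version appears in main.py and module/util.py below.

# check_world_size.py -- illustrative sketch, not part of this commit
# launch with, e.g.:  torchrun --nproc_per_node=2 check_world_size.py
import torch
import torch.distributed as dist
import torch.nn.functional as F


def main() -> None:
    # torchrun (and torchx's dist.ddp) export RANK, WORLD_SIZE, MASTER_ADDR and
    # MASTER_PORT, so the default env:// rendezvous is sufficient here
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # each rank contributes a one-hot vector with a 1 in its own slot; summing
    # those vectors across ranks (all_reduce) yields a vector whose total is
    # the number of participating ranks, i.e. the world size
    t = F.one_hot(torch.tensor(rank), num_classes=world_size)
    dist.all_reduce(t)
    computed = int(torch.sum(t).item())
    assert computed == world_size, (computed, world_size)
    print(f"rank {rank}: computed world size = {computed}")


if __name__ == "__main__":
    main()

The one-hot/all_reduce trick is the same one used by module/util.py later in this diff; there, the rank and rendezvous parameters instead come from the Hydra config with environment-variable overrides.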

Diff for: tests/kind/inputs/torchx/compute_world_size/config/__init__.py

Whitespace-only changes.
@@ -0,0 +1,10 @@
hydra:
  run:
    dir: /tmp
main:
  backend: gloo
  rank: 0
  world_size: 1
  master_addr: localhost
  master_port: 29500
  throws: False

Diff for: tests/kind/inputs/torchx/compute_world_size/main.py

+55
@@ -0,0 +1,55 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import sys  # @starpit 20230312
print(sys.argv[1:])  # @starpit 20230312

"""
Compute World Size Example
============================

This is a minimal "hello world" style example application that uses
PyTorch Distributed to compute the world size. It does not do ML training
but it does initialize process groups and performs a single collective operation (all_reduce)
which is enough to validate the infrastructure and scheduler setup.

As simple as this application is, the actual ``compute_world_size()`` function is
split into a separate submodule (``.module.util.compute_world_size``) to double
as a E2E test for workspace patching logic, which typically diff-patches a full project
directory rather than a single file. This application also uses `Hydra <https://hydra.cc/docs/intro/>`_
configs as an expository example of how to use Hydra configs in an application that launches with TorchX.

Run it with the ``dist.ddp`` builtin component to use as a validation application
to ensure that the stack has been setup properly for more serious distributed training jobs.
"""

import hydra
from omegaconf import DictConfig, OmegaConf
from torch.distributed.elastic.multiprocessing.errors import record
from module.util import compute_world_size


@record
def run(cfg: DictConfig) -> None:
    print(OmegaConf.to_yaml(cfg))

    if cfg.main.throws:
        raise RuntimeError(f"raising error because cfg.main.throws={cfg.main.throws}")
    compute_world_size(cfg)


if __name__ == "__main__":
    # use compose API to make this compatible with ipython notebooks
    # need to initialize the config directory as a module to make it
    # not depends on rel path (PWD) or abs path (torchx install dir)
    # see: https://hydra.cc/docs/advanced/jupyter_notebooks/
    with hydra.initialize_config_module(
        config_module="compute_world_size.config"
    ):
        cfg: DictConfig = hydra.compose(config_name="defaults")
        run(cfg)

    print("SUCCEEDED")  # @starpit 20230312 for testing
@@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import random
import unittest

from omegaconf import DictConfig
from torchx.examples.apps.compute_world_size.module.util import compute_world_size


class UtilTest(unittest.TestCase):
    def test_compute_world_size(self) -> None:
        cfg = DictConfig(
            content={
                "main": {
                    "rank": 0,
                    "world_size": 1,
                    "master_addr": "localhost",
                    # ephemeral port range in linux
                    "master_port": random.randint(32768, 60999),
                    "backend": "gloo",
                }
            }
        )

        self.assertEqual(1, compute_world_size(cfg))
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import os

import torch
import torch.distributed as dist
import torch.nn.functional as F
from omegaconf import DictConfig


def compute_world_size(cfg: DictConfig) -> int:

    rank = int(os.getenv("RANK", cfg.main.rank))
    world_size = int(os.getenv("WORLD_SIZE", cfg.main.world_size))
    master_addr = os.getenv("MASTER_ADDR", cfg.main.master_addr)
    master_port = int(os.getenv("MASTER_PORT", cfg.main.master_port))
    backend = cfg.main.backend

    print(f"initializing `{backend}` process group")
    dist.init_process_group(
        backend=backend,
        init_method=f"tcp://{master_addr}:{master_port}",
        rank=rank,
        world_size=world_size,
    )
    print("successfully initialized process group")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    t = F.one_hot(torch.tensor(rank), num_classes=world_size)
    dist.all_reduce(t)
    computed_world_size = int(torch.sum(t).item())
    print(
        f"rank: {rank}, actual world_size: {world_size}, computed world_size: {computed_world_size}"
    )
    return computed_world_size

Diff for: tests/kind/profiles/non-gpu1/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/mcad-coscheduler

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/mcad-default

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/mcad-preinstalled

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu1/ray-autoscaler

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu2/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu3/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu4/keep-it-simple

+1-1
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",

Diff for: tests/kind/profiles/non-gpu5/keep-it-simple

+2-2
@@ -10,7 +10,7 @@
     "madwizard/apriori/mac-installer": "Homebrew",
     "madwizard/apriori/in-terminal": "HTML",
     "ml/codeflare": "Submit a new Run",
-    "ml/codeflare/run": "Bring Your Own Code",
+    "ml/codeflare/run": "Bring Your Own Ray Code",
     "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit-with-dashdash\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 intentionally-not-main.py\"}",
     "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
     "s3/choose/bucket/maybe": "My data is not stored in S3",
@@ -22,4 +22,4 @@
     "ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
     "ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
   }
-}
+}

Diff for: tests/kind/profiles/non-gpu6/dashdash.txt

+1
@@ -0,0 +1 @@
aaaaaaaaa bbbbbbbbbbbbbb 'ccccccccc ccccccccccccccc' dddddddddd eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee 'ffffffffffff ffffff' gggggggggggggggggggg
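The fixture above appears to exercise quoted argument passing for the "dashdash args" tests; a minimal sketch, assuming shell-style tokenization is what the consuming test expects (that test code is not part of this diff):

# illustrative only: the single-quoted groups should survive word splitting
# as single arguments
import shlex

with open("tests/kind/profiles/non-gpu6/dashdash.txt") as f:
    args = shlex.split(f.read())

print(len(args))  # 7 tokens
print(args[2])    # "ccccccccc ccccccccccccccc" (the quotes keep the spaces inside one token)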

Diff for: tests/kind/profiles/non-gpu6/keep-it-simple

+25
@@ -0,0 +1,25 @@
{
  "name": "keep-it-simple",
  "creationTime": 1664149787016,
  "lastModifiedTime": 1676828268772,
  "lastUsedTime": 1678637996635,
  "choices": {
    "madwizard/apriori/use-gpu": "don't use gpus",
    "madwizard/apriori/arch": "x64",
    "madwizard/apriori/platform": "darwin",
    "madwizard/apriori/mac-installer": "Homebrew",
    "madwizard/apriori/in-terminal": "HTML",
    "ml/codeflare": "Submit a new Run",
    "ml/codeflare/run": "Bring Your Own Torch Native Code",
    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
    "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
    "s3/choose/bucket/maybe": "My data is not stored in S3",
    "ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
    "kubernetes/context": "kind-codeflare-test",
    "kubernetes/choose/ns": "default",
    "ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
    "ml/ray/cluster/choose": "codeflare-test-ray-cluster",
    "ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
    "ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
  }
}

Diff for: tests/kind/profiles/non-gpu6/mcad-coscheduler

+27
@@ -0,0 +1,27 @@
{
  "name": "mcad-coscheduler",
  "creationTime": 1660657756574,
  "lastModifiedTime": 1678638052528,
  "lastUsedTime": 1678638134621,
  "choices": {
    "madwizard/apriori/use-gpu": "don't use gpus",
    "madwizard/apriori/arch": "x64",
    "madwizard/apriori/platform": "darwin",
    "madwizard/apriori/mac-installer": "Homebrew",
    "madwizard/apriori/in-terminal": "HTML",
    "ml/codeflare": "Submit a new Run",
    "ml/codeflare/run": "Bring Your Own Torch Native Code",
    "ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
    "kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
    "s3/choose/bucket/maybe": "My data is not stored in S3",
    "ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
    "kubernetes/context": "kind-codeflare-test",
    "kubernetes/choose/ns": "default",
    "ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
    "ml/ray/cluster/choose": "codeflare-test-ray-cluster",
    "ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
    "ml/ray/cluster/kubernetes/choose-pod-scheduler": "Use the Multi-user Enhanced Kubernetes Scheduler",
    "kubernetes/mcad/choose/job-priority": "Default Priority",
    "kubernetes/mcad/choose/scheduler": "MCAD with the Advanced Coscheduler"
  }
}
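Note that both the byoc form answer and the new ml/torchx/run/resources answer are JSON strings embedded inside the profile JSON, so a consumer has to decode them a second time. A minimal sketch (how the guidebook actually consumes these choices is not shown in this diff):

# illustrative only: read a saved test profile and decode the nested
# "ml/torchx/run/resources" answer, which is itself a JSON string
import json

with open("tests/kind/profiles/non-gpu6/mcad-coscheduler") as f:
    profile = json.load(f)

resources = json.loads(profile["choices"]["ml/torchx/run/resources"])
print(resources["Number of Workers"])  # "1"
print(resources["CPUs per worker"])    # "500m" (Kubernetes millicpu notation)
print(resources["Memory per worker"])  # "500Mi"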
