Skip to content

CLI Layout and Create RayCluster function #227

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 26, 2023
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ We use pre-commit to make sure the code is consistently formatted. To make sure
- To run the unit tests, run `pytest -v tests/unit_test.py`
- Any new test functions/scripts can be added into the `tests` folder
- NOTE: Functional tests coming soon, will live in `tests/func_test.py`
- To test the CLI, run `codeflare` followed by any command. To see the list of available commands, simply run `codeflare` with no arguments

#### Code Coverage

Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ kubernetes = ">= 25.3.0, < 27"
codeflare-torchx = "0.6.0.dev0"
cryptography = "40.0.2"
executing = "1.2.0"
click = "8.0.4"

[tool.poetry.group.docs]
optional = true
Expand All @@ -39,3 +40,10 @@ pdoc3 = "0.10.0"
pytest = "7.4.0"
coverage = "7.2.7"
pytest-mock = "3.11.1"

[tool.poetry.scripts]
codeflare = "codeflare_sdk.cli.codeflare_cli:cli"

[build-system]
requires = ["poetry_core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ codeflare-torchx==0.6.0.dev0
pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000
cryptography==40.0.2
executing==1.2.0
click==8.0.4
4 changes: 4 additions & 0 deletions src/codeflare_sdk.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@ src/codeflare_sdk/job/jobs.py
src/codeflare_sdk/utils/__init__.py
src/codeflare_sdk/utils/generate_yaml.py
src/codeflare_sdk/utils/pretty_print.py
src/codeflare_sdk/cli/__init__.py
src/codeflare_sdk/cli/codeflare_cli.py
src/codeflare_sdk/cli/commands/create.py
src/codeflare_sdk/cli/cli_utils.py
Empty file.
12 changes: 12 additions & 0 deletions src/codeflare_sdk/cli/cli_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import ast
import click


class PythonLiteralOption(click.Option):
    """Click option whose raw string value is parsed as a Python literal.

    This lets list- and dict-valued parameters be supplied on the command
    line, e.g. ``--machine_types='["cpu.small", "gpu.large"]'``.
    """

    def type_cast_value(self, ctx, value):
        """Convert the raw option value with ``ast.literal_eval``.

        Args:
            ctx: The active click context (unused here).
            value: The raw value received for the option.

        Returns:
            The evaluated Python literal, or ``None`` when ``value`` is falsy
            (option not supplied or empty).

        Raises:
            click.BadParameter: If ``value`` is not a valid Python literal.
        """
        if not value:
            return None
        try:
            return ast.literal_eval(value)
        # Catch only what literal_eval is documented to raise on bad input, so
        # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
        except (
            ValueError,
            TypeError,
            SyntaxError,
            MemoryError,
            RecursionError,
        ) as err:
            raise click.BadParameter(value) from err
36 changes: 36 additions & 0 deletions src/codeflare_sdk/cli/codeflare_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import click
import sys
import os

# Absolute path of the "commands" directory that sits next to this module;
# CodeflareCLI lists and loads its sub-command files from here.
cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands"))


class CodeflareCLI(click.MultiCommand):
    """Root multi-command that discovers sub-commands from ``cmd_folder``.

    Every ``*.py`` file in ``cmd_folder`` other than ``__init__.py`` is
    exposed as a sub-command named after the file (without the extension).
    Each such file must define a module-level ``cli`` object, which is what
    gets returned to click.
    """

    def list_commands(self, ctx):
        """Return the sorted names of the command files found in ``cmd_folder``."""
        return sorted(
            filename[:-3]
            for filename in os.listdir(cmd_folder)
            if filename.endswith(".py") and filename != "__init__.py"
        )

    def get_command(self, ctx, name):
        """Load the command file for ``name`` and return its ``cli`` object.

        Args:
            ctx: The active click context (unused here).
            name: Command name as typed by the user.

        Returns:
            The ``cli`` object defined by ``<cmd_folder>/<name>.py``, or
            ``None`` if no such file exists directly inside ``cmd_folder``.

        Raises:
            KeyError: If the command file does not define ``cli``.
        """
        fn = os.path.abspath(os.path.join(cmd_folder, name + ".py"))
        # Reject names containing path components (e.g. "../x") so only files
        # located directly in cmd_folder can ever be executed.
        if os.path.dirname(fn) != cmd_folder:
            return None
        try:
            # Python source is UTF-8 by default; do not depend on the locale.
            with open(fn, encoding="utf-8") as f:
                code = compile(f.read(), fn, "exec")
        except FileNotFoundError:
            return None
        # Run the module outside the try block so that a FileNotFoundError
        # raised by the command module itself is not misreported as
        # "command not found".
        ns = {}
        exec(code, ns, ns)
        return ns["cli"]


# Root `codeflare` command. Sub-command lookup is delegated to CodeflareCLI,
# so the callback itself has nothing to do. A docstring is deliberately not
# used here because click would surface it as help text.
@click.command(cls=CodeflareCLI)
@click.pass_context
def cli(ctx):
    return None


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    cli()
36 changes: 36 additions & 0 deletions src/codeflare_sdk/cli/commands/define.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import click

from codeflare_sdk.cluster.cluster import Cluster
from codeflare_sdk.cluster.config import ClusterConfiguration
from codeflare_sdk.cli.cli_utils import PythonLiteralOption


# Command group for `define`; resource-specific sub-commands are attached to
# it with @cli.command(). The docstring doubles as the group's help text.
@click.group()
def cli():
    """Define a resource with parameter specifications"""


@cli.command()
@click.option("--name", type=str, required=True)
@click.option("--namespace", "-n", type=str)
@click.option("--head_info", cls=PythonLiteralOption, type=list)
@click.option("--machine_types", cls=PythonLiteralOption, type=list)
@click.option("--min_cpus", type=int)
@click.option("--max_cpus", type=int)
@click.option("--min_worker", type=int)
@click.option("--max_worker", type=int)
@click.option("--min_memory", type=int)
@click.option("--max_memory", type=int)
@click.option("--gpu", type=int)
@click.option("--template", type=str)
@click.option("--instascale", type=bool)
@click.option("--envs", cls=PythonLiteralOption, type=dict)
@click.option("--image", type=str)
@click.option("--local_interactive", type=bool)
@click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list)
def raycluster(**kwargs):
    """Define a RayCluster with parameter specifications"""
    # Forward only the options that were actually supplied, so that
    # ClusterConfiguration falls back to its own defaults for the rest.
    supplied = {}
    for option_name, option_value in kwargs.items():
        if option_value is not None:
            supplied[option_name] = option_value

    cluster_config = ClusterConfiguration(**supplied)
    # Instantiating Cluster is what creates the yaml file.
    Cluster(cluster_config)
195 changes: 195 additions & 0 deletions tests/cli-test-case.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
labels:
orderedinstance: cpu.small_gpu.large
name: cli-test-cluster
namespace: ns
spec:
priority: 9
resources:
GenericItems:
- custompodresources:
- limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
replicas: 1
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
- limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
replicas: 2
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
generictemplate:
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
labels:
appwrapper.mcad.ibm.com: cli-test-cluster
controller-tools.k8s.io: '1.0'
name: cli-test-cluster
namespace: ns
spec:
autoscalerOptions:
idleTimeoutSeconds: 60
imagePullPolicy: Always
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 500m
memory: 512Mi
upscalingMode: Default
enableInTreeAutoscaling: false
headGroupSpec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
num-gpus: '0'
serviceType: ClusterIP
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cli-test-cluster
operator: In
values:
- cli-test-cluster
containers:
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: RAY_USE_TLS
value: '0'
- name: RAY_TLS_SERVER_CERT
value: /home/ray/workspace/tls/server.crt
- name: RAY_TLS_SERVER_KEY
value: /home/ray/workspace/tls/server.key
- name: RAY_TLS_CA_CERT
value: /home/ray/workspace/tls/ca.crt
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
imagePullPolicy: Always
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: ray-head
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
imagePullSecrets:
- name: cli-test-pull-secret
rayVersion: 2.1.0
workerGroupSpecs:
- groupName: small-group-cli-test-cluster
maxReplicas: 2
minReplicas: 2
rayStartParams:
block: 'true'
num-gpus: '7'
replicas: 2
template:
metadata:
annotations:
key: value
labels:
key: value
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cli-test-cluster
operator: In
values:
- cli-test-cluster
containers:
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: RAY_USE_TLS
value: '0'
- name: RAY_TLS_SERVER_CERT
value: /home/ray/workspace/tls/server.crt
- name: RAY_TLS_SERVER_KEY
value: /home/ray/workspace/tls/server.key
- name: RAY_TLS_CA_CERT
value: /home/ray/workspace/tls/ca.crt
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: machine-learning
resources:
limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
imagePullSecrets:
- name: cli-test-pull-secret
initContainers:
- command:
- sh
- -c
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
do echo waiting for myservice; sleep 2; done
image: busybox:1.28
name: init-myservice
replicas: 1
- generictemplate:
apiVersion: route.openshift.io/v1
kind: Route
metadata:
labels:
odh-ray-cluster-service: cli-test-cluster-head-svc
name: ray-dashboard-cli-test-cluster
namespace: ns
spec:
port:
targetPort: dashboard
to:
kind: Service
name: cli-test-cluster-head-svc
replica: 1
Items: []
34 changes: 34 additions & 0 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import filecmp
import os
import re
from click.testing import CliRunner

parent = Path(__file__).resolve().parents[1]
sys.path.append(str(parent) + "/src")
Expand Down Expand Up @@ -63,6 +64,7 @@
generate_tls_cert,
export_env,
)
from codeflare_sdk.cli.codeflare_cli import cli

import openshift
from openshift.selector import Selector
Expand All @@ -75,6 +77,37 @@
import yaml


# CLI testing
def test_cli_working():
    # Invoking the root command with no arguments must exit cleanly.
    outcome = CliRunner().invoke(cli)
    assert outcome.exit_code == 0


def test_cluster_definition_cli():
    # Runs `define raycluster` through the CLI and checks the generated
    # AppWrapper yaml against the stored reference file.
    runner = CliRunner()
    define_cluster_command = """
    define raycluster
    --name=cli-test-cluster
    --namespace=ns
    --min_worker=1
    --max_worker=2
    --min_cpus=3
    --max_cpus=4
    --min_memory=5
    --max_memory=6
    --gpu=7
    --instascale=True
    --machine_types='["cpu.small", "gpu.large"]'
    --image_pull_secrets='["cli-test-pull-secret"]'
    """
    result = runner.invoke(cli, define_cluster_command)
    # Surface the command output when the invocation itself failed.
    assert result.exit_code == 0, result.output
    assert result.output == "Written to: cli-test-cluster.yaml\n"
    # shallow=False forces a byte-for-byte comparison; a shallow compare may
    # return True from matching os.stat() signatures without reading the files.
    assert filecmp.cmp(
        "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=False
    )


# For mocking openshift client results
fake_res = openshift.Result("fake")

Expand Down Expand Up @@ -2221,3 +2254,4 @@ def test_cleanup():
os.remove("unit-test-default-cluster.yaml")
os.remove("test.yaml")
os.remove("raytest2.yaml")
os.remove("cli-test-cluster.yaml")