Skip to content

Added mnist_fashion as default job demo #529

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions demo-notebooks/guided-demos/1_cluster_job_client.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
" num_gpus=0,\n",
" num_gpus=1,\n",
" head_gpus=1,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
Expand Down Expand Up @@ -114,7 +115,7 @@
"source": [
"# Submit an example mnist job using the Job Submission Client\n",
"submission_id = client.submit_job(\n",
" entrypoint=\"python mnist.py\",\n",
" entrypoint=\"python mnist_fashion.py\",\n",
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
")\n",
"print(submission_id)"
Expand Down
84 changes: 84 additions & 0 deletions demo-notebooks/guided-demos/mnist_fashion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import torch
import torch.nn as nn
import ray
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig


def get_dataset():
    """Build the FashionMNIST training dataset used by this demo.

    The dataset is rooted at ``/tmp/data``, is requested with
    ``train=True`` and ``download=True``, and applies ``ToTensor()`` as
    its transform.
    """
    dataset_options = {
        "root": "/tmp/data",
        "train": True,
        "download": True,
        "transform": ToTensor(),
    }
    return datasets.FashionMNIST(**dataset_options)


class NeuralNetwork(nn.Module):
    """Fully connected network mapping 28 * 28 flattened inputs to 10 outputs.

    The stack is Linear -> ReLU -> Linear -> ReLU -> Linear with two
    512-unit hidden layers.
    """

    def __init__(self):
        super().__init__()
        in_features = 28 * 28
        hidden_units = 512
        out_features = 10

        self.flatten = nn.Flatten()
        # Layer positions inside the Sequential determine the parameter
        # names (``linear_relu_stack.0``, ``.2``, ``.4``), so the order
        # below must not change.
        stack_layers = [
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, out_features),
        ]
        self.linear_relu_stack = nn.Sequential(*stack_layers)

    def forward(self, inputs):
        """Flatten ``inputs`` and return the output of the linear stack."""
        return self.linear_relu_stack(self.flatten(inputs))


# NOTE(review): a get_dataset with the same body is also defined earlier in
# this file; being the later definition, this is the one that takes effect.
def get_dataset():
    """Return the FashionMNIST training dataset stored under ``/tmp/data``.

    Constructed with ``train=True``, ``download=True`` and a ``ToTensor()``
    transform.
    """
    to_tensor = ToTensor()
    training_data = datasets.FashionMNIST(
        root="/tmp/data",
        train=True,
        download=True,
        transform=to_tensor,
    )
    return training_data


def train_func_distributed():
    """Train ``NeuralNetwork`` on the dataset from ``get_dataset()``.

    Runs 3 epochs over batches of 64 using SGD (lr=0.01) with a
    cross-entropy loss. The data loader and model are passed through
    ``ray.train.torch.prepare_data_loader`` / ``prepare_model`` before
    use. After each epoch the loss of the last processed batch is printed.
    """
    num_epochs, batch_size = 3, 64

    loader = ray.train.torch.prepare_data_loader(
        DataLoader(get_dataset(), batch_size=batch_size, shuffle=True)
    )
    net = ray.train.torch.prepare_model(NeuralNetwork())

    loss_fn = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        # Only when more than one worker takes part is the sampler told
        # which epoch is starting.
        world_size = ray.train.get_context().get_world_size()
        if world_size > 1:
            loader.sampler.set_epoch(epoch)

        for inputs, labels in loader:
            sgd.zero_grad()
            loss = loss_fn(net(inputs), labels)
            loss.backward()
            sgd.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")


# Set `use_gpu` to True to train on GPUs.
use_gpu = True

# num_workers = number of worker nodes with the ray head node included
trainer = TorchTrainer(
    train_loop_per_worker=train_func_distributed,
    scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=3),
)

# Launch the training run and keep whatever `fit()` returns.
results = trainer.fit()
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
" num_gpus=0,\n",
" num_gpus=1,\n",
" head_gpus=1,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
Expand Down Expand Up @@ -114,7 +115,7 @@
"source": [
"# Submit an example mnist job using the Job Submission Client\n",
"submission_id = client.submit_job(\n",
" entrypoint=\"python mnist.py\",\n",
" entrypoint=\"python mnist_fashion.py\",\n",
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
")\n",
"print(submission_id)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
" max_cpus=1,\n",
" min_memory=4,\n",
" max_memory=4,\n",
" num_gpus=0,\n",
" num_gpus=1,\n",
" head_gpus=1,\n",
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n",
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
Expand Down Expand Up @@ -114,7 +115,7 @@
"source": [
"# Submit an example mnist job using the Job Submission Client\n",
"submission_id = client.submit_job(\n",
" entrypoint=\"python mnist.py\",\n",
" entrypoint=\"python mnist_fashion.py\",\n",
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
")\n",
"print(submission_id)"
Expand Down
Loading