diff --git a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb
index e46dc62a9..787bd315d 100644
--- a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb
+++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb
@@ -50,7 +50,8 @@
     "    max_cpus=1,\n",
     "    min_memory=4,\n",
     "    max_memory=4,\n",
-    "    num_gpus=0,\n",
+    "    num_gpus=1,\n",
+    "    head_gpus=1,\n",
     "    image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
     "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
     "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -114,7 +115,7 @@
    "source": [
     "# Submit an example mnist job using the Job Submission Client\n",
     "submission_id = client.submit_job(\n",
-    "    entrypoint=\"python mnist.py\",\n",
+    "    entrypoint=\"python mnist_fashion.py\",\n",
     "    runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
     ")\n",
     "print(submission_id)"
diff --git a/demo-notebooks/guided-demos/mnist_fashion.py b/demo-notebooks/guided-demos/mnist_fashion.py
new file mode 100644
index 000000000..2b75a8653
--- /dev/null
+++ b/demo-notebooks/guided-demos/mnist_fashion.py
@@ -0,0 +1,84 @@
+import torch
+import torch.nn as nn
+import ray
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+from ray.train.torch import TorchTrainer
+from ray.train import ScalingConfig
+
+
+def get_dataset():
+    return datasets.FashionMNIST(
+        root="/tmp/data",
+        train=True,
+        download=True,
+        transform=ToTensor(),
+    )
+
+
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28 * 28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10),
+        )
+
+    def forward(self, inputs):
+        inputs = self.flatten(inputs)
+        logits = self.linear_relu_stack(inputs)
+        return logits
+
+
+def get_dataset():
+    return datasets.FashionMNIST(
+        root="/tmp/data",
+        train=True,
+        download=True,
+        transform=ToTensor(),
+    )
+
+
+def train_func_distributed():
+    num_epochs = 3
+    batch_size = 64
+
+    dataset = get_dataset()
+    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+    dataloader = ray.train.torch.prepare_data_loader(dataloader)
+
+    model = NeuralNetwork()
+    model = ray.train.torch.prepare_model(model)
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+    for epoch in range(num_epochs):
+        if ray.train.get_context().get_world_size() > 1:
+            dataloader.sampler.set_epoch(epoch)
+
+        for inputs, labels in dataloader:
+            optimizer.zero_grad()
+            pred = model(inputs)
+            loss = criterion(pred, labels)
+            loss.backward()
+            optimizer.step()
+        print(f"epoch: {epoch}, loss: {loss.item()}")
+
+
+# For GPU Training, set `use_gpu` to True.
+use_gpu = True
+
+trainer = TorchTrainer(
+    train_func_distributed,
+    scaling_config=ScalingConfig(
+        num_workers=3, use_gpu=use_gpu
+    ),  # num_workers = number of worker nodes with the ray head node included
+)
+
+results = trainer.fit()
diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb
index e46dc62a9..787bd315d 100644
--- a/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb
+++ b/demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb
@@ -50,7 +50,8 @@
     "    max_cpus=1,\n",
     "    min_memory=4,\n",
     "    max_memory=4,\n",
-    "    num_gpus=0,\n",
+    "    num_gpus=1,\n",
+    "    head_gpus=1,\n",
     "    image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
     "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
     "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -114,7 +115,7 @@
    "source": [
     "# Submit an example mnist job using the Job Submission Client\n",
     "submission_id = client.submit_job(\n",
-    "    entrypoint=\"python mnist.py\",\n",
+    "    entrypoint=\"python mnist_fashion.py\",\n",
     "    runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
     ")\n",
     "print(submission_id)"
diff --git a/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb
index b20f920bd..db11cada0 100644
--- a/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb
+++ b/demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb
@@ -50,7 +50,8 @@
     "    max_cpus=1,\n",
     "    min_memory=4,\n",
     "    max_memory=4,\n",
-    "    num_gpus=0,\n",
+    "    num_gpus=1,\n",
+    "    head_gpus=1,\n",
     "    image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
     "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n",
     "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -114,7 +115,7 @@
    "source": [
     "# Submit an example mnist job using the Job Submission Client\n",
     "submission_id = client.submit_job(\n",
-    "    entrypoint=\"python mnist.py\",\n",
+    "    entrypoint=\"python mnist_fashion.py\",\n",
     "    runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
     ")\n",
     "print(submission_id)"
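For context, a minimal sketch of how the patched notebook cells fit together once the GPU parameters and the new entrypoint are in place. This assumes the Cluster/ClusterConfiguration and job_client API the notebook already uses; the cluster name, worker count, and the wait/status/log calls are illustrative additions, not part of this patch.

    # Sketch of the GPU-enabled notebook flow (names and worker count are assumed, not from this patch)
    from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration

    cluster = Cluster(ClusterConfiguration(
        name="jobtest",            # assumed cluster name
        num_workers=2,             # assumed worker count
        min_cpus=1,
        max_cpus=1,
        min_memory=4,
        max_memory=4,
        num_gpus=1,                # one GPU per worker (changed from 0 in this patch)
        head_gpus=1,               # one GPU on the head node (added in this patch)
        image="quay.io/project-codeflare/ray:latest-py39-cu118",
        write_to_file=False,
    ))
    cluster.up()
    cluster.wait_ready()

    # Submit the new Fashion-MNIST script via the Ray Job Submission Client
    client = cluster.job_client
    submission_id = client.submit_job(
        entrypoint="python mnist_fashion.py",
        runtime_env={"working_dir": "./", "pip": "requirements.txt"},
    )
    print(client.get_job_status(submission_id))
    print(client.get_job_logs(submission_id))

Note that mnist_fashion.py requests ScalingConfig(num_workers=3, use_gpu=True); per the comment in the script, this count includes the Ray head node, so with an assumed two workers plus the head each of the three training workers can claim one GPU, which is why both num_gpus=1 and head_gpus=1 are set in the cluster configuration.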