
Commit 3bfa77a

committed
add torch distribution
1 parent 6c4100c commit 3bfa77a

File tree: 7 files changed (+259 −4 lines changed)
examples/basic_tutorials/cifar10_cnn_torch_dist.py (+178)
@@ -0,0 +1,178 @@
+#! /usr/bin/python
+# -*- coding: utf-8 -*-
+
+import os
+# os.environ['TL_BACKEND'] = 'paddle'
+# os.environ['TL_BACKEND'] = 'tensorflow'
+# os.environ['TL_BACKEND'] = 'mindspore'
+os.environ['TL_BACKEND'] = 'torch'
+
+import time
+from tensorlayerx.dataflow import Dataset, DataLoader
+from tensorlayerx.vision.transforms import (
+    Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop
+)
+from tensorlayerx.model import TrainOneStep
+from tensorlayerx.nn import Module
+import tensorlayerx as tlx
+from tensorlayerx.nn import (Conv2d, Linear, Flatten, MaxPool2d, BatchNorm2d)
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--local_rank", type=int, default=-1,
+                    help="For distributed training: local_rank")
+args = parser.parse_args()
+# enable debug logging
+tlx.logging.set_verbosity(tlx.logging.DEBUG)
+
+tlx.ops.set_device(device='MLU', id=args.local_rank)
+tlx.ops.distributed_init(backend="cncl")
+# ################## Download and prepare the CIFAR10 dataset ##################
+# This is just some way of getting the CIFAR10 dataset from an online location
+# and loading it into numpy arrays with shape [32, 32, 3].
+X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False)
+
+# ################## CIFAR10 dataset ##################
+# We define a Dataset class for loading CIFAR10 images and labels.
+class make_dataset(Dataset):
+
+    def __init__(self, data, label, transforms):
+        self.data = data
+        self.label = label
+        self.transforms = transforms
+
+    def __getitem__(self, idx):
+        x = self.data[idx].astype('uint8')
+        y = self.label[idx].astype('int64')
+        x = self.transforms(x)
+
+        return x, y
+
+    def __len__(self):
+
+        return len(self.label)
+
+# We define the CIFAR10 image preprocessing pipeline.
+train_transforms = Compose(  # combine multiple operations sequentially
+    [
+        RandomCrop(size=[24, 24]),  # randomly crop each image to shape [24, 24]
+        RandomFlipHorizontal(),  # randomly flip each image horizontally
+        RandomBrightness(brightness_factor=(0.5, 1.5)),  # randomly adjust brightness within the range (0.5, 1.5)
+        RandomContrast(contrast_factor=(0.5, 1.5)),  # randomly adjust contrast within the range (0.5, 1.5)
+        StandardizePerImage()  # normalize the values of each image to [-1, 1]
+    ]
+)
+
+test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()])
+
+# We use DataLoader to batch and shuffle the data, and to turn it into an iterator.
+train_dataset = make_dataset(data=X_train, label=y_train, transforms=train_transforms)
+test_dataset = make_dataset(data=X_test, label=y_test, transforms=test_transforms)
+
+train_dataset = DataLoader(train_dataset, batch_size=128, shuffle=True)
+test_dataset = DataLoader(test_dataset, batch_size=128)
+
+# ################## CNN network ##################
+class CNN(Module):
+
+    def __init__(self):
+        super(CNN, self).__init__()
+        # Parameter initialization method
+        W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2)
+        W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04)
+        b_init2 = tlx.nn.initializers.constant(value=0.1)
+
+        # 2D convolution: padding 'SAME', kernel size [5, 5], stride [1, 1], 3 in channels, 64 out channels
+        self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3)
+        # 2D batch normalization, with ReLU on the output.
+        self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU)
+        # 2D max-pooling layer.
+        self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1')
+
+        self.conv2 = Conv2d(
+            64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, name='conv2', in_channels=64
+        )
+        self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2')
+        # Flatten 2D data to 1D data
+        self.flatten = Flatten(name='flatten')
+        # Linear layer with 384 units, with ReLU on the output.
+        self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304)
+        self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384)
+        self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192)
+
+    # We define the forward computation process.
+    def forward(self, x):
+        z = self.conv1(x)
+        z = self.bn(z)
+        z = self.maxpool1(z)
+        z = self.conv2(z)
+        z = self.maxpool2(z)
+        z = self.flatten(z)
+        z = self.linear1(z)
+        z = self.linear2(z)
+        z = self.linear3(z)
+        return z
+
+
+# get the network
+net = CNN()
+
+# training settings
+n_epoch = 500
+learning_rate = 0.0001
+print_freq = 5
+n_step_epoch = int(len(y_train) / 128)
+n_step = n_epoch * n_step_epoch
+shuffle_buffer_size = 128
+# Get the trainable parameters
+train_weights = net.trainable_weights
+# Define the optimizer: here we use Adam.
+optimizer = tlx.optimizers.Adam(learning_rate)
+# Define the evaluation metric.
+metrics = tlx.metrics.Accuracy()
+
+# Define the loss calculation process
+class WithLoss(Module):
+
+    def __init__(self, net, loss_fn):
+        super(WithLoss, self).__init__()
+        self._net = net
+        self._loss_fn = loss_fn
+
+    def forward(self, data, label):
+        out = self._net(data)
+        loss = self._loss_fn(out, label)
+        return loss
+
+
+net_with_loss = WithLoss(net.mlu(), loss_fn=tlx.losses.softmax_cross_entropy_with_logits).mlu()
+model = tlx.ops.distributed_model(net_with_loss, device_ids=[args.local_rank],
+                                  output_device=args.local_rank,
+                                  find_unused_parameters=True)
+# Initialize one-step training
+#net_with_train = TrainOneStep(net_with_loss, optimizer, train_weights)
+net_with_train = TrainOneStep(model, optimizer, train_weights)
+
+# Custom training loop
+for epoch in range(n_epoch):
+    start_time = time.time()
+    # Set the network to training mode
+    net.set_train()
+    train_loss, train_acc, n_iter = 0, 0, 0
+    # Get training data and labels
+    for X_batch, y_batch in train_dataset:
+        # Compute the loss; the gradient update is applied automatically
+        _loss_ce = net_with_train(X_batch.mlu(), y_batch.mlu())
+        train_loss += _loss_ce
+
+        n_iter += 1
+        _logits = net(X_batch.mlu())
+        # Calculate accuracy
+        metrics.update(_logits, y_batch.mlu())
+        train_acc += metrics.result()
+        metrics.reset()
+        if (n_iter % 100 == 0):
+            print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time))
+            print("rank {} train loss: {}".format(args.local_rank, train_loss / n_iter))
+            print("rank {} train acc: {}".format(args.local_rank, train_acc / n_iter))
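
Note that the example builds test_dataset but never evaluates on it. A held-out evaluation pass in the same style, a minimal sketch (not part of this commit) assuming the same MLU setup and that tlx Modules expose set_eval(), could look like:

# Sketch only: evaluate the trained network on the held-out set.
net.set_eval()
test_acc, n_test = 0, 0
for X_batch, y_batch in test_dataset:
    _logits = net(X_batch.mlu())            # forward pass on the MLU device
    metrics.update(_logits, y_batch.mlu())
    test_acc += metrics.result()
    metrics.reset()
    n_test += 1
print("rank {} test acc: {}".format(args.local_rank, test_acc / n_test))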

examples/basic_tutorials/test_dist.sh (+2)

@@ -0,0 +1,2 @@
+export MLU_VISIBLE_DEVICES=0,1
+python -m tensorlayerx.distributed.launch --nproc_per_node=2 cifar10_cnn_torch_dist.py

tensorlayerx/backend/ops/__init__.py (+2)

@@ -207,6 +207,8 @@
 from .load_backend import eye
 from .load_backend import einsum
 from .load_backend import set_device
+from .load_backend import distributed_init
+from .load_backend import distributed_model
 from .load_backend import get_device
 from .load_backend import scatter_update
 from .load_backend import to_device
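
These re-exports make the new helpers reachable from the top-level ops namespace. A minimal sketch, assuming the torch backend is selected via TL_BACKEND:

# Sketch: the two new symbols resolve through tlx.ops once the torch backend loads.
import os
os.environ['TL_BACKEND'] = 'torch'

import tensorlayerx as tlx

init_fn = tlx.ops.distributed_init    # wraps torch.distributed.init_process_group
wrap_fn = tlx.ops.distributed_model   # wraps torch.nn.parallel.DistributedDataParallel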

tensorlayerx/backend/ops/load_backend.py (+4)

@@ -75,6 +75,10 @@
     from .torch_nn import *
     from .torch_backend import *
     import torch
+    try:
+        import torch_mlu
+    except:
+        pass
     BACKEND_VERSION = torch.__version__
     sys.stderr.write('Using PyTorch backend.\n')
 elif BACKEND == 'oneflow':
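
The try/except keeps the torch backend importable on machines without Cambricon's torch_mlu package. A quick availability probe, a sketch assuming an MLU build of PyTorch, would be:

import torch

# Sketch: torch_mlu registers the 'mlu' device type with torch when present.
try:
    import torch_mlu
    has_mlu = torch.mlu.is_available()
except ImportError:
    has_mlu = False

print('MLU available:', has_mlu)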

tensorlayerx/backend/ops/torch_backend.py (+38 −4)

@@ -72,6 +72,8 @@ def zeros(shape, dtype=None, device = None):
         device = torch.device('cpu')
     elif device == 'gpu':
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+    elif device == 'mlu':
+        device = torch.device('mlu:0' if torch.mlu.is_available() else 'cpu')
     return torch.zeros(size=shape, dtype=dtype, device = device)

@@ -95,6 +97,8 @@ def ones(shape, dtype=None, device = None):
         device = torch.device('cpu')
     elif device == 'gpu':
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+    elif device == 'mlu':
+        device = torch.device('mlu:0' if torch.mlu.is_available() else 'cpu')
     return torch.ones(size=shape, dtype=dtype, device = device)

@@ -120,6 +124,8 @@ def constant(value, dtype=None, shape=None, device =None):
         device = torch.device('cpu')
     elif device == 'gpu':
         device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+    elif device == 'mlu':
+        device = torch.device('mlu:0' if torch.mlu.is_available() else 'cpu')
     w = torch.empty(size=shape, dtype=dtype, device = device)
     return torch.nn.init.constant_(w, value)

@@ -1687,6 +1693,10 @@ def set_seed(seed):

     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
+    try:
+        torch.mlu.manual_seed_all(seed)
+    except:
+        pass
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True

@@ -1745,6 +1755,23 @@ def set_device(device = 'GPU', id = 0):
     if device == 'GPU':
         torch.set_default_tensor_type('torch.cuda.FloatTensor')
         torch.cuda.set_device(id)
+    if device == 'MLU':
+        torch.set_default_tensor_type('torch.mlu.FloatTensor')
+        torch.mlu.set_device(id)
+
+def distributed_init(backend="cncl"):
+    torch.distributed.init_process_group(backend=backend)
+
+def distributed_model(module, device_ids=None, output_device=None,
+                      dim=0, broadcast_buffers=True, process_group=None, bucket_cap_mb=25,
+                      find_unused_parameters=False, check_reduction=False, gradient_as_bucket_view=False):
+    return torch.nn.parallel.DistributedDataParallel(module, device_ids=device_ids,
+                                                     output_device=output_device,
+                                                     dim=dim, broadcast_buffers=broadcast_buffers,
+                                                     process_group=process_group, bucket_cap_mb=bucket_cap_mb,
+                                                     find_unused_parameters=find_unused_parameters,
+                                                     check_reduction=check_reduction,
+                                                     gradient_as_bucket_view=gradient_as_bucket_view)

 def scatter_update(tensor, indices, updates):
     tensor = torch.tensor(tensor)

@@ -1756,16 +1783,23 @@ def scatter_update(tensor, indices, updates):
 def get_device():
     try:
         id = torch.cuda.current_device()
-        device = 'GPU:' + str(id)
-        return device
+        device = 'GPU:' + str(id)
     except:
         device = 'CPU'
-        return device
+
+    try:
+        id = torch.mlu.current_device()
+        device = 'MLU:' + str(id)
+    except:
+        device = 'CPU'
+    return device

-def to_device(tensor, device='GPU', id=0):
+def to_device(tensor, device='MLU', id=0):
     device = device.lower()
     if device == 'gpu':
         device = 'cuda' + ':' + str(id)
+    if device == 'mlu':
+        device = 'mlu' + ':' + str(id)
     tensor = tensor.detach().to(device)
     return tensor
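
Taken together, these additions mirror the call sequence used in the new example script. A condensed sketch, assuming a two-process cncl launch where the launcher passes --local_rank, and that net_with_loss, optimizer, and train_weights are built as in the example:

import argparse
import tensorlayerx as tlx
from tensorlayerx.model import TrainOneStep

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()

# Bind this process to one MLU and join the cncl process group.
tlx.ops.set_device(device='MLU', id=args.local_rank)
tlx.ops.distributed_init(backend='cncl')

# Wrap the loss-bearing module in DistributedDataParallel, then train one step at a time.
model = tlx.ops.distributed_model(net_with_loss, device_ids=[args.local_rank],
                                  output_device=args.local_rank,
                                  find_unused_parameters=True)
trainer = TrainOneStep(model, optimizer, train_weights)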

tensorlayerx/distributed/__init__.py (+1)

@@ -0,0 +1 @@
+from .launch import *

tensorlayerx/distributed/launch.py (+34)

@@ -0,0 +1,34 @@
+import os
+BACKEND = 'torch'
+
+
+# Set backend based on TL_BACKEND.
+if 'TL_BACKEND' in os.environ:
+    backend = os.environ['TL_BACKEND']
+    if backend:
+        BACKEND = backend
+
+
+def main(args=None):
+    if BACKEND == 'torch':
+        from torch.distributed.run import get_args_parser, run
+        def parse_args(args):
+            parser = get_args_parser()
+            parser.add_argument(
+                "--use_env",
+                default=False,
+                action="store_true",
+                help="Use environment variable to pass "
+                "'local rank'. For legacy reasons, the default value is False. "
+                "If set to True, the script will not pass "
+                "--local_rank as argument, and will instead set LOCAL_RANK.",
+            )
+            return parser.parse_args(args)
+        args = parse_args(args)
+        run(args)
+    else:
+        raise NotImplementedError("This backend:{} is not supported".format(BACKEND))
+
+
+if __name__ == "__main__":
+    main()
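
The launcher defers to torch.distributed.run, so it is invoked exactly like torch's own launcher (see test_dist.sh above). Since it keeps torch's legacy --local_rank behaviour, and --use_env switches to the LOCAL_RANK environment variable instead, a launched script can accept its rank either way. A small sketch, not part of this commit:

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()

# Fall back to LOCAL_RANK when the launcher was started with --use_env.
local_rank = args.local_rank if args.local_rank >= 0 else int(os.environ.get("LOCAL_RANK", "0"))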
