Skip to content

Commit 0409902

Browse files
ahmadkinv-kkudrynski
authored and committed
[SSD/PyT] New release with 22.10 base image
1 parent 35feabc commit 0409902

24 files changed

+167
-127
lines changed

PyTorch/Detection/SSD/Dockerfile

+7-13
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,14 @@
1-
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.10-py3
22
FROM ${FROM_IMAGE_NAME}
33

44
# Set working directory
55
WORKDIR /workspace/ssd
66

7-
# Install nv-cocoapi
8-
ENV COCOAPI_VERSION=2.0+nv0.6.0
9-
RUN export COCOAPI_TAG=$(echo ${COCOAPI_VERSION} | sed 's/^.*+n//') \
10-
&& pip install --no-cache-dir pybind11 \
11-
&& pip install --no-cache-dir git+https://github.com/NVIDIA/cocoapi.git@${COCOAPI_TAG}#subdirectory=PythonAPI
12-
# Install dllogger
13-
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
7+
# Copy the model files
8+
COPY . .
149

15-
# Install requirements
16-
COPY requirements.txt .
17-
RUN pip install -r requirements.txt
18-
RUN python3 -m pip install pycocotools==2.0.0
10+
# Install python requirements
11+
RUN pip install --no-cache-dir -r requirements.txt
1912

20-
COPY . .
13+
ENV CUDNN_V8_API_ENABLED=1
14+
ENV TORCH_CUDNN_V8_API_ENABLED=1

PyTorch/Detection/SSD/README.md

+91-69
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 1 GPU using 256 batch size
22
# Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --warmup 300 --bs 256 --amp --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --warmup 300 --bs 256 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU)
22
# Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 256 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 256 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 8 GPUs using 1024 batch size (128 per GPU)
22
# Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU)
22
# Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --no-amp --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 1 GPU using 64 batch size
22
# Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 4 GPUs using 256 batch size (64 per GPU)
22
# Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 8 GPUs using 512 batch size (64 per GPU)
22
# Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script evaluates SSD300 model in FP16 using 32 batch size on 1 GPU
22
# Usage: ./SSD300_FP16_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --amp --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
4+
python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 inference benchmark in FP16 on 1 GPU with 64 batch size
22
# Usage bash SSD300_FP16_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --mode benchmark-inference --bs 64 --amp --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --mode benchmark-inference --bs 64 --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP32 on 1 GPU using 32 batch size
22
# Usage ./SSD300_FP32_1GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --bs 32 --warmup 300 --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --bs 32 --warmup 300 --no-amp --data-layout channels_first --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP32 on 4 GPUs using 128 batch size (32 per GPU)
22
# Usage ./SSD300_FP32_4GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}
4+
torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --no-amp --data-layout channels_first --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP32 on 8 GPUs using 256 batch size (32 per GPU)
22
# Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --no-amp --data-layout channels_first --data $2 ${@:3}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script evaluates SSD300 model in FP32 using 32 batch size on 1 GPU
22
# Usage: ./SSD300_FP32_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
4+
python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --no-amp --data-layout channels_first --checkpoint $3 ${@:4}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 inference benchmark in FP32 on 1 GPU with 64 batch size
22
# Usage bash SSD300_FP32_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --warmup 300 --mode benchmark-inference --bs 32 --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --warmup 300 --mode benchmark-inference --bs 32 --no-amp --data-layout channels_first --data $2 ${@:3}

PyTorch/Detection/SSD/examples/SSD300_inference.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def load_checkpoint(model, model_file):
2828

2929

3030
def build_predictor(model_file, backbone='resnet50'):
31-
ssd300 = SSD300(backbone=ResNet(backbone))
31+
ssd300 = SSD300(backbone=ResNet(backbone=backbone))
3232
load_checkpoint(ssd300, model_file)
3333

3434
return ssd300

PyTorch/Detection/SSD/main.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ def make_parser():
6767
help='manually set random seed for torch')
6868
parser.add_argument('--checkpoint', type=str, default=None,
6969
help='path to model checkpoint file')
70+
parser.add_argument('--torchvision-weights-version', type=str, default="IMAGENET1K_V2",
71+
choices=['IMAGENET1K_V1', 'IMAGENET1K_V2', 'DEFAULT'],
72+
help='The torchvision weights version to use when --checkpoint is not specified')
7073
parser.add_argument('--save', type=str, default=None,
7174
help='save model checkpoints in the specified directory')
7275
parser.add_argument('--mode', type=str, default='training',
@@ -97,9 +100,19 @@ def make_parser():
97100
' backbone model declared with the --backbone argument.'
98101
' When it is not provided, pretrained model from torchvision'
99102
' will be downloaded.')
100-
parser.add_argument('--num-workers', type=int, default=4)
101-
parser.add_argument('--amp', action='store_true',
102-
help='Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUS.')
103+
parser.add_argument('--num-workers', type=int, default=8)
104+
parser.add_argument("--amp", dest='amp', action="store_true",
105+
help="Enable Automatic Mixed Precision (AMP).")
106+
parser.add_argument("--no-amp", dest='amp', action="store_false",
107+
help="Disable Automatic Mixed Precision (AMP).")
108+
parser.set_defaults(amp=True)
109+
parser.add_argument("--allow-tf32", dest='allow_tf32', action="store_true",
110+
help="Allow TF32 computations on supported GPUs.")
111+
parser.add_argument("--no-allow-tf32", dest='allow_tf32', action="store_false",
112+
help="Disable TF32 computations.")
113+
parser.set_defaults(allow_tf32=True)
114+
parser.add_argument('--data-layout', default="channels_last", choices=['channels_first', 'channels_last'],
115+
help="Model data layout. It's recommended to use channels_first with --no-amp")
103116
parser.add_argument('--log-interval', type=int, default=20,
104117
help='Logging interval.')
105118
parser.add_argument('--json-summary', type=str, default=None,
@@ -150,7 +163,9 @@ def train(train_loop_func, logger, args):
150163
val_dataset = get_val_dataset(args)
151164
val_dataloader = get_val_dataloader(val_dataset, args)
152165

153-
ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
166+
ssd300 = SSD300(backbone=ResNet(backbone=args.backbone,
167+
backbone_path=args.backbone_path,
168+
weights=args.torchvision_weights_version))
154169
args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
155170
start_epoch = 0
156171
iteration = 0
@@ -223,6 +238,7 @@ def train(train_loop_func, logger, args):
223238
obj['model'] = ssd300.module.state_dict()
224239
else:
225240
obj['model'] = ssd300.state_dict()
241+
os.makedirs(args.save, exist_ok=True)
226242
save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
227243
torch.save(obj, save_path)
228244
logger.log('model path', save_path)
@@ -261,6 +277,8 @@ def log_params(logger, args):
261277
if args.local_rank == 0:
262278
os.makedirs('./models', exist_ok=True)
263279

280+
torch.backends.cuda.matmul.allow_tf32 = args.allow_tf32
281+
torch.backends.cudnn.allow_tf32 = args.allow_tf32
264282
torch.backends.cudnn.benchmark = True
265283

266284
# write json only on the main thread
+6-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1-
Cython>=0.28.4
2-
scikit-image>=0.15.0
3-
ujson>=4.0.2
1+
Cython>=0.29.32
2+
scikit-image>=0.19.3
3+
ujson>=5.5.0
4+
pybind11>=2.10.0
5+
git+https://github.com/NVIDIA/cocoapi.git@v0.6.0#subdirectory=PythonAPI
6+
git+https://github.com/NVIDIA/dllogger.git#egg=dllogger

PyTorch/Detection/SSD/ssd/coco_pipeline.py

+12-11
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# DALI imports
2222
import nvidia.dali as dali
2323
from nvidia.dali.pipeline import Pipeline
24+
from nvidia.dali.types import to_numpy_type
2425

2526

2627
class COCOPipeline(Pipeline):
@@ -124,14 +125,14 @@ def define_graph(self):
124125
return (images, bboxes.gpu(), labels.gpu())
125126

126127
to_torch_type = {
127-
np.dtype(np.float32) : torch.float32,
128-
np.dtype(np.float64) : torch.float64,
129-
np.dtype(np.float16) : torch.float16,
130-
np.dtype(np.uint8) : torch.uint8,
131-
np.dtype(np.int8) : torch.int8,
132-
np.dtype(np.int16) : torch.int16,
133-
np.dtype(np.int32) : torch.int32,
134-
np.dtype(np.int64) : torch.int64
128+
np.float32 : torch.float32,
129+
np.float64 : torch.float64,
130+
np.float16 : torch.float16,
131+
np.uint8 : torch.uint8,
132+
np.int8 : torch.int8,
133+
np.int16 : torch.int16,
134+
np.int32 : torch.int32,
135+
np.int64 : torch.int64
135136
}
136137

137138
def feed_ndarray(dali_tensor, arr):
@@ -242,9 +243,9 @@ def __next__(self):
242243
labels_shape[j].append(lshape)
243244

244245
# We always need to allocate new memory as bboxes and labels vary in shape
245-
images_torch_type = to_torch_type[np.dtype(images[0].dtype())]
246-
bboxes_torch_type = to_torch_type[np.dtype(bboxes[0][0].dtype())]
247-
labels_torch_type = to_torch_type[np.dtype(labels[0][0].dtype())]
246+
images_torch_type = to_torch_type[to_numpy_type(images[0].dtype)]
247+
bboxes_torch_type = to_torch_type[to_numpy_type(bboxes[0][0].dtype)]
248+
labels_torch_type = to_torch_type[to_numpy_type(labels[0][0].dtype)]
248249

249250
torch_gpu_device = torch.device('cuda', dev_id)
250251
torch_cpu_device = torch.device('cpu')

PyTorch/Detection/SSD/ssd/evaluate.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,8 @@ def evaluate(model, coco, cocoGt, encoder, inv_map, args):
5252

5353
try:
5454
result = encoder.decode_batch(ploc_i, plabel_i, 0.50, 200)[0]
55-
except:
56-
# raise
57-
print("")
58-
print("No object detected in idx: {}".format(idx))
55+
except Exception as e:
56+
print("Skipping idx {}, failed to decode with message {}, Skipping.".format(idx, e))
5957
continue
6058

6159
htot, wtot = img_size[0][idx].item(), img_size[1][idx].item()

PyTorch/Detection/SSD/ssd/model.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,22 @@
1818

1919

2020
class ResNet(nn.Module):
21-
def __init__(self, backbone='resnet50', backbone_path=None):
21+
def __init__(self, backbone='resnet50', backbone_path=None, weights="IMAGENET1K_V1"):
2222
super().__init__()
2323
if backbone == 'resnet18':
24-
backbone = resnet18(pretrained=not backbone_path)
24+
backbone = resnet18(weights=None if backbone_path else weights)
2525
self.out_channels = [256, 512, 512, 256, 256, 128]
2626
elif backbone == 'resnet34':
27-
backbone = resnet34(pretrained=not backbone_path)
27+
backbone = resnet34(weights=None if backbone_path else weights)
2828
self.out_channels = [256, 512, 512, 256, 256, 256]
2929
elif backbone == 'resnet50':
30-
backbone = resnet50(pretrained=not backbone_path)
30+
backbone = resnet50(weights=None if backbone_path else weights)
3131
self.out_channels = [1024, 512, 512, 256, 256, 256]
3232
elif backbone == 'resnet101':
33-
backbone = resnet101(pretrained=not backbone_path)
33+
backbone = resnet101(weights=None if backbone_path else weights)
3434
self.out_channels = [1024, 512, 512, 256, 256, 256]
3535
else: # backbone == 'resnet152':
36-
backbone = resnet152(pretrained=not backbone_path)
36+
backbone = resnet152(weights=None if backbone_path else weights)
3737
self.out_channels = [1024, 512, 512, 256, 256, 256]
3838
if backbone_path:
3939
backbone.load_state_dict(torch.load(backbone_path))
@@ -108,7 +108,7 @@ def _init_weights(self):
108108
def bbox_view(self, src, loc, conf):
109109
ret = []
110110
for s, l, c in zip(src, loc, conf):
111-
ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1)))
111+
ret.append((l(s).reshape(s.size(0), 4, -1), c(s).reshape(s.size(0), self.label_num, -1)))
112112

113113
locs, confs = list(zip(*ret))
114114
locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()

PyTorch/Detection/SSD/ssd/train.py

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ def train_loop(model, loss_func, scaler, epoch, optim, train_dataloader, val_dat
4444
label = label.view(N, M)
4545

4646
with torch.cuda.amp.autocast(enabled=args.amp):
47+
if args.data_layout == 'channels_last':
48+
img = img.to(memory_format=torch.channels_last)
4749
ploc, plabel = model(img)
4850

4951
ploc, plabel = ploc.float(), plabel.float()
@@ -101,6 +103,8 @@ def benchmark_train_loop(model, loss_func, scaler, epoch, optim, train_dataloade
101103
label = label.view(N, M)
102104

103105
with torch.cuda.amp.autocast(enabled=args.amp):
106+
if args.data_layout == 'channels_last':
107+
img = img.to(memory_format=torch.channels_last)
104108
ploc, plabel = model(img)
105109

106110
ploc, plabel = ploc.float(), plabel.float()

PyTorch/Detection/SSD/ssd/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200)
217217

218218

219219
_, max_ids = scores_out.sort(dim=0)
220-
max_ids = max_ids[-max_output:]
220+
max_ids = max_ids[-max_output:].to("cpu")
221221
return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
222222

223223

0 commit comments

Comments
 (0)