Commit b77cf5a

Add DeepLabV3 implementation (#149)

* Add DeepLabV3 implementation
* Add DeepLabV3 to README
* Add DeepLabV3 docstring

1 parent b74d91f commit b77cf5a
File tree

7 files changed: +231 -13 lines changed

README.md (+1)

```diff
@@ -68,6 +68,7 @@ preprocess_input = get_preprocessing_fn('resnet18', pretrained='imagenet')
 - [FPN](http://presentations.cocodataset.org/COCO17-Stuff-FAIR.pdf)
 - [PSPNet](https://arxiv.org/abs/1612.01105)
 - [PAN](https://arxiv.org/abs/1805.10180)
+- [DeepLabV3](https://arxiv.org/abs/1706.05587)
 
 
 #### Encoders <a name="encoders"></a>
```
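With this entry, DeepLabV3 sits alongside the other architectures exported from the package root. A minimal usage sketch (the parameter values are illustrative defaults taken from the docstring added in model.py below):

```python
import segmentation_models_pytorch as smp

# Build a DeepLabV3 model with a ResNet-34 encoder (the constructor's default)
# and ImageNet-pretrained encoder weights.
model = smp.DeepLabV3(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=2,           # number of output mask channels
    activation=None,     # return raw logits; "sigmoid"/"softmax2d" also accepted
)
```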

segmentation_models_pytorch/__init__.py (+1)

```diff
@@ -2,6 +2,7 @@
 from .linknet import Linknet
 from .fpn import FPN
 from .pspnet import PSPNet
+from .deeplabv3 import DeepLabV3
 from .pan import PAN
 
 from . import encoders
```
segmentation_models_pytorch/deeplabv3/__init__.py (new file, +1)

```python
from .model import DeepLabV3
```
segmentation_models_pytorch/deeplabv3/decoder.py (new file, +107)

```python
"""
BSD 3-Clause License

Copyright (c) Soumith Chintala 2016,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import torch
from torch import nn
from torch.nn import functional as F

__all__ = ["DeepLabV3Decoder"]


class DeepLabV3Decoder(nn.Sequential):
    def __init__(self, in_channels, out_channels=256, atrous_rates=(12, 24, 36)):
        super().__init__(
            ASPP(in_channels, out_channels, atrous_rates),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.out_channels = out_channels

    def forward(self, *features):
        return super().forward(features[-1])


class ASPPConv(nn.Sequential):
    def __init__(self, in_channels, out_channels, dilation):
        modules = [
            nn.Conv2d(in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU()
        ]
        super(ASPPConv, self).__init__(*modules)


class ASPPPooling(nn.Sequential):
    def __init__(self, in_channels, out_channels):
        super(ASPPPooling, self).__init__(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU())

    def forward(self, x):
        size = x.shape[-2:]
        for mod in self:
            x = mod(x)
        return F.interpolate(x, size=size, mode='bilinear', align_corners=False)


class ASPP(nn.Module):
    def __init__(self, in_channels, out_channels, atrous_rates):
        super(ASPP, self).__init__()
        modules = []
        modules.append(nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU()))

        rate1, rate2, rate3 = tuple(atrous_rates)
        modules.append(ASPPConv(in_channels, out_channels, rate1))
        modules.append(ASPPConv(in_channels, out_channels, rate2))
        modules.append(ASPPConv(in_channels, out_channels, rate3))
        modules.append(ASPPPooling(in_channels, out_channels))

        self.convs = nn.ModuleList(modules)

        self.project = nn.Sequential(
            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Dropout(0.5))

    def forward(self, x):
        res = []
        for conv in self.convs:
            res.append(conv(x))
        res = torch.cat(res, dim=1)
        return self.project(res)
```
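The decoder consumes only the deepest encoder feature map and preserves its spatial resolution: the five ASPP branches (one 1x1 conv, three dilated 3x3 convs, one global pooling branch) are concatenated to `5 * out_channels` channels, projected back to `out_channels`, and refined by a stride-1 3x3 conv. A quick shape check, as a sketch with an illustrative 512-channel input:

```python
import torch

# Hypothetical input: a batch of 2 feature maps with 512 channels at 16x16,
# e.g. the last stage of a dilated ResNet on a 128x128 image (output stride 8).
features = torch.rand(2, 512, 16, 16)

decoder = DeepLabV3Decoder(in_channels=512, out_channels=256)
out = decoder(features)   # forward(*features) uses only the last feature map
print(out.shape)          # -> torch.Size([2, 256, 16, 16]): channels reduced, spatial size kept
```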
segmentation_models_pytorch/deeplabv3/model.py (new file, +81)

```python
import torch.nn as nn

from typing import Optional
from .decoder import DeepLabV3Decoder
from ..base import SegmentationModel, SegmentationHead, ClassificationHead
from ..encoders import get_encoder


class DeepLabV3(SegmentationModel):
    """DeepLabV3_ implementation from "Rethinking Atrous Convolution for Semantic Image Segmentation"

    Args:
        encoder_name: name of classification model (without last dense layers) used as feature
            extractor to build segmentation model.
        encoder_depth: number of encoder stages used; a larger depth generates more features.
            E.g. for depth=3 the encoder generates a list of features with the following spatial
            shapes [(H, W), (H/2, W/2), (H/4, W/4), (H/8, W/8)], so in general the deepest feature
            has spatial resolution (H/(2^depth), W/(2^depth)).
        encoder_weights: one of ``None`` (random initialization) or ``imagenet`` (pre-training on ImageNet).
        decoder_channels: number of convolution filters in the ASPP module (default 256).
        in_channels: number of input channels for the model, default is 3.
        classes: number of output classes (output shape - ``(batch, classes, h, w)``).
        activation (str, callable): activation function used in the ``.predict(x)`` method for inference.
            One of [``sigmoid``, ``softmax2d``, callable, None].
        upsampling: optional, final upsampling factor
            (default is 8 to preserve input -> output spatial shape identity).
        aux_params: if specified, the model has an additional classification auxiliary output
            built on top of the encoder; supported params:
                - classes (int): number of classes
                - pooling (str): one of 'max', 'avg'. Default is 'avg'.
                - dropout (float): dropout factor in [0, 1)
                - activation (str): activation function to apply, "sigmoid"/"softmax"
                  (could be None to return logits)

    Returns:
        ``torch.nn.Module``: **DeepLabV3**

    .. _DeepLabV3:
        https://arxiv.org/abs/1706.05587
    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_channels: int = 256,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        upsampling: int = 8,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
        )
        self.encoder.make_dilated(
            stage_list=[4, 5],
            dilation_list=[2, 4]
        )

        self.decoder = DeepLabV3Decoder(
            in_channels=self.encoder.out_channels[-1],
            out_channels=decoder_channels,
        )

        self.segmentation_head = SegmentationHead(
            in_channels=self.decoder.out_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=1,
            upsampling=upsampling,
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None
```
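Because ``make_dilated`` replaces the striding of the last two encoder stages with dilation, the deepest feature map sits at stride 8, and the segmentation head's default ``upsampling=8`` restores the input resolution. An end-to-end sketch (the 2x3x128x128 sample matches the size the new test helper uses for this model):

```python
import torch
import segmentation_models_pytorch as smp

model = smp.DeepLabV3("resnet34", encoder_weights=None, classes=3,
                      aux_params=dict(classes=5))  # adds the auxiliary classifier head
model.eval()

x = torch.rand(2, 3, 128, 128)
with torch.no_grad():
    mask, label = model(x)   # with aux_params set, the model returns (mask, label)
print(mask.shape)            # -> torch.Size([2, 3, 128, 128]): input spatial shape preserved
print(label.shape)           # -> torch.Size([2, 5])
```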

segmentation_models_pytorch/encoders/_base.py (+2 -1)

```diff
@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 from typing import List
+from collections import OrderedDict
 
 from . import _utils as utils
 
@@ -12,7 +13,7 @@ class EncoderMixin:
     """
 
     @property
-    def out_channels(self) -> List:
+    def out_channels(self):
         """Return channels dimensions for each tensor of forward output of encoder"""
         return self._out_channels[: self._depth + 1]
```
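The ``out_channels`` property is what the new model reads via ``self.encoder.out_channels[-1]``: one channel count per encoder stage, truncated to ``depth + 1`` entries. A sketch of what it returns (the resnet34 channel list shown is illustrative):

```python
from segmentation_models_pytorch.encoders import get_encoder

encoder = get_encoder("resnet34", in_channels=3, depth=5, weights=None)
print(encoder.out_channels)      # e.g. (3, 64, 64, 128, 256, 512) - depth+1 entries
print(encoder.out_channels[-1])  # 512: the ASPP input channels for DeepLabV3
```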

tests/test_models.py (+38 -12)

```diff
@@ -27,18 +27,32 @@ def get_encoders():
 
 ENCODERS = get_encoders()
 DEFAULT_ENCODER = "resnet18"
-DEFAULT_SAMPLE = torch.ones([1, 3, 64, 64])
-DEFAULT_PAN_SAMPLE = torch.ones([2, 3, 256, 256])
 
 
-def _test_forward(model):
+def get_sample(model_class):
+    if model_class in [smp.Unet, smp.Linknet, smp.FPN, smp.PSPNet]:
+        sample = torch.ones([1, 3, 64, 64])
+    elif model_class == smp.PAN:
+        sample = torch.ones([2, 3, 256, 256])
+    elif model_class == smp.DeepLabV3:
+        sample = torch.ones([2, 3, 128, 128])
+    else:
+        raise ValueError("Not supported model class {}".format(model_class))
+    return sample
+
+
+def _test_forward(model, sample, test_shape=False):
     with torch.no_grad():
-        model(DEFAULT_SAMPLE)
+        out = model(sample)
+        if test_shape:
+            assert out.shape[2:] == sample.shape[2:]
 
 
-def _test_forward_backward(model, sample):
+def _test_forward_backward(model, sample, test_shape=False):
     out = model(sample)
     out.mean().backward()
+    if test_shape:
+        assert out.shape[2:] == sample.shape[2:]
 
 
 @pytest.mark.parametrize("encoder_name", ENCODERS)
@@ -50,12 +64,22 @@ def test_forward(model_class, encoder_name, encoder_depth, **kwargs):
     model = model_class(
         encoder_name, encoder_depth=encoder_depth, encoder_weights=None, **kwargs
     )
-    _test_forward(model)
+    sample = get_sample(model_class)
 
+    if encoder_depth == 5 and model_class != smp.PSPNet:
+        test_shape = True
+    else:
+        test_shape = False
 
-@pytest.mark.parametrize("model_class", [smp.PAN, smp.FPN, smp.PSPNet, smp.Linknet, smp.Unet])
+    _test_forward(model, sample, test_shape)
+
+
+@pytest.mark.parametrize(
+    "model_class",
+    [smp.PAN, smp.FPN, smp.PSPNet, smp.Linknet, smp.Unet, smp.DeepLabV3]
+)
 def test_forward_backward(model_class):
-    sample = DEFAULT_PAN_SAMPLE if model_class is smp.PAN else DEFAULT_SAMPLE
+    sample = get_sample(model_class)
     model = model_class(DEFAULT_ENCODER, encoder_weights=None)
     _test_forward_backward(model, sample)
 
@@ -65,8 +89,8 @@ def test_aux_output(model_class):
     model = model_class(
         DEFAULT_ENCODER, encoder_weights=None, aux_params=dict(classes=2)
     )
-    sample = DEFAULT_PAN_SAMPLE if model_class is smp.PAN else DEFAULT_SAMPLE
-    label_size = (2, 2) if model_class is smp.PAN else (1, 2)
+    sample = get_sample(model_class)
+    label_size = (sample.shape[0], 2)
     mask, label = model(sample)
     assert label.size() == label_size
 
@@ -76,7 +100,8 @@ def test_upsample(model_class, upsampling):
     default_upsampling = 4 if model_class is smp.FPN else 8
     model = model_class(DEFAULT_ENCODER, encoder_weights=None, upsampling=upsampling)
-    mask = model(DEFAULT_SAMPLE)
+    sample = get_sample(model_class)
+    mask = model(sample)
     assert mask.size()[-1] / 64 == upsampling / default_upsampling
 
@@ -106,7 +131,8 @@ def test_dilation(encoder_name):
 
     encoder.eval()
     with torch.no_grad():
-        output = encoder(DEFAULT_SAMPLE)
+        sample = torch.ones([1, 3, 64, 64])
+        output = encoder(sample)
 
     shapes = [out.shape[-1] for out in output]
     assert shapes == [64, 32, 16, 8, 4, 4]  # last downsampling replaced with dilation
```
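The new ``get_sample`` helper replaces the two hard-coded sample constants and centralizes per-architecture input sizes; DeepLabV3 gets a 2x3x128x128 batch, whose side is divisible by the model's output stride of 8. The ``test_shape`` flag then asserts the input -> output spatial identity promised by the default upsampling factor. A sketch of the round trip these tests exercise for the new model:

```python
import torch
import segmentation_models_pytorch as smp

model = smp.DeepLabV3("resnet18", encoder_weights=None)
sample = torch.ones([2, 3, 128, 128])     # the size get_sample() returns for DeepLabV3

out = model(sample)
assert out.shape[2:] == sample.shape[2:]  # spatial shape preserved (stride 8 * upsampling 8)
out.mean().backward()                     # same forward/backward pass as _test_forward_backward
```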
