Added bifpn encoder (rishabh qubvel-org#1)

rishabh-16 · rishabh-16 · commit 13c231f41b0d · 2019-12-31T17:18:50.000+05:30
diff --git a/segmentation_models_pytorch/__init__.py b/segmentation_models_pytorch/__init__.py
@@ -1,6 +1,8 @@
 from .unet import Unet
 from .linknet import Linknet
 from .fpn import FPN
+from .bifpn import BiFPN
+
 from .pspnet import PSPNet
 from .pan import PAN
 
diff --git a/segmentation_models_pytorch/bifpn/__init__.py b/segmentation_models_pytorch/bifpn/__init__.py
@@ -0,0 +1 @@
+from .model import BiFPN
diff --git a/segmentation_models_pytorch/bifpn/decoder.py b/segmentation_models_pytorch/bifpn/decoder.py
@@ -0,0 +1,206 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+class Conv3x3GNReLU(nn.Module):
+    """3x3 conv (no bias) -> GroupNorm(32 groups) -> ReLU, optionally followed by 2x bilinear upsampling."""
+
+    def __init__(self, in_channels, out_channels, upsample=False):
+        super().__init__()
+        self.upsample = upsample
+        conv = nn.Conv2d(in_channels, out_channels, (3, 3), stride=1, padding=1, bias=False)
+        # GroupNorm is built with 32 groups, so out_channels has to be divisible by 32.
+        norm = nn.GroupNorm(32, out_channels)
+        self.block = nn.Sequential(conv, norm, nn.ReLU(inplace=True))
+
+    def forward(self, x):
+        out = self.block(x)
+        if not self.upsample:
+            return out
+        # Double the spatial resolution after the conv block.
+        return F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=True)
+
+class DepthwiseConvBlock(nn.Module):
+    """
+    Depthwise separable convolution: a per-channel (grouped) conv, then a 1x1
+    pointwise conv, BatchNorm and ReLU. `freeze_bn` is accepted but not used.
+    With the default kernel_size=1 the depthwise step is a per-channel scaling.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, freeze_bn=False):
+        super().__init__()
+        # groups == in_channels: every input channel is filtered independently.
+        self.depthwise = nn.Conv2d(
+            in_channels, in_channels, kernel_size=kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=in_channels, bias=False,
+        )
+        # 1x1 conv mixes channels and sets the output width.
+        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
+        # NOTE(review): PyTorch's momentum weights the *new* batch statistic; confirm 0.9997 is intended.
+        self.bn = nn.BatchNorm2d(out_channels, momentum=0.9997, eps=4e-5)
+        self.act = nn.ReLU()
+
+    def forward(self, inputs):
+        features = self.pointwise(self.depthwise(inputs))
+        return self.act(self.bn(features))
+
+
+class SegmentationBlock(nn.Module):
+    """Stack of Conv3x3GNReLU layers; when n_upsamples > 0, that many of them upsample by 2x each."""
+
+    def __init__(self, in_channels, out_channels, n_upsamples=0):
+        super().__init__()
+        # The first layer changes the channel count and upsamples only if any upsampling is requested.
+        head = Conv3x3GNReLU(in_channels, out_channels, upsample=bool(n_upsamples))
+        # The remaining n_upsamples - 1 layers keep the width and always upsample (none when n_upsamples <= 1).
+        tail = [Conv3x3GNReLU(out_channels, out_channels, upsample=True) for _ in range(1, n_upsamples)]
+        self.block = nn.Sequential(head, *tail)
+
+    def forward(self, x):
+        upsampled = self.block(x)
+        return upsampled
+
+
+class MergeBlock(nn.Module):
+    """Merge a sequence of feature maps by element-wise sum ('add') or channel concatenation ('cat')."""
+
+    def __init__(self, policy):
+        super().__init__()
+        if policy not in ("add", "cat"):
+            raise ValueError(
+                "`merge_policy` must be one of: ['add', 'cat'], got {}".format(
+                    policy
+                )
+            )
+        self.policy = policy
+
+    def forward(self, x):
+        if self.policy == "cat":
+            return torch.cat(x, dim=1)
+        if self.policy == "add":
+            return sum(x)
+        # Only reachable if `policy` was reassigned after construction.
+        raise ValueError("`merge_policy` must be one of: ['add', 'cat'], got {}".format(self.policy))
+
+class ConvBlock(nn.Module):
+    """
+    Convolution (with bias) followed by Batch Normalization and ReLU.
+    `dilation` is forwarded to the convolution; `freeze_bn` is accepted but not used.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, freeze_bn=False):
+        super().__init__()
+        # Previously `dilation` was silently dropped; the default of 1 keeps existing callers unchanged.
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)
+        self.bn = nn.BatchNorm2d(out_channels, momentum=0.9997, eps=4e-5)
+        self.act = nn.ReLU()
+
+    def forward(self, inputs):
+        # conv -> batch norm -> ReLU
+        return self.act(self.bn(self.conv(inputs)))
+
+class BiFPNBlock(nn.Module):
+    """
+    One Bi-directional Feature Pyramid Network layer with ReLU-normalised fusion weights.
+
+    `forward` takes five maps [p3, p4, p5, p6, p7] and returns five fused maps in the same order.
+    Assumes every level has `feature_size` channels and half the resolution of the previous one.
+    """
+    def __init__(self, feature_size=64, epsilon=0.0001):
+        super(BiFPNBlock, self).__init__()
+        self.epsilon = epsilon  # added to the weight sums so the normalisation never divides by zero
+        self.p3_td = DepthwiseConvBlock(feature_size, feature_size)
+        self.p4_td = DepthwiseConvBlock(feature_size, feature_size)
+        self.p5_td = DepthwiseConvBlock(feature_size, feature_size)
+        self.p6_td = DepthwiseConvBlock(feature_size, feature_size)
+        self.p4_out = DepthwiseConvBlock(feature_size, feature_size)
+        self.p5_out = DepthwiseConvBlock(feature_size, feature_size)
+        self.p6_out = DepthwiseConvBlock(feature_size, feature_size)
+        self.p7_out = DepthwiseConvBlock(feature_size, feature_size)
+        # Fusion weights (rows: fused inputs, columns: pyramid levels). Start from ones:
+        # torch.Tensor(n, m) is uninitialised memory and may hold NaN/inf or all-zero values.
+        self.w1 = nn.Parameter(torch.ones(2, 4))
+        self.w1_relu = nn.ReLU()
+        self.w2 = nn.Parameter(torch.ones(3, 4))
+        self.w2_relu = nn.ReLU()
+
+    def forward(self, inputs):
+        p3_x, p4_x, p5_x, p6_x, p7_x = inputs
+        # ReLU keeps weights non-negative; divide out of place so the ReLU output saved for backward is not modified.
+        w1 = self.w1_relu(self.w1)
+        w1 = w1 / (torch.sum(w1, dim=0) + self.epsilon)
+        w2 = self.w2_relu(self.w2)
+        w2 = w2 / (torch.sum(w2, dim=0) + self.epsilon)
+
+        # Top-down pathway: each level fuses its input with the upsampled top-down map of the coarser level.
+        p7_td = p7_x
+        p6_td = self.p6_td(w1[0, 0] * p6_x + w1[1, 0] * F.interpolate(p7_td, scale_factor=2))
+        p5_td = self.p5_td(w1[0, 1] * p5_x + w1[1, 1] * F.interpolate(p6_td, scale_factor=2))
+        p4_td = self.p4_td(w1[0, 2] * p4_x + w1[1, 2] * F.interpolate(p5_td, scale_factor=2))
+        p3_td = self.p3_td(w1[0, 3] * p3_x + w1[1, 3] * F.interpolate(p4_td, scale_factor=2))
+
+        # Bottom-up pathway: fuse the input, the top-down map and the downsampled output of the finer level.
+        p3_out = p3_td
+        p4_out = self.p4_out(w2[0, 0] * p4_x + w2[1, 0] * p4_td + w2[2, 0] * F.interpolate(p3_out, scale_factor=0.5))
+        p5_out = self.p5_out(w2[0, 1] * p5_x + w2[1, 1] * p5_td + w2[2, 1] * F.interpolate(p4_out, scale_factor=0.5))
+        p6_out = self.p6_out(w2[0, 2] * p6_x + w2[1, 2] * p6_td + w2[2, 2] * F.interpolate(p5_out, scale_factor=0.5))
+        p7_out = self.p7_out(w2[0, 3] * p7_x + w2[1, 3] * p7_td + w2[2, 3] * F.interpolate(p6_out, scale_factor=0.5))
+        return [p3_out, p4_out, p5_out, p6_out, p7_out]
+    
+
+class BiFPNDecoder(nn.Module):
+    """
+    Segmentation decoder: 1x1 lateral convs on the three deepest encoder features, two extra
+    stride-2 levels (p6, p7), `num_layers` BiFPN blocks, then one SegmentationBlock per level
+    whose outputs are merged ('add' or 'cat') and passed through spatial dropout.
+    """
+    def __init__(
+            self,
+            encoder_channels,
+            encoder_depth=4,
+            feature_size = 64,
+            num_layers =2,
+            segmentation_channels=128,
+            dropout=0.2,
+            merge_policy="add",
+            epsilon = 0.0001
+    ):
+        super().__init__()
+
+        # Five pyramid levels are merged, so 'cat' yields 5x the per-level channel count.
+        self.out_channels = segmentation_channels if merge_policy == "add" else segmentation_channels * 5
+        if encoder_depth < 3:
+            raise ValueError("Encoder depth for FPN decoder cannot be less than 3, got {}.".format(encoder_depth))
+
+        # Deepest stage first: size[0] feeds p5/p6 (C5), size[1] feeds p4 (C4), size[2] feeds p3 (C3).
+        size = encoder_channels[::-1][:encoder_depth + 1]
+        self.p3 = nn.Conv2d(size[2], feature_size, kernel_size=1, stride=1, padding=0)
+        self.p4 = nn.Conv2d(size[1], feature_size, kernel_size=1, stride=1, padding=0)
+        self.p5 = nn.Conv2d(size[0], feature_size, kernel_size=1, stride=1, padding=0)
+        # p6 is obtained via a 3x3 stride-2 conv on C5
+        self.p6 = nn.Conv2d(size[0], feature_size, kernel_size=3, stride=2, padding=1)
+        # p7 is a 3x3 stride-2 conv + BatchNorm + ReLU applied to p6
+        self.p7 = ConvBlock(feature_size, feature_size, kernel_size=3, stride=2, padding=1)
+
+        # `epsilon` is forwarded so the constructor argument actually reaches the fusion layers.
+        self.bifpn = nn.Sequential(*[BiFPNBlock(feature_size, epsilon) for _ in range(num_layers)])
+        # Level k (p3..p7) gets k 2x upsamplings; assuming each level halves the resolution, all end at p3's size.
+        self.seg_blocks = nn.ModuleList([
+            SegmentationBlock(feature_size, segmentation_channels, n_upsamples=n_upsamples)
+            for n_upsamples in [0,1,2,3,4]
+        ])
+
+        self.merge = MergeBlock(merge_policy)
+        self.dropout = nn.Dropout2d(p=dropout, inplace=True)
+
+    def forward(self, *features):
+        # Only the three deepest encoder feature maps are used.
+        c3, c4, c5 = features[-3:]
+        pyramid = [self.p3(c3), self.p4(c4), self.p5(c5)]
+        p6_x = self.p6(c5)
+        pyramid += [p6_x, self.p7(p6_x)]
+        pyramid = self.bifpn(pyramid)
+        # zip pairs level k with the segmentation block that upsamples it k times.
+        feature_pyramid = [seg_block(p) for seg_block, p in zip(self.seg_blocks, pyramid)]
+        x = self.merge(feature_pyramid)
+        x = self.dropout(x)
+        return x
diff --git a/segmentation_models_pytorch/bifpn/model.py b/segmentation_models_pytorch/bifpn/model.py
@@ -0,0 +1,92 @@
+from typing import Optional, Union
+from .decoder import BiFPNDecoder
+from ..base import SegmentationModel, SegmentationHead, ClassificationHead
+from ..encoders import get_encoder
+
+
+class BiFPN(SegmentationModel):
+    """BiFPN_ is a fully convolutional neural network for image semantic segmentation
+    Args:
+        encoder_name: name of classification model (without last dense layers) used as feature
+                extractor to build segmentation model.
+        encoder_depth: number of stages used in decoder, larger depth - more features are generated.
+            e.g. for depth=3 encoder will generate list of features with following spatial shapes
+            [(H,W), (H/2, W/2), (H/4, W/4), (H/8, W/8)], so in general the deepest feature will have
+            spatial resolution (H/(2^depth), W/(2^depth)). The decoder rejects depth < 3.
+        encoder_weights: one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet).
+        feature_size: a number of convolution filters in every level of the BiFPN_ feature pyramid.
+        decoder_segmentation_channels: a number of convolution filters in segmentation blocks of the decoder.
+        decoder_merge_policy: determines how to merge outputs of the pyramid levels.
+            One of [``add``, ``cat``]
+        decoder_dropout: spatial dropout rate in range (0, 1).
+        in_channels: number of input channels for model, default is 3.
+        classes: a number of classes for output (output shape - ``(batch, classes, h, w)``).
+        activation (str, callable): activation function used in ``.predict(x)`` method for inference.
+            One of [``sigmoid``, ``softmax2d``, callable, None]
+        upsampling: optional, final upsampling factor
+            (default is 4 to preserve input -> output spatial shape identity)
+        aux_params: if specified model will have additional classification auxiliary output
+            build on top of encoder, supported params:
+                - classes (int): number of classes
+                - pooling (str): one of 'max', 'avg'. Default is 'avg'.
+                - dropout (float): dropout factor in [0, 1)
+                - activation (str): activation function to apply "sigmoid"/"softmax" (could be None to return logits)
+
+    Returns:
+        ``torch.nn.Module``: **BiFPN**
+
+    .. _BiFPN:
+        https://arxiv.org/abs/1911.09070
+
+    """
+
+    def __init__(
+        self,
+        encoder_name: str = "resnet34",
+        encoder_depth: int = 4,
+        encoder_weights: Optional[str] = "imagenet",
+        feature_size: int = 256,
+        decoder_segmentation_channels: int = 128,
+        decoder_merge_policy: str = "add",
+        decoder_dropout: float = 0.2,
+        in_channels: int = 3,
+        classes: int = 1,
+        activation: Optional[str] = None,
+        upsampling: int = 4,
+        aux_params: Optional[dict] = None,
+    ):
+        super().__init__()
+
+        self.encoder = get_encoder(
+            encoder_name,
+            in_channels=in_channels,
+            depth=encoder_depth,
+            weights=encoder_weights,
+        )
+
+        self.decoder = BiFPNDecoder(
+            encoder_channels=self.encoder.out_channels,
+            encoder_depth=encoder_depth,
+            feature_size=feature_size,
+            segmentation_channels=decoder_segmentation_channels,
+            dropout=decoder_dropout,
+            merge_policy=decoder_merge_policy,
+        )
+
+        self.segmentation_head = SegmentationHead(
+            in_channels=self.decoder.out_channels,
+            out_channels=classes,
+            activation=activation,
+            kernel_size=1,
+            upsampling=upsampling,
+        )
+
+        if aux_params is not None:
+            self.classification_head = ClassificationHead(
+                in_channels=self.encoder.out_channels[-1], **aux_params
+            )
+        else:
+            self.classification_head = None
+        # Name the model after this architecture (it was "fpn-...", carried over from the FPN model).
+        self.name = "bifpn-{}".format(encoder_name)
+        self.initialize()