Merge pull request qubvel-org#1 from ludics/add_mbv3

ludics · web-flow · commit d8bd3238d8a9 · 2021-03-23T03:52:39.000+08:00
Add mobilenet_v3 in torchvision.models
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ The main features of this library are:
 
  - High level API (just two lines to create a neural network)
  - 9 models architectures for binary and multi class segmentation (including legendary Unet)
- - 104 available encoders
+ - 106 available encoders
  - All encoders have pre-trained weights for faster and better convergence
  
 ### [📚 Project Documentation 📚](http://smp.readthedocs.io/)
@@ -284,6 +284,8 @@ The following is a list of supported encoders in the SMP. Select the appropriate
 |Encoder                         |Weights                         |Params, M                       |
 |--------------------------------|:------------------------------:|:------------------------------:|
 |mobilenet_v2                    |imagenet                        |2M                              |
+|mobilenet_v3_large              |imagenet                        |3M                              |
+|mobilenet_v3_small              |imagenet                        |1M                              |
 
 </div>
 </details>
diff --git a/docs/encoders.rst b/docs/encoders.rst
@@ -252,11 +252,15 @@ EfficientNet
 MobileNet
 ~~~~~~~~~
 
-+-----------------+------------+-------------+
-| Encoder         | Weights    | Params, M   |
-+=================+============+=============+
-| mobilenet\_v2   | imagenet   | 2M          |
-+-----------------+------------+-------------+
++---------------------+------------+-------------+
+| Encoder             | Weights    | Params, M   |
++=====================+============+=============+
+| mobilenet\_v2       | imagenet   | 2M          |
++---------------------+------------+-------------+
+| mobilenet\_v3_large | imagenet   | 3M          |
++---------------------+------------+-------------+
+| mobilenet\_v3_small | imagenet   | 1M          |
++---------------------+------------+-------------+
 
 DPN
 ~~~
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-torchvision>=0.3.0
+torchvision==0.9.0
 pretrainedmodels==0.7.4
 efficientnet-pytorch==0.6.3
 timm==0.3.2
diff --git a/segmentation_models_pytorch/encoders/__init__.py b/segmentation_models_pytorch/encoders/__init__.py
@@ -10,6 +10,7 @@
 from .inceptionv4 import inceptionv4_encoders
 from .efficientnet import efficient_net_encoders
 from .mobilenet import mobilenet_encoders
+from .mobilenet_v3 import mobilenet_v3_encoders
 from .xception import xception_encoders
 from .timm_efficientnet import timm_efficientnet_encoders
 from .timm_resnest import timm_resnest_encoders
@@ -28,6 +29,7 @@
 encoders.update(inceptionv4_encoders)
 encoders.update(efficient_net_encoders)
 encoders.update(mobilenet_encoders)
+encoders.update(mobilenet_v3_encoders)
 encoders.update(xception_encoders)
 encoders.update(timm_efficientnet_encoders)
 encoders.update(timm_resnest_encoders)
diff --git a/segmentation_models_pytorch/encoders/mobilenet_v3.py b/segmentation_models_pytorch/encoders/mobilenet_v3.py
@@ -0,0 +1,109 @@
+""" Each encoder should have following attributes and methods and be inherited from `_base.EncoderMixin`
+
+Attributes:
+
+    _out_channels (list of int): specify number of channels for each encoder feature tensor
+    _depth (int): specify number of stages in decoder (in other words number of downsampling operations)
+    _in_channels (int): default number of input channels in first Conv2d layer for encoder (usually 3)
+
+Methods:
+
+    forward(self, x: torch.Tensor)
+        produce list of features of different spatial resolutions, each feature is a 4D torch.tensor of
+        shape NCHW (features should be sorted in descending order according to spatial resolution, starting
+        with resolution same as input `x` tensor).
+
+        Input: `x` with shape (1, 3, 64, 64)
+        Output: [f0, f1, f2, f3, f4, f5] - features with corresponding shapes
+                [(1, 3, 64, 64), (1, 64, 32, 32), (1, 128, 16, 16), (1, 256, 8, 8),
+                (1, 512, 4, 4), (1, 1024, 2, 2)] (C - dim may differ)
+
+        also should support number of features according to specified depth, e.g. if depth = 5,
+        number of feature tensors = 6 (one with same resolution as input and 5 downsampled),
+        depth = 3 -> number of feature tensors = 4 (one with same resolution as input and 3 downsampled).
+"""
+
+import torchvision
+import torch.nn as nn
+from torchvision.models.mobilenetv3 import _mobilenet_v3_conf
+
+from ._base import EncoderMixin
+
+
+class MobileNetV3Encoder(torchvision.models.MobileNetV3, EncoderMixin):
+
+    def __init__(self, out_channels, stage_idxs, model_name, depth=5, **kwargs):
+        inverted_residual_setting, last_channel = _mobilenet_v3_conf(model_name, kwargs)
+        super().__init__(inverted_residual_setting, last_channel, **kwargs)
+        
+        self._depth = depth
+        self._stage_idxs = stage_idxs
+        self._out_channels = out_channels
+        self._in_channels = 3
+        
+        del self.classifier
+
+    def get_stages(self):
+        return [
+            nn.Identity(),
+            self.features[:self._stage_idxs[0]],
+            self.features[self._stage_idxs[0]:self._stage_idxs[1]],
+            self.features[self._stage_idxs[1]:self._stage_idxs[2]],
+            self.features[self._stage_idxs[2]:self._stage_idxs[3]],
+            self.features[self._stage_idxs[3]:],
+        ]
+
+    def forward(self, x):
+        stages = self.get_stages()
+
+        features = []
+        for i in range(self._depth + 1):
+            x = stages[i](x)
+            features.append(x)
+
+        return features
+
+    def load_state_dict(self, state_dict, **kwargs):
+        state_dict.pop("classifier.0.bias")
+        state_dict.pop("classifier.0.weight")
+        state_dict.pop("classifier.3.bias")
+        state_dict.pop("classifier.3.weight")
+        super().load_state_dict(state_dict, **kwargs)
+
+
+mobilenet_v3_encoders = {
+    "mobilenet_v3_large": {
+        "encoder": MobileNetV3Encoder,
+        "pretrained_settings": {
+            "imagenet": {
+                "mean": [0.485, 0.456, 0.406],
+                "std": [0.229, 0.224, 0.225],
+                "url": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth",
+                "input_space": "RGB",
+                "input_range": [0, 1],
+            },
+        },
+        "params": {
+            "out_channels": (3, 16, 24, 40, 112, 960),
+            "stage_idxs": (2, 4, 7, 13),
+            "model_name": "mobilenet_v3_large",
+        },
+    },
+    "mobilenet_v3_small": {
+        "encoder": MobileNetV3Encoder,
+        "pretrained_settings": {
+            "imagenet": {
+                "mean": [0.485, 0.456, 0.406],
+                "std": [0.229, 0.224, 0.225],
+                "url": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth",
+                "input_space": "RGB",
+                "input_range": [0, 1],
+            },
+        },
+        "params": {
+            "out_channels": (3, 16, 16, 24, 40, 576),
+            "stage_idxs": (1, 2, 4, 7),
+            "model_name": "mobilenet_v3_small",
+        },
+    },
+}