from functools import partial
from typing import Any, Callable, List, Optional, Sequence, Tuple

import torch
from torch import nn, Tensor

from ..ops.misc import Conv2dNormActivation, SqueezeExcitation as SElayer
from ..transforms._presets import ImageClassification
from ..utils import _log_api_usage_once
from ._api import register_model, Weights, WeightsEnum
from ._meta import _IMAGENET_CATEGORIES
from ._utils import _make_divisible, _ovewrite_named_param, handle_legacy_interface


__all__ = [
    "MobileNetV3",
    "MobileNet_V3_Large_Weights",
    "MobileNet_V3_Small_Weights",
    "mobilenet_v3_large",
    "mobilenet_v3_small",
]


class InvertedResidualConfig:
    """Configuration of a single inverted residual block.

    Channel counts passed to the constructor are scaled by ``width_mult`` and
    rounded through :meth:`adjust_channels` before being stored; the remaining
    arguments are stored unchanged.

    Args:
        input_channels (int): Unscaled number of input channels.
        kernel (int): Kernel size of the depthwise convolution.
        expanded_channels (int): Unscaled number of channels after expansion.
        out_channels (int): Unscaled number of output channels.
        use_se (bool): Whether the block contains a squeeze-and-excitation layer.
        activation (str): ``"HS"`` selects Hardswish; any other value selects ReLU.
        stride (int): Stride requested for the depthwise convolution.
        dilation (int): Dilation of the depthwise convolution.
        width_mult (float): Multiplier applied to all channel counts.
    """

    # Stores information listed at Tables 1 and 2 of the MobileNetV3 paper
    def __init__(
        self,
        input_channels: int,
        kernel: int,
        expanded_channels: int,
        out_channels: int,
        use_se: bool,
        activation: str,
        stride: int,
        dilation: int,
        width_mult: float,
    ):
        self.input_channels = self.adjust_channels(input_channels, width_mult)
        self.kernel = kernel
        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
        self.out_channels = self.adjust_channels(out_channels, width_mult)
        self.use_se = use_se
        self.use_hs = activation == "HS"
        self.stride = stride
        self.dilation = dilation

    @staticmethod
    def adjust_channels(channels: int, width_mult: float) -> int:
        """Scale ``channels`` by ``width_mult`` and round via ``_make_divisible(..., 8)``."""
        return _make_divisible(channels * width_mult, 8)


class InvertedResidual(nn.Module):
    """Inverted residual block: optional 1x1 expansion, depthwise convolution,
    optional squeeze-and-excitation, and a linear 1x1 projection.

    Args:
        cnf (InvertedResidualConfig): Block configuration.
        norm_layer (Callable[..., nn.Module]): Factory for the normalization layer
            used by every convolution in the block.
        se_layer (Callable[..., nn.Module]): Factory for the squeeze-and-excitation
            layer; called as ``se_layer(expanded_channels, squeeze_channels)``.

    Raises:
        ValueError: If ``cnf.stride`` is not 1 or 2.
    """

    # Implemented as described at section 5 of MobileNetV3 paper
    def __init__(
        self,
        cnf: InvertedResidualConfig,
        norm_layer: Callable[..., nn.Module],
        se_layer: Callable[..., nn.Module] = partial(SElayer, scale_activation=nn.Hardsigmoid),
    ):
        super().__init__()
        if not (1 <= cnf.stride <= 2):
            raise ValueError("illegal stride value")

        # The skip connection is only valid when the block preserves both the
        # spatial resolution and the channel count.
        self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels

        layers: List[nn.Module] = []
        activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU

        # expand (skipped when the expansion would not change the channel count)
        if cnf.expanded_channels != cnf.input_channels:
            layers.append(
                Conv2dNormActivation(
                    cnf.input_channels,
                    cnf.expanded_channels,
                    kernel_size=1,
                    norm_layer=norm_layer,
                    activation_layer=activation_layer,
                )
            )

        # depthwise (a dilated block uses stride 1 instead of the configured stride)
        stride = 1 if cnf.dilation > 1 else cnf.stride
        layers.append(
            Conv2dNormActivation(
                cnf.expanded_channels,
                cnf.expanded_channels,
                kernel_size=cnf.kernel,
                stride=stride,
                dilation=cnf.dilation,
                groups=cnf.expanded_channels,
                norm_layer=norm_layer,
                activation_layer=activation_layer,
            )
        )
        if cnf.use_se:
            squeeze_channels = _make_divisible(cnf.expanded_channels // 4, 8)
            layers.append(se_layer(cnf.expanded_channels, squeeze_channels))

        # project (no activation: the projection is linear)
        layers.append(
            Conv2dNormActivation(
                cnf.expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
            )
        )

        self.block = nn.Sequential(*layers)
        self.out_channels = cnf.out_channels
        # True when the *configured* stride is > 1 (even if dilation replaced it above).
        # NOTE(review): not read anywhere in this file; presumably consumed by callers
        # elsewhere - confirm before removing.
        self._is_cn = cnf.stride > 1

    def forward(self, input: Tensor) -> Tensor:
        """Run the block and add the input back when the residual connection is enabled."""
        result = self.block(input)
        if self.use_res_connect:
            result += input
        return result


class MobileNetV3(nn.Module):
    def __init__(
        self,
        inverted_residual_setting: List[InvertedResidualConfig],
        last_channel: int,
        num_classes: int = 1000,
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        dropout: float = 0.2,
        **kwargs: Any,
    ) -> None:
        """
        MobileNet V3 main class

        Args:
            inverted_residual_setting (List[InvertedResidualConfig]): Network structure
            last_channel (int): The number of channels on the penultimate layer
            num_classes (int): Number of classes
            block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for
                mobilenet. Defaults to ``InvertedResidual``.
            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use.
                Defaults to ``nn.BatchNorm2d`` with ``eps=0.001`` and ``momentum=0.01``.
            dropout (float): The dropout probability
            **kwargs: Accepted and ignored by this constructor, so that callers can forward
                options meant for the configuration builder without filtering them.

        Raises:
            ValueError: If ``inverted_residual_setting`` is empty.
            TypeError: If ``inverted_residual_setting`` is not a sequence of ``InvertedResidualConfig``.
        """
        super().__init__()
        _log_api_usage_once(self)

        if not inverted_residual_setting:
            raise ValueError("The inverted_residual_setting should not be empty")
        elif not (
            isinstance(inverted_residual_setting, Sequence)
            and all(isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting)
        ):
            raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        layers: List[nn.Module] = []

        # building first layer
        firstconv_output_channels = inverted_residual_setting[0].input_channels
        layers.append(
            Conv2dNormActivation(
                3,
                firstconv_output_channels,
                kernel_size=3,
                stride=2,
                norm_layer=norm_layer,
                activation_layer=nn.Hardswish,
            )
        )

        # building inverted residual blocks
        for cnf in inverted_residual_setting:
            layers.append(block(cnf, norm_layer))

        # building last several layers
        lastconv_input_channels = inverted_residual_setting[-1].out_channels
        lastconv_output_channels = 6 * lastconv_input_channels
        layers.append(
            Conv2dNormActivation(
                lastconv_input_channels,
                lastconv_output_channels,
                kernel_size=1,
                norm_layer=norm_layer,
                activation_layer=nn.Hardswish,
            )
        )

        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(lastconv_output_channels, last_channel),
            nn.Hardswish(inplace=True),
            nn.Dropout(p=dropout, inplace=True),
            nn.Linear(last_channel, num_classes),
        )

        # Weight initialization, chosen per module type.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        """Apply the feature extractor, global average pooling and the classifier head."""
        x = self.features(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)

        x = self.classifier(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        """Return the classifier output for the input batch ``x``."""
        return self._forward_impl(x)


def _mobilenet_v3_conf(
    arch: str, width_mult: float = 1.0, reduced_tail: bool = False, dilated: bool = False, **kwargs: Any
) -> Tuple[List[InvertedResidualConfig], int]:
    """Build the block configuration list and penultimate width for an architecture.

    Args:
        arch (str): Either ``"mobilenet_v3_large"`` or ``"mobilenet_v3_small"``.
        width_mult (float): Multiplier applied to every channel count.
        reduced_tail (bool): If True, the channel counts of the last stage (the
            blocks marked C4 and the C5 width) are halved.
        dilated (bool): If True, the last stage uses dilation 2 instead of 1.
        **kwargs: Ignored; accepted so callers can forward a shared kwargs dict.

    Returns:
        A tuple ``(inverted_residual_setting, last_channel)``.

    Raises:
        ValueError: If ``arch`` is not one of the supported names.
    """
    reduce_divider = 2 if reduced_tail else 1
    dilation = 2 if dilated else 1

    bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult)

    # Positional arguments of bneck_conf:
    # input_channels, kernel, expanded_channels, out_channels, use_se, activation, stride, dilation
    if arch == "mobilenet_v3_large":
        inverted_residual_setting = [
            bneck_conf(16, 3, 16, 16, False, "RE", 1, 1),
            bneck_conf(16, 3, 64, 24, False, "RE", 2, 1),  # C1
            bneck_conf(24, 3, 72, 24, False, "RE", 1, 1),
            bneck_conf(24, 5, 72, 40, True, "RE", 2, 1),  # C2
            bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
            bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
            bneck_conf(40, 3, 240, 80, False, "HS", 2, 1),  # C3
            bneck_conf(80, 3, 200, 80, False, "HS", 1, 1),
            bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
            bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
            bneck_conf(80, 3, 480, 112, True, "HS", 1, 1),
            bneck_conf(112, 3, 672, 112, True, "HS", 1, 1),
            bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2, dilation),  # C4
            bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
            bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
        ]
        last_channel = adjust_channels(1280 // reduce_divider)  # C5
    elif arch == "mobilenet_v3_small":
        inverted_residual_setting = [
            bneck_conf(16, 3, 16, 16, True, "RE", 2, 1),  # C1
            bneck_conf(16, 3, 72, 24, False, "RE", 2, 1),  # C2
            bneck_conf(24, 3, 88, 24, False, "RE", 1, 1),
            bneck_conf(24, 5, 96, 40, True, "HS", 2, 1),  # C3
            bneck_conf(40, 5, 240, 40, True, "HS", 1, 1),
            bneck_conf(40, 5, 240, 40, True, "HS", 1, 1),
            bneck_conf(40, 5, 120, 48, True, "HS", 1, 1),
            bneck_conf(48, 5, 144, 48, True, "HS", 1, 1),
            bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2, dilation),  # C4
            bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation),
            bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation),
        ]
        last_channel = adjust_channels(1024 // reduce_divider)  # C5
    else:
        raise ValueError(f"Unsupported model type {arch}")

    return inverted_residual_setting, last_channel


def _mobilenet_v3(
    inverted_residual_setting: List[InvertedResidualConfig],
    last_channel: int,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> MobileNetV3:
    """Instantiate a ``MobileNetV3`` and optionally load pretrained weights.

    Args:
        inverted_residual_setting (List[InvertedResidualConfig]): Network structure.
        last_channel (int): The number of channels on the penultimate layer.
        weights (Optional[WeightsEnum]): Pretrained weights to load, or None.
        progress (bool): Forwarded to ``weights.get_state_dict``.
        **kwargs: Forwarded to the ``MobileNetV3`` constructor.

    Returns:
        The constructed model.
    """
    if weights is not None:
        # The classifier size must match the checkpoint's category list.
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))

    model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs)

    if weights is not None:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model


# Metadata entries shared by every weight definition below.
_COMMON_META = {
    "min_size": (1, 1),
    "categories": _IMAGENET_CATEGORIES,
}


class MobileNet_V3_Large_Weights(WeightsEnum):
    """Pretrained weight definitions for ``mobilenet_v3_large``; ``DEFAULT`` is ``IMAGENET1K_V2``."""

    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 5483032,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 74.042,
                    "acc@5": 91.340,
                }
            },
            "_ops": 0.217,
            "_file_size": 21.114,
            "_docs": """These weights were trained from scratch by using a simple training recipe.""",
        },
    )
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/mobilenet_v3_large-5c1a4163.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 5483032,
            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-reg-tuning",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 75.274,
                    "acc@5": 92.566,
                }
            },
            "_ops": 0.217,
            "_file_size": 21.107,
            "_docs": """
                These weights improve marginally upon the results of the original paper by using a modified version of
                TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V2


class MobileNet_V3_Small_Weights(WeightsEnum):
    """Pretrained weight definitions for ``mobilenet_v3_small``; ``DEFAULT`` is ``IMAGENET1K_V1``."""

    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 2542856,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 67.668,
                    "acc@5": 87.402,
                }
            },
            "_ops": 0.057,
            "_file_size": 9.829,
            "_docs": """
                These weights improve upon the results of the original paper by using a simple training recipe.
            """,
        },
    )
    DEFAULT = IMAGENET1K_V1


@register_model()
@handle_legacy_interface(weights=("pretrained", MobileNet_V3_Large_Weights.IMAGENET1K_V1))
def mobilenet_v3_large(
    *, weights: Optional[MobileNet_V3_Large_Weights] = None, progress: bool = True, **kwargs: Any
) -> MobileNetV3:
    """
    Constructs a large MobileNetV3 architecture from
    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__.

    Args:
        weights (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MobileNet_V3_Large_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
            for more details about this class. The same keyword arguments are also
            forwarded to the configuration builder, which reads ``width_mult``,
            ``reduced_tail`` and ``dilated`` from them.

    .. autoclass:: torchvision.models.MobileNet_V3_Large_Weights
        :members:
    """
    weights = MobileNet_V3_Large_Weights.verify(weights)

    inverted_residual_setting, last_channel = _mobilenet_v3_conf("mobilenet_v3_large", **kwargs)
    return _mobilenet_v3(inverted_residual_setting, last_channel, weights, progress, **kwargs)


@register_model()
@handle_legacy_interface(weights=("pretrained", MobileNet_V3_Small_Weights.IMAGENET1K_V1))
def mobilenet_v3_small(
    *, weights: Optional[MobileNet_V3_Small_Weights] = None, progress: bool = True, **kwargs: Any
) -> MobileNetV3:
    """
    Constructs a small MobileNetV3 architecture from
    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__.

    Args:
        weights (:class:`~torchvision.models.MobileNet_V3_Small_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MobileNet_V3_Small_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
            for more details about this class. The same keyword arguments are also
            forwarded to the configuration builder, which reads ``width_mult``,
            ``reduced_tail`` and ``dilated`` from them.

    .. autoclass:: torchvision.models.MobileNet_V3_Small_Weights
        :members:
    """
    weights = MobileNet_V3_Small_Weights.verify(weights)

    inverted_residual_setting, last_channel = _mobilenet_v3_conf("mobilenet_v3_small", **kwargs)
    return _mobilenet_v3(inverted_residual_setting, last_channel, weights, progress, **kwargs)