Source code for towhee.models.coformer.backbone

# Copyright 2022 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original code from https://github.com/jhcho99/CoFormer.
#
# Modified by Zilliz.

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List
from towhee.models.coformer.utils import NestedTensor, is_main_process
from towhee.models.layers.position_encoding import build_position_encoding


[docs]class BackboneBase(nn.Module):
    """
    Args:
        backbone(`nn.Module`):
            Backbone model.
        train_backbone(`bool`):
            If backbone trained.
        name_backbone(`str`):
            Name of the backbone.
        num_channels(`int`):
            The number of the channels.
        return_interm_layers(`bool`):
            If the model returns interm layers.
    """
[docs]    def __init__(self,
                 backbone: nn.Module,
                 train_backbone: bool,
                 name_backbone: str,
                 num_channels: int,
                 return_interm_layers: bool
                ) -> None:
        super().__init__()
        if "resnet" in name_backbone:
            if not train_backbone:
                for _, parameter in backbone.named_parameters():
                    parameter.requires_grad_(False)
            if return_interm_layers:
                return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
            else:
                return_layers = {"layer4": "0"}
        else:
            # only resnet50 is supported
            assert False, f"backbone {name_backbone} is not supported now"
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

[docs]    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out


[docs]class Backbone(BackboneBase):
    """
    ResNet backbone with frozen BatchNorm.
    """
[docs]    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        if "resnet" in name:
            backbone = getattr(torchvision.models, name)(replace_stride_with_dilation=[False, False, dilation],
                                                         pretrained=is_main_process())
            num_channels = 512 if name in ("resnet18", "resnet34") else 2048
        else:
            # TODO only resnet is supported
            assert False, f"backbone {name} is not supported now"
        super().__init__(backbone, train_backbone, name, num_channels, return_interm_layers)


[docs]class Joiner(nn.Sequential):
    """
    Joiner class.
    """
    # pylint: disable=W0235
[docs]    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    # pylint: disable=W0237
[docs]    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []

        for _, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


[docs]def build_backbone(
                    hidden_dim=512,
                    position_embedding="learned",
                    backbone="resnet50",
                  ):
    position_embedding = build_position_encoding(
                                                 hidden_dim=hidden_dim,
                                                 position_embedding=position_embedding,
                                                )
    train_backbone = False
    return_interm_layers = False
    dilation = False
    backbone = Backbone(backbone, train_backbone, return_interm_layers, dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model