# Copyright 2021 Facebook. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This code is modified by Zilliz.
import torch
from torch import nn
from typing import List, Tuple
from towhee.models.layers.multi_scale_attention import MultiScaleAttention
from towhee.models.layers.droppath import DropPath
from towhee.models.layers.mlp import Mlp
from towhee.models.layers.pool_attention import AttentionPool
class MultiScaleBlock(nn.Module):
"""
A multiscale vision transformer block.
    Each block contains a multiscale attention layer and an MLP layer.
    ::

        Input
          |--------------------+
          ↓                    |
         Norm                  |
          ↓                    |
        MultiScaleAttention   Pool
          ↓                    |
        DropPath               |
          ↓                    |
        Summation ←------------+
          |
          |--------------------+
          ↓                    |
         Norm                  |
          ↓                    |
         Mlp                  Proj
          ↓                    |
        DropPath               |
          ↓                    |
        Summation ←------------+

Args:
dim (`int`):
Input feature dimension.
dim_out (`int`):
Output feature dimension.
num_heads (`int`):
Number of heads in the attention layer.
mlp_ratio (`float`):
MLP ratio which controls the feature dimension in the hidden layer of the MLP block.
qkv_bias (`bool`):
If set to False, the qkv layer will not learn an additive bias.
dropout_rate (`float`):
            Dropout rate. If set to 0, dropout is disabled.
droppath_rate (`float`):
DropPath rate. If set to 0, DropPath is disabled.
activation (`nn.Module`):
Activation layer used in the MLP layer.
norm_layer (`nn.Module`):
Normalization layer.
kernel_q (`_size_3_t`):
            Pooling kernel size for q. If pooling kernel size is 1 for all the dimensions, pooling is not used.
kernel_kv (`_size_3_t`):
Pooling kernel size for kv. If pooling kernel size is 1 for all the dimensions, pooling is not used.
stride_q (`_size_3_t`):
Pooling kernel stride for q.
stride_kv (`_size_3_t`):
Pooling kernel stride for kv.
pool_mode (`nn.Module`):
            Module class used for pooling inside the attention layer (default `nn.Conv3d`).
has_cls_embed (`bool`):
If set to True, the first token of the input tensor should be a cls token.
Otherwise, the input tensor does not contain a cls token. Pooling is not applied to the cls token.
pool_first (`bool`):
If set to True, pool is applied before qkv projection. Otherwise, pool is applied after qkv projection.
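
    Example:
        A minimal sketch of one block (the token-count arithmetic assumes
        an 8 x 7 x 7 token grid plus a cls token and the pooling settings
        shown; other configurations change the output length):

        >>> import torch
        >>> block = MultiScaleBlock(dim=96, dim_out=192, num_heads=2,
        ...                         kernel_q=(1, 3, 3), stride_q=(1, 2, 2))
        >>> x = torch.rand(2, 1 + 8 * 7 * 7, 96)
        >>> out, thw = block(x, [8, 7, 7])
        >>> # thw -> [8, 4, 4]; out.shape -> (2, 1 + 8 * 4 * 4, 192)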
"""
    def __init__(
self,
dim,
dim_out,
num_heads,
mlp_ratio=4.0,
qkv_bias=False,
dropout_rate=0.0,
droppath_rate=0.0,
activation=nn.GELU,
norm_layer=nn.LayerNorm,
kernel_q=(1, 1, 1),
kernel_kv=(1, 1, 1),
stride_q=(1, 1, 1),
stride_kv=(1, 1, 1),
pool_mode=nn.Conv3d,
has_cls_embed=True,
pool_first=False,
) -> None:
super().__init__()
self.dim = dim
self.dim_out = dim_out
self.norm1 = norm_layer(dim)
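        # Pool the residual (skip) path with the same stride as the query
        # pooling: kernel = stride + 1 when stride > 1, with half-kernel
        # padding, reproduces the output grid of the strided pooling
        # applied to q inside the attention layer.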
kernel_skip = [s + 1 if s > 1 else s for s in stride_q]
stride_skip = stride_q
padding_skip = [int(skip // 2) for skip in kernel_skip]
self.attn = MultiScaleAttention(
dim=dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
kernel_q=kernel_q,
kernel_kv=kernel_kv,
stride_q=stride_q,
stride_kv=stride_kv,
norm_layer=nn.LayerNorm,
has_cls_embed=has_cls_embed,
pool_mode=pool_mode,
pool_first=pool_first,
)
self.drop_path = (
DropPath(drop_prob=droppath_rate) if droppath_rate > 0.0 else nn.Identity()
)
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.has_cls_embed = has_cls_embed
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
out_features=dim_out,
act_layer=activation,
drop=dropout_rate,
)
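        # When the block changes the embedding width, a linear projection
        # aligns the residual branch with the MLP output dimension.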
if dim != dim_out:
self.proj = nn.Linear(dim, dim_out)
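        # Skip-path pooling module; the emptiness guard is kept from the
        # original implementation, where kernel_skip may be an empty list.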
self.pool_skip = (
nn.MaxPool3d(kernel_skip, stride_skip, padding_skip, ceil_mode=False)
if len(kernel_skip) > 0
else None
)
    def forward(
self, x: torch.Tensor, thw_shape: List[int]
) -> Tuple[torch.Tensor, List[int]]:
"""
Args:
x (`torch.Tensor`):
Input tensor.
thw_shape (`List`):
The shape of the input tensor (before flattening).
"""
x_block, thw_shape_new = self.attn(self.norm1(x), thw_shape)
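        # Pool the identity branch with the matching stride so the residual
        # addition sees the same number of tokens as the attention output.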
        attn_pool = AttentionPool(
            pool=self.pool_skip,
            thw_shape=thw_shape,
            has_cls_embed=self.has_cls_embed
        )
        x_res, _ = attn_pool(x)
x = x_res + self.drop_path(x_block)
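        # MLP stage: if the channel width changes, the residual branch is
        # taken from the projected, normalized tensor so both summands have
        # dim_out channels.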
x_norm = self.norm2(x)
x_mlp = self.mlp(x_norm)
if self.dim != self.dim_out:
x = self.proj(x_norm)
x = x + self.drop_path(x_mlp)
return x, thw_shape_new