Source code for towhee.models.mcprop.depthaggregator

# Built on top of the original implementation at https://github.com/mesnico/Wiki-Image-Caption-Matching/blob/master/mcprop/model.py
#
# Modifications by Copyright 2022 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch import nn


class DepthAggregator(nn.Module):
    """
    Aggregate the CLS token (token index 0) over the layers of an encoder.

    Args:
        aggr (str): aggregation mode. ``None`` keeps the CLS token of the last
            layer, ``'mean'`` averages the CLS token over all layers and
            ``'gated'`` sums the per-layer CLS tokens weighted by a sigmoid
            gate computed from a self-attention pass over each layer.
        input_dim (int): input dimension
        output_dim (int): output dimension; when it differs from ``input_dim``
            a linear projection is applied to the aggregated vector.
    """

    def __init__(self, aggr, input_dim=1024, output_dim=1024):
        super().__init__()
        self.aggr = aggr
        if self.aggr == 'gated':
            self.self_attn = nn.MultiheadAttention(input_dim, num_heads=4, dropout=0.1)
            self.gate_ffn = nn.Linear(input_dim, 1)

        if input_dim != output_dim:
            self.proj = nn.Linear(input_dim, output_dim)
        else:
            self.proj = None

    def forward(self, x, mask=None):
        """
        Forward function

        Args:
            x (torch.tensor): tensor with shape of (depth, B, N, dim)
            mask (torch.tensor): mask of x with shape of (B, N); non-zero
                entries mark the tokens that may be attended to. Only used by
                the ``'gated'`` mode; ``None`` disables key masking.

        Returns:
            tensor with shape of (B, output_dim)

        Raises:
            ValueError: if ``aggr`` is not one of ``None``, ``'mean'`` or
                ``'gated'``.
        """
        if self.aggr is None:
            out = x[-1, :, 0, :]  # simply takes the cls token from the last layer
        elif self.aggr == 'mean':
            out = x[:, :, 0, :].mean(dim=0)  # average the cls token over all layers
        elif self.aggr == 'gated':
            out = self._gated_cls(x, mask)
        else:
            # Without this branch `out` would be unbound and fail obscurely below.
            raise ValueError(f'Unsupported aggregation mode: {self.aggr!r}')

        if self.proj is not None:
            out = self.proj(out)
        return out

    def _gated_cls(self, x, mask):
        """Return the gate-weighted sum over depth of the CLS tokens, shape (B, dim)."""
        depth, bs, n_tokens, dim = x.shape

        key_padding_mask = None
        if mask is not None:
            # nn.MultiheadAttention ignores the keys flagged True, so invert the
            # "valid token" mask.
            key_padding_mask = ~mask.bool()
            # x is flattened depth-major below (row = d * bs + b); replicate the
            # mask along a leading depth axis so that its rows follow the same
            # order and every row keeps the mask of its own sample.
            key_padding_mask = (
                key_padding_mask.unsqueeze(0)
                .expand(depth, -1, -1)
                .reshape(depth * bs, n_tokens)
            )

        # merge batch size and depth; reshape also accepts non-contiguous inputs
        tokens = x.reshape(depth * bs, n_tokens, dim).permute(1, 0, 2)  # N x depth*B x dim
        sa, _ = self.self_attn(tokens, tokens, tokens, key_padding_mask=key_padding_mask)

        # Only the CLS position contributes to the result, so gate that row only.
        scores = torch.sigmoid(self.gate_ffn(sa[0]))  # depth*B x 1
        scores = scores.reshape(depth, bs, 1)  # depth x B x 1

        cls_tokens = x[:, :, 0, :]  # depth x B x dim
        return (scores * cls_tokens).sum(dim=0)  # B x dim