# Original PyTorch implementation by:
# 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale'
# - https://arxiv.org/abs/2010.11929
# 'How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers'
# - https://arxiv.org/abs/2106.10270
#
# Inspired by https://github.com/SvipRepetitionCounting/TransRAC/blob/main/models/TransRAC.py
#
# Built on top of code from / Copyright 2020, Ross Wightman & Facebook, Inc. and its affiliates.
# Modifications & additions by / Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import numpy
import torch
from torch import nn
try:
    from einops import rearrange
except ImportError:
    os.system('pip install einops')
    from einops import rearrange

from towhee.models.layers.layers_with_relprop import Einsum, Linear, Dropout, Softmax


class MultiHeadAttention(nn.Module):
    """
    Multi-head attention layer.

    Args:
        dim (`int`):
            Number of input features.
        num_heads (`int=8`):
            Number of attention heads.
        qkv_bias (`bool=False`):
            Whether to add a bias term to the qkv projection.
        qk_scale (`float=None`):
            Scaling factor applied to q @ k^T; defaults to head_dim ** -0.5.
        attn_drop_ratio (`float=0.`):
            Dropout rate of the attention weights.
        proj_drop_ratio (`float=0.`):
            Dropout rate of the output projection.
        with_qkv (`bool=True`):
            Whether to use the qkv projection layer (currently unused).

    Example:
        >>> import torch
        >>> from towhee.models.layers.attention import MultiHeadAttention
        >>>
        >>> test_shape = (1, 196 + 1, 768)  # shape of output from patch_embed
        >>> input_x = torch.rand(test_shape)
        >>> model = MultiHeadAttention(dim=test_shape[2])
        >>> out = model.forward(input_x)
        >>> print(out.shape)
        torch.Size([1, 197, 768])
    """
    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.,
                 with_qkv=True,
                 ):
        super().__init__()
        _ = with_qkv
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim ** -0.5
        # A = Q*K^T
        self.matmul1 = Einsum('bhid,bhjd->bhij')
        # attn = A*V
        self.matmul2 = Einsum('bhij,bhjd->bhid')
        self.qkv = Linear(dim, dim * 3, qkv_bias)  # pylint: disable=too-many-function-args
        self.attn_drop = Dropout(attn_drop_ratio)  # pylint: disable=too-many-function-args
        self.proj = Linear(dim, dim)  # pylint: disable=too-many-function-args
        self.proj_drop = Dropout(proj_drop_ratio)  # pylint: disable=too-many-function-args
        self.softmax = Softmax(dim=-1)  # pylint: disable=unexpected-keyword-arg

        # Buffers for attention maps, values, gradients, and relevance (cam) used by relprop.
        self.attn_cam = None
        self.attn = None
        self.v = None
        self.v_cam = None
        self.attn_gradients = None

    def get_attn(self):
        return self.attn

    def save_attn(self, attn):
        self.attn = attn

    def save_attn_cam(self, cam):
        self.attn_cam = cam

    def get_attn_cam(self):
        return self.attn_cam

    def get_v(self):
        return self.v

    def save_v(self, v):
        self.v = v

    def save_v_cam(self, cam):
        self.v_cam = cam

    def get_v_cam(self):
        return self.v_cam

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def forward(self, x):
        batch_size, new_num_patch, dim = x.shape
        qkv = self.qkv(x).reshape(
            batch_size,
            new_num_patch,
            3,
            self.num_heads,
            self.head_dim,
        ).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        self.save_v(v)

        # A = Q*K^T, computed through the relprop-aware Einsum and Softmax modules
        # so that they record the tensors needed later by relprop.
        attn = self.matmul1([q, k]) * self.scale
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        self.save_attn(attn)
        attn.register_hook(self.save_attn_gradients)

        # attn*V, then merge heads back into the feature dimension.
        x = self.matmul2([attn, v]).transpose(1, 2).reshape(batch_size, new_num_patch, dim)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def relprop(self, cam, **kwargs):
        cam = self.proj_drop.relprop(cam, **kwargs)
        cam = self.proj.relprop(cam, **kwargs)
        cam = rearrange(cam, 'b n (h d) -> b h n d', h=self.num_heads)

        # attn = A*V
        (cam1, cam_v) = self.matmul2.relprop(cam, **kwargs)
        cam1 /= 2
        cam_v /= 2
        self.save_v_cam(cam_v)
        self.save_attn_cam(cam1)

        cam1 = self.attn_drop.relprop(cam1, **kwargs)
        cam1 = self.softmax.relprop(cam1, **kwargs)

        # A = Q*K^T
        (cam_q, cam_k) = self.matmul1.relprop(cam1, **kwargs)
        cam_q /= 2
        cam_k /= 2

        cam_qkv = rearrange([cam_q, cam_k, cam_v], 'qkv b h n d -> b n (qkv h d)', qkv=3, h=self.num_heads)
        return self.qkv.relprop(cam_qkv, **kwargs)
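
# Example (a minimal sketch, not part of the original tests): after a forward and a backward
# pass, the saved attention map and its gradient can be read back through the accessors above,
# and `relprop` can propagate a relevance map shaped like the block output back to the input.
# The variable names below are illustrative only, and extra relprop kwargs may be required
# depending on the relprop layer implementation in `layers_with_relprop`.
#
#   mha = MultiHeadAttention(dim=768)
#   x = torch.rand(1, 197, 768)
#   out = mha(x)
#   out.sum().backward()                      # triggers the hook registered in forward
#   attn_map = mha.get_attn()                 # (1, 8, 197, 197)
#   attn_grad = mha.get_attn_gradients()      # same shape as attn_map
#   cam = mha.relprop(torch.ones_like(out))   # relevance w.r.t. the block input, (1, 197, 768)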


class Attention(nn.Module):
    """
    Scaled dot-product attention mechanism.

    Args:
        scale (`int=64`):
            Dimension used to scale the dot products (scores are divided by sqrt(scale)).
        att_dropout (`float=0.`):
            Dropout rate of the attention weights.
    """
    def __init__(self, scale=64, att_dropout=0.):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(att_dropout)
        self.scale = scale

    def forward(self, q, k, v, attn_mask=None):
        # q: [B, head, F, model_dim]
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.scale)  # [B, head, F, F]
        if attn_mask is not None:
            scores = scores.masked_fill_(attn_mask, -numpy.inf)
        scores = self.softmax(scores)
        scores = self.dropout(scores)  # [B, head, F, F]
        context = torch.matmul(scores, v)  # [B, head, F, model_dim]
        return scores, context
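
# Example (a minimal sketch, not part of the original tests): the tensors below are
# illustrative; `scale` is usually set to the per-head feature dimension.
#
#   attn = Attention(scale=64, att_dropout=0.1)
#   q = torch.rand(2, 8, 10, 64)   # [B, head, F, model_dim]
#   k = torch.rand(2, 8, 10, 64)
#   v = torch.rand(2, 8, 10, 64)
#   scores, context = attn(q, k, v)
#   # scores: (2, 8, 10, 10), context: (2, 8, 10, 64)
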
# if __name__ == '__main__':
#     import torch
#
#     test_shape = (1, 196 + 1, 768)
#     input_x = torch.rand(test_shape)  # shape of output from patch_embed
#     model = MultiHeadAttention(dim=test_shape[2])
#     out = model.forward(input_x)
#
#     assert out.shape == (1, 197, 768)