Source code for towhee.models.coformer.transformer

# Copyright 2022 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original code from https://github.com/jhcho99/CoFormer.
#
# Modified by Zilliz.

"""
Transformer Architectures in CoFormer
"""
import copy
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn, Tensor

class Transformer(nn.Module):
    """
    Transformer class.
    """
    def __init__(self,
                 d_model=512,
                 nhead=8,
                 num_glance_enc_layers=3,
                 num_gaze_s1_dec_layers=3,
                 num_gaze_s1_enc_layers=3,
                 num_gaze_s2_dec_layers=3,
                 dim_feedforward=2048,
                 dropout=0.15,
                 activation="relu"
                 ):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.num_verb_classes = 504

        # Glance Transformer
        glance_enc_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        self.glance_enc = TransformerEncoder(glance_enc_layer, num_glance_enc_layers)

        # Gaze-Step1 Transformer
        gaze_s1_dec_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        self.gaze_s1_dec = TransformerDecoder(gaze_s1_dec_layer, num_gaze_s1_dec_layers)
        gaze_s1_enc_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        self.gaze_s1_enc = TransformerEncoder(gaze_s1_enc_layer, num_gaze_s1_enc_layers)

        # Gaze-Step2 Transformer
        gaze_s2_dec_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        self.gaze_s2_dec = TransformerDecoder(gaze_s2_dec_layer, num_gaze_s2_dec_layers)

        # classifier (for verb prediction)
        self.verb_classifier = nn.Sequential(nn.Linear(d_model * 2, d_model * 2),
                                             nn.ReLU(),
                                             nn.Dropout(0.3),
                                             nn.Linear(d_model * 2, self.num_verb_classes))

        # layer norms
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model * 2)
        self.ln3 = nn.LayerNorm(d_model)
        self.ln4 = nn.LayerNorm(d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self,
                src,
                mask,
                il_token_embed,
                rl_token_embed,
                verb_token_embed,
                role_token_embed,
                pos_embed,
                vidx_ridx,
                targets=None,
                inference=False
                ):
        device = il_token_embed.device
        # flatten NxCxHxW to HWxNxC
        bs, _, h, w = src.shape
        flattened_src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        mask = mask.flatten(1)

        # Glance Transformer
        ## Encoder
        il_token = il_token_embed.unsqueeze(1).repeat(1, bs, 1)
        glance_enc_zero_mask = torch.zeros((bs, 1), dtype=torch.bool, device=device)
        mem_mask = torch.cat([glance_enc_zero_mask, mask], dim=1)
        il_token_flattened_src = torch.cat([il_token, flattened_src], dim=0)
        glance_enc_memory = self.glance_enc(il_token_flattened_src, src_key_padding_mask=mem_mask,
                                            pos=pos_embed, num_zeros=1)
        il_token_feature, aggregated_src = glance_enc_memory.split([1, h * w], dim=0)

        # Gaze-Step1 Transformer
        ## Decoder
        all_role_tokens = role_token_embed.unsqueeze(1).repeat(1, bs, 1)
        role_tgt = torch.zeros_like(all_role_tokens)
        extracted_rhs = self.gaze_s1_dec(all_role_tokens, self.ln1(flattened_src),
                                         memory_key_padding_mask=mask, pos=pos_embed, query_pos=role_tgt)
        extracted_rhs = extracted_rhs.transpose(1, 2)
        ## Encoder
        num_all_roles = 190
        rl_token = rl_token_embed.unsqueeze(1).repeat(1, bs, 1)
        gaze_s1_enc_zero_mask = torch.zeros((bs, (1 + num_all_roles)), dtype=torch.bool, device=device)
        rl_token_extracted_rhs = torch.cat([rl_token, extracted_rhs.view(num_all_roles, 1, -1)], dim=0)
        gaze_s1_enc_memory = self.gaze_s1_enc(rl_token_extracted_rhs,
                                              src_key_padding_mask=gaze_s1_enc_zero_mask,
                                              pos=None, num_zeros=(1 + num_all_roles))
        rl_token_feature, aggregated_rhs = gaze_s1_enc_memory.split([1, num_all_roles], dim=0)

        # Verb Prediction
        il_token_feature = il_token_feature.view(bs, -1)
        rl_token_feature = rl_token_feature.view(bs, -1)
        vhs = torch.cat([il_token_feature, rl_token_feature], dim=-1)
        vhs = self.ln2(vhs)
        verb_pred = self.verb_classifier(vhs).view(bs, self.num_verb_classes)

        # Gaze-Step2 Transformer
        ## Decoder
        ### At training time, we assume that the ground-truth verb is given.
        ##### Please see the evaluation details of the [Grounded Situation Recognition] task.
        ##### There are three evaluation settings: Top-1 Predicted Verb, Top-5 Predicted Verbs and Ground-Truth Verb.
        ##### If the top-1 predicted verb is incorrect, then grounded noun predictions in the Top-1 Predicted Verb setting are considered incorrect.
        ##### If the ground-truth verb is not included in the top-5 predicted verbs,
        ##### then grounded noun predictions in the Top-5 Predicted Verbs setting are considered incorrect.
        ##### In the Ground-Truth Verb setting, we only consider grounded noun predictions.
        ### At inference time, we use the predicted verb.
        #### For frame-role queries, we select the verb token embedding corresponding to the predicted verb.
        #### For frame-role queries, we select the role token embeddings corresponding to the roles associated with the predicted verb.
        if not inference:
            selected_verb_token = verb_token_embed[targets["verbs"]].view(1, -1)
            selected_roles = targets["roles"]
        else:
            top1_verb = torch.topk(verb_pred, k=1, dim=1)[1].item()
            selected_verb_token = verb_token_embed[top1_verb].view(1, -1)
            selected_roles = vidx_ridx[top1_verb]
        selected_role_tokens = role_token_embed[selected_roles]
        frame_role_queries = selected_role_tokens + selected_verb_token
        frame_role_queries = frame_role_queries.unsqueeze(1).repeat(1, bs, 1)
        role_tgt = torch.zeros_like(frame_role_queries)
        final_rhs = self.gaze_s2_dec(frame_role_queries, self.ln3(aggregated_src),
                                     memory_key_padding_mask=mask, pos=pos_embed, query_pos=role_tgt)
        final_rhs = self.ln4(final_rhs)
        final_rhs = final_rhs.transpose(1, 2)

        return verb_pred, extracted_rhs, aggregated_rhs, final_rhs, selected_roles
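

# The sketch below is hypothetical and not part of the original module: a
# minimal smoke test whose tensor shapes are inferred from the forward pass
# above. Batch size 1 is assumed, since the Gaze-Step1 reshape
# (extracted_rhs.view(num_all_roles, 1, -1)) and topk(...).item() only
# handle a single sample.
def _example_inference():
    d_model, bs, h, w = 512, 1, 7, 7
    model = build_transformer(d_model=d_model)
    model.eval()
    with torch.no_grad():
        verb_pred, extracted_rhs, aggregated_rhs, final_rhs, selected_roles = model(
            src=torch.randn(bs, d_model, h, w),            # backbone feature map
            mask=torch.zeros(bs, h, w, dtype=torch.bool),  # no padded pixels
            il_token_embed=torch.randn(1, d_model),        # image-level token
            rl_token_embed=torch.randn(1, d_model),        # role-level token
            verb_token_embed=torch.randn(504, d_model),    # one token per verb class
            role_token_embed=torch.randn(190, d_model),    # one token per role
            pos_embed=torch.randn(bs, d_model, h, w),      # positional encoding
            vidx_ridx=[[0, 1, 2]] * 504,                   # dummy verb-to-roles table
            inference=True,
        )
    print(verb_pred.shape)  # torch.Size([1, 504])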


class TransformerEncoder(nn.Module):
    """
    TransformerEncoder class.
    """

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                num_zeros=None):
        output = src
        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask,
                           pos=pos, num_zeros=num_zeros)
        return output


class TransformerDecoder(nn.Module):
    """
    TransformerDecoder class.
    """

    def __init__(self, decoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt
        for layer in self.layers:
            output = layer(output, memory,
                           tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
        return output.unsqueeze(0)


class TransformerEncoderLayer(nn.Module):
    """
    TransformerEncoderLayer class.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.15, activation="relu"):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def with_pos_embed(self, tensor, pos: Optional[Tensor], num_zeros=None):
        if num_zeros is not None:
            return tensor if pos is None else torch.cat([tensor[:num_zeros], (tensor[num_zeros:] + pos)], dim=0)
        else:
            return tensor if pos is None else tensor + pos
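
    # Hypothetical illustration (not in the original code): with num_zeros=1,
    # the first token, e.g. the IL token prepended in the Glance encoder, is
    # left without positional encoding while the remaining tokens receive it:
    #   t = torch.zeros(3, 1, 4)                 # 1 special token + 2 features
    #   p = torch.ones(2, 1, 4)                  # positions for the 2 features
    #   layer.with_pos_embed(t, p, num_zeros=1)  # row 0 stays 0, rows 1-2 become 1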

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                num_zeros=None):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos, num_zeros=num_zeros)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src
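

# A hypothetical standalone check (not in the original module): the pre-norm
# encoder layer maps a (seq_len, batch, d_model) tensor to the same shape.
def _example_encoder_layer():
    layer = TransformerEncoderLayer(d_model=512, nhead=8)
    x = torch.randn(49, 2, 512)    # a 7x7 feature map flattened, batch of 2
    pos = torch.randn(49, 2, 512)  # positional encodings for every token
    out = layer(x, pos=pos)
    print(out.shape)               # torch.Size([49, 2, 512])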


class TransformerDecoderLayer(nn.Module):
    """
    TransformerDecoderLayer class.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.15, activation="relu"):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt
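

# A hypothetical standalone check (not in the original module): the decoder
# layer cross-attends query tokens to an encoder memory of a different length.
def _example_decoder_layer():
    layer = TransformerDecoderLayer(d_model=512, nhead=8)
    queries = torch.randn(190, 2, 512)  # one query per role, batch of 2
    memory = torch.randn(49, 2, 512)    # flattened image features
    out = layer(queries, memory, pos=torch.randn(49, 2, 512),
                query_pos=torch.zeros(190, 2, 512))
    print(out.shape)                    # torch.Size([190, 2, 512])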


def _get_clones(module, n):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


def build_transformer(
        d_model=512,
        dropout=0.15,
        nhead=8,
        num_glance_enc_layers=3,
        num_gaze_s1_dec_layers=3,
        num_gaze_s1_enc_layers=3,
        num_gaze_s2_dec_layers=3,
        dim_feedforward=2048
):
    return Transformer(
        d_model=d_model,
        dropout=dropout,
        nhead=nhead,
        num_glance_enc_layers=num_glance_enc_layers,
        num_gaze_s1_dec_layers=num_gaze_s1_dec_layers,
        num_gaze_s1_enc_layers=num_gaze_s1_enc_layers,
        num_gaze_s2_dec_layers=num_gaze_s2_dec_layers,
        dim_feedforward=dim_feedforward
    )
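

# A hypothetical quick check (not in the original module): build the model
# with its defaults and count the trainable parameters.
def _example_build():
    model = build_transformer()
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"transformer parameters: {n_params:,}")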


def _get_activation_fn(activation):
    """Return an activation function given a string."""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")