# Source code for towhee.models.lightning_dot.bi_encoder

# Built on top of the original implementation at https://github.com/intersun/LightningDOT
#
# Modifications by Copyright 2022 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any

import torch

from collections import defaultdict
from torch import nn


class BiEncoder(nn.Module):
    """
    Bi-Encoder model component. Encapsulates query/question and context/passage encoders.

    Args:
        UniterEncoder (nn.Module):
            Encoder class used for the image side (and for text when
            ``args.txt_model_type == 'uniter-base'``). Must expose an
            ``init_encoder(config, checkpoint_path=..., project_dim=...)`` constructor.
        BertEncoder (nn.Module):
            Encoder class used for the text side when ``args.txt_model_type == 'bert-base'``.
        args (Any):
            Config namespace; must provide ``img_model_type``, ``txt_model_type``,
            ``img_model_config``, ``txt_model_config``, ``img_checkpoint`` and
            ``txt_checkpoint``.
        fix_img_encoder (bool):
            Freeze the image encoder (parameters get ``requires_grad=False`` and its
            forward pass runs under ``torch.no_grad``).
        fix_txt_encoder (bool):
            Freeze the text encoder in the same way.
        project_dim (int):
            Projection dimension forwarded to the encoders' ``init_encoder``.

    Raises:
        ValueError: if ``args.img_model_type`` / ``args.txt_model_type`` is unsupported.
    """

    def __init__(self, UniterEncoder: nn.Module, BertEncoder: nn.Module, args: Any,  # pylint: disable=invalid-name
                 fix_img_encoder: bool = False, fix_txt_encoder: bool = False, project_dim: int = 0):
        super().__init__()
        # Only the UNITER architecture is supported for the image encoder.
        if args.img_model_type == 'uniter-base':
            self.img_model = UniterEncoder.init_encoder(args.img_model_config,
                                                        checkpoint_path=args.img_checkpoint,
                                                        project_dim=project_dim)
        else:
            raise ValueError(f'image encoder does not support other types ({args.img_model_type}) for now')

        # The text encoder may be either a plain BERT or a UNITER model.
        if args.txt_model_type == 'bert-base':
            self.txt_model = BertEncoder.init_encoder(args.txt_model_config,
                                                      checkpoint_path=args.txt_checkpoint,
                                                      project_dim=project_dim)
        elif args.txt_model_type == 'uniter-base':
            self.txt_model = UniterEncoder.init_encoder(args.txt_model_config,
                                                        checkpoint_path=args.txt_checkpoint,
                                                        project_dim=project_dim)
        else:
            raise ValueError(f'txt encoder does not support other types ({args.txt_model_type}) for now')

        self.fix_img_encoder = fix_img_encoder
        self.fix_txt_encoder = fix_txt_encoder
        self.project_dim = project_dim
        # Freezing: turn off gradients for every parameter of the frozen encoder(s).
        if fix_txt_encoder:
            for param in self.txt_model.parameters():
                param.requires_grad = False
        if fix_img_encoder:
            for param in self.img_model.parameters():
                param.requires_grad = False

    @staticmethod
    def get_representation(sub_model, input_ids, attention_mask, position_ids, img_feat, img_pos_feat,
                           img_masks, gather_index=None, fix_encoder=False):
        """Run ``sub_model`` on the given inputs.

        Returns:
            Tuple of ``(sequence_output, pooled_output, hidden_states)`` as produced
            by ``sub_model``.

        When ``fix_encoder`` is True the forward pass runs under ``torch.no_grad`` so
        no graph is recorded inside the frozen encoder. If the model is in training
        mode, ``requires_grad`` is re-enabled on the outputs so layers stacked on top
        of the frozen encoder can still receive gradients.
        """
        if fix_encoder:
            with torch.no_grad():
                sequence_output, pooled_output, hidden_states = sub_model(input_ids, attention_mask,
                                                                          position_ids, img_feat,
                                                                          img_pos_feat, img_masks,
                                                                          gather_index)
        else:
            sequence_output, pooled_output, hidden_states = sub_model(input_ids, attention_mask,
                                                                      position_ids, img_feat,
                                                                      img_pos_feat, img_masks,
                                                                      gather_index)

        if sub_model.training:
            sequence_output.requires_grad_(requires_grad=True)
            pooled_output.requires_grad_(requires_grad=True)

        return sequence_output, pooled_output, hidden_states

    def forward(self, batch, output_all_encoded_layers=False):
        """Encode the (optional) ``txts``, ``imgs`` and ``caps`` entries of ``batch``.

        Each entry is a sub-batch dict holding the keys consumed by
        ``get_representation`` (``input_ids``, ``attention_mask``, ``position_ids``,
        ``img_feat``, ``img_pos_feat``, ``img_masks``, ``gather_index``). Missing
        entries yield ``None`` outputs.

        Returns:
            ``(txt, img, cap)`` — sequence outputs when
            ``output_all_encoded_layers`` is True, pooled outputs otherwise.
        """
        # Missing keys resolve to None instead of raising.
        batch = defaultdict(lambda: None, batch)

        if 'txts' in batch:
            sb = batch['txts']
            txt_seq, txt_pooled, _ = self.get_representation(self.txt_model, sb['input_ids'],
                                                             sb['attention_mask'], sb['position_ids'],
                                                             sb['img_feat'], sb['img_pos_feat'],
                                                             sb['img_masks'], sb['gather_index'],
                                                             self.fix_txt_encoder)
        else:
            txt_seq, txt_pooled = None, None

        if 'imgs' in batch:
            sb = batch['imgs']
            # BUGFIX: was self.fix_txt_encoder — the image branch must honor the
            # image-encoder freeze flag, not the text one.
            img_seq, img_pooled, _ = self.get_representation(self.img_model, sb['input_ids'],
                                                             sb['attention_mask'], sb['position_ids'],
                                                             sb['img_feat'], sb['img_pos_feat'],
                                                             sb['img_masks'], sb['gather_index'],
                                                             self.fix_img_encoder)
        else:
            img_seq, img_pooled = None, None

        # Captions are encoded with the text encoder; skipped when absent or empty.
        if 'caps' in batch and batch['caps']['input_ids'] is not None:
            sb = batch['caps']
            cap_seq, cap_pooled, _ = self.get_representation(self.txt_model, sb['input_ids'],
                                                             sb['attention_mask'], sb['position_ids'],
                                                             sb['img_feat'], sb['img_pos_feat'],
                                                             sb['img_masks'], sb['gather_index'],
                                                             self.fix_txt_encoder)
        else:
            cap_seq, cap_pooled = None, None

        if output_all_encoded_layers:
            return txt_seq, img_seq, cap_seq
        else:
            return txt_pooled, img_pooled, cap_pooled