# original code from the upstream DRL implementation (source URL missing in this copy)
# modified by Zilliz

from collections import OrderedDict
from typing import Optional
from types import SimpleNamespace
import torch
import logging
from torch import nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from towhee.models import clip

from towhee.models.drl.until_module import convert_weights
from towhee.models.drl.module_cross import CrossModel, Transformer as TransformerClip
from towhee.models.drl.until_module import LayerNorm, AllGather, AllGather2, CrossEn

# Functional entry points of the two autograd all-gather helpers, used to merge per-process batches.
allgather, allgather2 = AllGather.apply, AllGather2.apply

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class DRL(nn.Module):
    """
    PyTorch implementation of the paper
    "Disentangled Representation Learning for Text-Video Retrieval".

    Args:
        base_encoder (`str`):
            CLIP encoder backbone. default: `clip_vit_b32`
        agg_module (`str`):
            Feature aggregation module for video. default: `seqTransf`,
            choices=[`ndone`, `seqLSTM`, `seqTransf`]
        interaction (`str`):
            Interaction type for retrieval. default: `wti`.
        wti_arch (`int`):
            Select an architecture for the weight branch. default: 2.
        cdcr (`int`):
            Channel decorrelation regularization. default: 3.
        cdcr_alpha1 (`float`):
            Coefficient 1 for channel decorrelation regularization. default: 1.0.
        cdcr_alpha2 (`float`):
            Coefficient 2 for channel decorrelation regularization. default: 0.06.
        cdcr_lambda (`float`):
            Coefficient for channel decorrelation regularization. default: 0.001.
        cross_num_hidden_layers (`int`):
            Number of hidden layers for cross transformer interaction.
        backbone_pretrained (`bool`):
            Forwarded as `pretrained` to `clip.create_model`. default: False.
    """

    def __init__(self,
                 base_encoder: str = "clip_vit_b32",
                 agg_module: str = "seqTransf",
                 interaction: str = "wti",
                 wti_arch: int = 2,
                 cdcr: int = 3,
                 cdcr_alpha1: float = 1.0,
                 cdcr_alpha2: float = 0.06,
                 cdcr_lambda: float = 0.001,
                 cross_num_hidden_layers: Optional[int] = None,
                 backbone_pretrained: bool = False
                 ):
        super().__init__()
        self.base_encoder = base_encoder
        self.agg_module = agg_module
        self.interaction = interaction
        self.wti_arch = wti_arch
        self.cdcr = cdcr
        self.cdcr_alpha1 = cdcr_alpha1
        self.cdcr_alpha2 = cdcr_alpha2
        self.cdcr_lambda = cdcr_lambda

        self.clip = clip.create_model(model_name=base_encoder, pretrained=backbone_pretrained,
                                      jit=False, clip4clip=True)
        state_dict = self.clip.state_dict()

        context_length = state_dict["positional_embedding"].shape[0]
        transformer_width = state_dict["ln_final.weight"].shape[0]
        transformer_heads = transformer_width // 64
        if torch.cuda.is_available():
            convert_weights(self.clip)  # fp16

        cross_config = SimpleNamespace(**{
            "hidden_dropout_prob": 0.1,
            "hidden_size": 512,
            "max_position_embeddings": 128,
            "num_attention_heads": 8,
            "num_hidden_layers": 4,
            "vocab_size": 512,
        })
        # The cross/aggregation modules share the text encoder's geometry.
        cross_config.max_position_embeddings = context_length
        cross_config.hidden_size = transformer_width
        self.cross_config = cross_config

        if self.interaction == "xti":
            if cross_num_hidden_layers is not None:
                setattr(cross_config, "num_hidden_layers", cross_num_hidden_layers)
            self.cross = CrossModel(cross_config)
            self.similarity_dense = nn.Linear(cross_config.hidden_size, 1)
        elif self.interaction == "mlp":
            self.similarity_dense = nn.Sequential(nn.Linear(transformer_width * 2, transformer_width),
                                                  nn.ReLU(inplace=True),
                                                  nn.Linear(transformer_width, 1))
        elif self.interaction == "wti":
            # Text branch is built before the video branch so parameter creation order is unchanged.
            text_branch = self._make_weight_branch(transformer_width, self.wti_arch)
            video_branch = self._make_weight_branch(transformer_width, self.wti_arch)
            if text_branch is not None:
                self.text_weight_fc = text_branch
                self.video_weight_fc = video_branch

        if self.agg_module in ["seqLSTM", "seqTransf"]:
            self.frame_position_embeddings = nn.Embedding(cross_config.max_position_embeddings,
                                                          cross_config.hidden_size)
            if self.agg_module == "seqTransf":
                self.transformerClip = TransformerClip(width=transformer_width,  # pylint: disable=invalid-name
                                                       layers=cross_config.num_hidden_layers,
                                                       heads=transformer_heads)
            if self.agg_module == "seqLSTM":
                self.lstm_visual = nn.LSTM(input_size=cross_config.hidden_size,
                                           hidden_size=cross_config.hidden_size,
                                           batch_first=True, bidirectional=False, num_layers=1)

        self.loss_fct = CrossEn()
        # Random init must happen before copying backbone weights into the new modules.
        # NOTE(review): `apply` also visits the sub-modules of `self.clip`; if the backbone holds
        # nn.Linear / nn.Embedding layers their weights are re-drawn here - confirm this is intended
        # when `backbone_pretrained=True`.
        self.apply(self.init_weights)

        # ===> Initialization trick [HARD CODE]
        # Seed the freshly created modules from the CLIP text transformer weights.
        new_state_dict = OrderedDict()
        max_layers = cross_config.num_hidden_layers

        if self.interaction == "xti":
            contain_cross = any("cross.transformer" in key for key in state_dict)
            if not contain_cross:
                for key, val in state_dict.items():
                    if key == "positional_embedding":
                        new_state_dict["cross.embeddings.position_embeddings.weight"] = val.clone()
                        continue
                    if key.startswith("transformer.resblocks"):
                        num_layer = int(key.split(".")[2])
                        # cut from beginning
                        if num_layer < max_layers:
                            new_state_dict["cross." + key] = val.clone()
                            continue

        if self.agg_module in ["seqLSTM", "seqTransf"]:
            contain_frame_position = any("frame_position_embeddings" in key for key in state_dict)
            if not contain_frame_position:
                for key, val in state_dict.items():
                    if key == "positional_embedding":
                        new_state_dict["frame_position_embeddings.weight"] = val.clone()
                        continue
                    if self.agg_module in ["seqTransf"] and key.startswith("transformer.resblocks"):
                        num_layer = int(key.split(".")[2])
                        # cut from beginning
                        if num_layer < max_layers:
                            new_state_dict[key.replace("transformer.", "transformerClip.")] = val.clone()
                            continue

        # only update new state (seqTransf/seqLSTM/tightTransf)
        self.load_state_dict(new_state_dict, strict=False)
        # <=== End of initialization trick

    @staticmethod
    def _make_weight_branch(width, arch):
        """Build one token-weight head for `wti`; returns None for an unknown `arch`."""
        if arch == 1:
            return nn.Linear(width, 1)
        if arch in (2, 3):
            layers = []
            for _ in range(arch - 1):
                layers.extend([nn.Linear(width, width), nn.ReLU(inplace=True)])
            layers.append(nn.Linear(width, 1))
            return nn.Sequential(*layers)
        return None

    def _should_gather(self):
        """Return True only when cross-process batch merging can actually run."""
        return (self.training
                and torch.cuda.is_available()
                and torch.distributed.is_available()
                and torch.distributed.is_initialized())

    def _gather_args(self):
        """
        Return the argument object handed to `allgather` / `allgather2`.

        An externally attached `self.config` is honoured for backward compatibility; otherwise
        rank information is read from the initialized process group.
        """
        config = getattr(self, "config", None)
        if config is not None:
            return config
        # assumes the gather helpers read `local_rank` / `world_size` from this object
        # - TODO confirm against until_module.
        return SimpleNamespace(local_rank=torch.distributed.get_rank(),
                               world_size=torch.distributed.get_world_size())

    @staticmethod
    def _standardize(feat):
        """Zero-mean / unit-std normalisation along the first (sample) dimension."""
        return (feat - feat.mean(0)) / feat.std(0)

    def _decorrelation_loss(self, corr):
        """
        Channel decorrelation penalty on a square cross-correlation matrix.

        Diagonal entries are pushed towards 1 and off-diagonal entries towards 0, weighted by
        `cdcr_alpha1` and `cdcr_alpha2`. `corr` is modified in place, as in the original code.
        """
        dim = corr.shape[0]
        on_diag = torch.diagonal(corr).add_(-1).pow_(2).sum()
        # Dropping the first element and viewing as (D-1, D+1) puts every diagonal entry in the
        # last column, so `[:, :-1]` keeps exactly the off-diagonal entries.
        off_diag = corr.flatten()[1:].view(dim - 1, dim + 1)[:, :-1].pow_(2).sum()
        return on_diag * self.cdcr_alpha1 + off_diag * self.cdcr_alpha2

    def forward(self, text_ids, text_mask, video, video_mask=None):
        """
        Compute the training loss for a batch of text-video pairs.

        Args:
            text_ids: token ids, flattened to 2-D on the last dimension.
            text_mask: text attention mask, flattened like `text_ids`.
            video: 5-D frame tensor (B x N_v x C x H x W).
            video_mask: frame mask (B x N_v); all frames are treated as valid when None.

        Returns:
            The scalar loss in training mode, `None` otherwise.
        """
        text_ids = text_ids.view(-1, text_ids.shape[-1])
        text_mask = text_mask.view(-1, text_mask.shape[-1])

        # B x N_v x 3 x H x W -> (B x N_v) x 3 x H x W
        video = torch.as_tensor(video).float()
        b, n_v, d, h, w = video.shape
        if video_mask is None:
            # The signature advertises None as valid; treat it as "no padded frames".
            video_mask = torch.ones(b, n_v, dtype=text_mask.dtype, device=text_mask.device)
        video_mask = video_mask.view(-1, video_mask.shape[-1])
        video = video.view(b * n_v, d, h, w)

        text_feat, video_feat = self.get_text_video_feat(text_ids, video, video_mask, shaped=True)

        if self.training:
            sim_matrix1, sim_matrix2, cdcr_loss = self.get_similarity_logits(text_feat, video_feat,
                                                                             text_mask, video_mask,
                                                                             shaped=True)
            sim_loss = (self.loss_fct(sim_matrix1) + self.loss_fct(sim_matrix2)) / 2.0
            loss = sim_loss + cdcr_loss * self.cdcr_lambda
            return loss
        else:
            return None

    def get_text_feat(self, text_ids, shaped=False):
        """Encode token ids with the CLIP text encoder and return per-token hidden states."""
        if shaped is False:
            text_ids = text_ids.view(-1, text_ids.shape[-1])

        bs_pair = text_ids.size(0)
        text_feat = self.clip.encode_text(text_ids, clip4clip=True, return_hidden=True)[1].float()
        text_feat = text_feat.view(bs_pair, -1, text_feat.size(-1))
        return text_feat

    def get_video_feat(self, video, video_mask, shaped=False):
        """Encode frames with the CLIP image encoder, then aggregate them with `agg_module`."""
        if shaped is False:
            video_mask = video_mask.view(-1, video_mask.shape[-1])
            video = torch.as_tensor(video).float()
            b, n_v, d, h, w = video.shape
            video = video.view(b * n_v, d, h, w)

        bs_pair = video_mask.size(0)
        video_feat = self.clip.encode_image(video).float()
        video_feat = video_feat.float().view(bs_pair, -1, video_feat.size(-1))
        video_feat = self.aggvideo_feat(video_feat, video_mask, self.agg_module)
        return video_feat

    def get_text_video_feat(self, text_ids, video, video_mask, shaped=False):
        """Return `(text_feat, video_feat)` for a batch; reshapes the inputs unless `shaped`."""
        if shaped is False:
            text_ids = text_ids.view(-1, text_ids.shape[-1])
            video_mask = video_mask.view(-1, video_mask.shape[-1])
            video = torch.as_tensor(video).float()
            b, n_v, d, h, w = video.shape
            video = video.view(b * n_v, d, h, w)

        text_feat = self.get_text_feat(text_ids, shaped=True)
        video_feat = self.get_video_feat(video, video_mask, shaped=True)
        return text_feat, video_feat

    def get_video_avg_feat(self, video_feat, video_mask):
        """Masked mean over the frame dimension: B x N_v x D -> B x D."""
        video_mask_un = video_mask.to(dtype=torch.float).unsqueeze(-1)
        video_feat = video_feat * video_mask_un
        video_mask_un_sum = torch.sum(video_mask_un, dim=1, dtype=torch.float)
        # Avoid dividing by zero for samples without any valid frame.
        video_mask_un_sum[video_mask_un_sum == 0.] = 1.
        video_feat = torch.sum(video_feat, dim=1) / video_mask_un_sum
        return video_feat

    def get_text_sep_feat(self, text_feat, text_mask):
        """Pick the feature of the last unmasked token per sample: B x N_t x D -> B x 1 x D."""
        text_feat = text_feat.contiguous()
        text_feat = text_feat[torch.arange(text_feat.shape[0]), torch.sum(text_mask, dim=-1) - 1, :]
        text_feat = text_feat.unsqueeze(1).contiguous()
        return text_feat

    def aggvideo_feat(self, video_feat, video_mask, agg_module):
        """
        Aggregate frame features over time.

        `"ndone"` (and any unrecognised value) leaves the features untouched; `"seqLSTM"` and
        `"seqTransf"` add the output of the sequence model to the input as a residual.
        """
        video_feat = video_feat.contiguous()
        if agg_module == "ndone":
            pass
        elif agg_module == "seqLSTM":
            # Sequential type: LSTM
            video_feat_original = video_feat
            video_feat = pack_padded_sequence(video_feat, torch.sum(video_mask, dim=-1).cpu(),
                                              batch_first=True, enforce_sorted=False)
            video_feat, _ = self.lstm_visual(video_feat)
            if self.training:
                self.lstm_visual.flatten_parameters()
            video_feat, _ = pad_packed_sequence(video_feat, batch_first=True)
            # Unpacking trims to the longest valid sequence; pad back with the original tail.
            video_feat = torch.cat(
                (video_feat, video_feat_original[:, video_feat.size(1):, ...].contiguous()), dim=1)
            video_feat = video_feat + video_feat_original
        elif agg_module == "seqTransf":
            # Sequential type: Transformer Encoder
            video_feat_original = video_feat
            seq_length = video_feat.size(1)
            position_ids = torch.arange(seq_length, dtype=torch.long, device=video_feat.device)
            position_ids = position_ids.unsqueeze(0).expand(video_feat.size(0), -1)
            frame_position_embeddings = self.frame_position_embeddings(position_ids)
            video_feat = video_feat + frame_position_embeddings
            # Additive attention mask: large negative value on padded frames.
            extended_video_mask = (1.0 - video_mask.unsqueeze(1)) * -1000000.0
            extended_video_mask = extended_video_mask.expand(-1, video_mask.size(1), -1)
            video_feat = video_feat.permute(1, 0, 2)  # NLD -> LND
            video_feat = self.transformerClip(video_feat, extended_video_mask)
            video_feat = video_feat.permute(1, 0, 2)  # LND -> NLD
            video_feat = video_feat + video_feat_original
        return video_feat

    def dp_interaction(self, text_feat, video_feat, text_mask, video_mask):
        """Dot-product interaction between one sentence vector and one mean-pooled video vector."""
        text_feat = self.get_text_sep_feat(text_feat, text_mask)  # B x 1 x D

        if self._should_gather():  # batch merge here
            gather_args = self._gather_args()
            text_feat = allgather(text_feat, gather_args)
            video_feat = allgather(video_feat, gather_args)
            video_mask = allgather(video_mask, gather_args)
            torch.distributed.barrier()  # force sync

        text_feat = text_feat.squeeze(1)  # B x 1 x D -> B x D
        text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)  # B x D
        video_feat = video_feat / video_feat.norm(dim=-1, keepdim=True)
        video_feat = self.get_video_avg_feat(video_feat, video_mask)  # B x N_v x D -> B x D
        video_feat = video_feat / video_feat.norm(dim=-1, keepdim=True)

        retrieve_logits = torch.matmul(text_feat, video_feat.t())
        if self.training:
            logit_scale = self.clip.logit_scale.exp()
            retrieve_logits = logit_scale * retrieve_logits

            if self.cdcr != 0:
                z_a_norm = self._standardize(text_feat)  # B x D
                z_b_norm = self._standardize(video_feat)  # B x D
                # cross-correlation matrix
                batch = z_a_norm.shape[0]
                c = torch.einsum("bm,bn->mn", z_a_norm, z_b_norm) / batch  # D x D
                cdcr_loss = self._decorrelation_loss(c)
                return retrieve_logits, retrieve_logits.T, cdcr_loss
            else:
                return retrieve_logits, retrieve_logits.T, 0.0
        else:
            return retrieve_logits, retrieve_logits.T, 0.0

    def _get_cross_feat(self, text_feat, video_feat, text_mask, video_mask):
        """Run the cross transformer on the concatenation of text tokens and video frames."""
        concat_feats = torch.cat((text_feat, video_feat), dim=1)  # concatenate tokens and frames
        concat_mask = torch.cat((text_mask, video_mask), dim=1)
        text_type_ = torch.zeros_like(text_mask)
        video_type_ = torch.ones_like(video_mask)
        concat_type = torch.cat((text_type_, video_type_), dim=1)

        cross_layers, pooled_feat = self.cross(concat_feats, concat_type, concat_mask,
                                               output_all_encoded_layers=True)
        cross_feat = cross_layers[-1]
        return cross_feat, pooled_feat, concat_mask

    def xti_interaction(self, text_feat, video_feat, text_mask, video_mask):
        """Cross-transformer interaction scoring every (text, video) pair jointly."""
        text_feat = self.get_text_sep_feat(text_feat, text_mask)  # B x 1 x D
        b_text, s_text, d_text = text_feat.size()
        b_video, s_video, d_video = video_feat.size()

        rank = 0
        if self._should_gather():  # batch merge here
            gather_args = self._gather_args()
            rank = gather_args.local_rank
            text_feat_full = allgather2(text_feat, gather_args)
            video_feat_full = allgather2(video_feat, gather_args)
            video_mask_full = allgather2(video_mask, gather_args)
            # Keep only this process' slice as the "local" side of each comparison.
            text_feat = text_feat_full[b_text * rank: b_text * (1 + rank)]
            video_feat = video_feat_full[b_video * rank: b_video * (1 + rank)]
            torch.distributed.barrier()  # force sync
        else:
            text_feat_full = text_feat
            video_feat_full = video_feat
            video_mask_full = video_mask

        b_text_full = text_feat_full.shape[0]
        b_video_full = video_feat_full.shape[0]
        # The text side is a single token per sample, so its mask is all ones.
        text_mask = torch.ones(text_feat.size(0), 1).to(device=text_mask.device, dtype=text_mask.dtype)
        text_mask_full = torch.ones(text_feat_full.size(0), 1).to(device=text_mask.device,
                                                                  dtype=text_mask.dtype)

        # tV: local texts against all videos
        text_feat_1 = text_feat.unsqueeze(1).repeat(1, b_video_full, 1, 1)  # b_t x B_v x n_t x d_t
        text_feat_1 = text_feat_1.view(-1, s_text, d_text)  # (b_t x B_v) x n_t x d_t
        text_mask_1 = text_mask.unsqueeze(1).repeat(1, b_video_full, 1)  # b_t x B_v x 1
        text_mask_1 = text_mask_1.view(-1, s_text)  # (b_t x B_v) x 1
        video_feat_1 = video_feat_full.unsqueeze(0).repeat(b_text, 1, 1, 1)  # b_t x B_v x n_v x d_v
        video_feat_1 = video_feat_1.view(-1, s_video, d_video)  # (b_t x B_v) x n_v x d_v
        video_mask_1 = video_mask_full.unsqueeze(0).repeat(b_text, 1, 1)  # b_t x B_v x n_v
        video_mask_1 = video_mask_1.view(-1, s_video)  # (b_t x B_v) x n_v

        # vT: all texts against local videos
        text_feat_2 = text_feat_full.unsqueeze(1).repeat(1, b_video, 1, 1)  # B_t x b_v x n_t x d_t
        text_feat_2 = text_feat_2.view(-1, s_text, d_text)  # (B_t x b_v) x n_t x d_t
        text_mask_2 = text_mask_full.unsqueeze(1).repeat(1, b_video, 1)  # B_t x b_v x 1
        text_mask_2 = text_mask_2.view(-1, s_text)  # (B_t x b_v) x 1
        video_feat_2 = video_feat.unsqueeze(0).repeat(b_text_full, 1, 1, 1)  # B_t x b_v x n_v x d_v
        video_feat_2 = video_feat_2.view(-1, s_video, d_video)  # (B_t x b_v) x n_v x d_v
        video_mask_2 = video_mask.unsqueeze(0).repeat(b_text_full, 1, 1)  # B_t x b_v x n_v
        video_mask_2 = video_mask_2.view(-1, s_video)  # (B_t x b_v) x n_v

        _, pooled_feat, _ = \
            self._get_cross_feat(text_feat_1, video_feat_1, text_mask_1, video_mask_1)
        retrieve_logits_tv = self.similarity_dense(pooled_feat).squeeze(-1).view(b_text, b_video_full)

        _, pooled_feat, _ = \
            self._get_cross_feat(text_feat_2, video_feat_2, text_mask_2, video_mask_2)
        retrieve_logits_vt = self.similarity_dense(pooled_feat).squeeze(-1).view(b_text_full, b_video).T

        if self.training:
            logit_scale = self.clip.logit_scale.exp()
            # Rotate so that this process' positives sit on the leading diagonal.
            retrieve_logits_tv = torch.roll(retrieve_logits_tv, -b_text * rank, -1)
            retrieve_logits_vt = torch.roll(retrieve_logits_vt, -b_video * rank, -1)

            retrieve_logits_tv = logit_scale * retrieve_logits_tv
            retrieve_logits_vt = logit_scale * retrieve_logits_vt
            return retrieve_logits_tv, retrieve_logits_vt, 0.0
        else:
            return retrieve_logits_tv, retrieve_logits_vt, 0.0

    def wti_interaction(self, text_feat, video_feat, text_mask, video_mask):
        """
        Token-wise (`ti`) or weighted token-wise (`wti`) interaction.

        Returns:
            `(t2v_logits, v2t_logits, cdcr_loss)`; `cdcr_loss` is `0.0` outside training or when
            `cdcr` is not one of 1, 2, 3.
        """
        if self._should_gather():  # batch merge here
            gather_args = self._gather_args()
            text_feat = allgather(text_feat, gather_args)
            video_feat = allgather(video_feat, gather_args)
            text_mask = allgather(text_mask, gather_args)
            video_mask = allgather(video_mask, gather_args)
            torch.distributed.barrier()  # force sync

        if self.interaction == "wti":
            text_weight = self.text_weight_fc(text_feat).squeeze(2)  # B x N_t x D -> B x N_t
            # Padded positions get -inf so they receive zero weight after softmax.
            text_weight.masked_fill_((1 - text_mask).to(torch.bool), float("-inf"))
            text_weight = torch.softmax(text_weight, dim=-1)  # B x N_t

            video_weight = self.video_weight_fc(video_feat).squeeze(2)  # B x N_v x D -> B x N_v
            video_weight.masked_fill_((1 - video_mask).to(torch.bool), float("-inf"))
            video_weight = torch.softmax(video_weight, dim=-1)  # B x N_v

        text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
        video_feat = video_feat / video_feat.norm(dim=-1, keepdim=True)

        # Pairwise token/frame similarities with padded positions zeroed out.
        retrieve_logits = torch.einsum("atd,bvd->abtv", [text_feat, video_feat])
        retrieve_logits = torch.einsum("abtv,at->abtv", [retrieve_logits, text_mask])
        retrieve_logits = torch.einsum("abtv,bv->abtv", [retrieve_logits, video_mask])
        text_sum = text_mask.sum(-1)
        video_sum = video_mask.sum(-1)

        # max for video token
        if self.interaction == "ti":  # token-wise interaction
            t2v_logits, max_idx1 = retrieve_logits.max(dim=-1)  # abtv -> abt
            v2t_logits, max_idx2 = retrieve_logits.max(dim=-2)  # abtv -> abv
            t2v_logits = torch.sum(t2v_logits, dim=2) / (text_sum.unsqueeze(1))
            v2t_logits = torch.sum(v2t_logits, dim=2) / (video_sum.unsqueeze(0))
            retrieve_logits = (t2v_logits + v2t_logits) / 2.0
        elif self.interaction == "wti":  # weighted token-wise interaction
            t2v_logits, max_idx1 = retrieve_logits.max(dim=-1)  # abtv -> abt
            t2v_logits = torch.einsum("abt,at->ab", [t2v_logits, text_weight])

            v2t_logits, max_idx2 = retrieve_logits.max(dim=-2)  # abtv -> abv
            v2t_logits = torch.einsum("abv,bv->ab", [v2t_logits, video_weight])
            retrieve_logits = (t2v_logits + v2t_logits) / 2.0

        if not self.training:
            return retrieve_logits, retrieve_logits.T, 0.0

        logit_scale = self.clip.logit_scale.exp()
        retrieve_logits = logit_scale * retrieve_logits

        if self.cdcr == 1:
            # simple random: one random valid token / frame per sample
            text_feat = text_feat[torch.arange(text_feat.shape[0]),
                                  torch.randint_like(text_sum, 0, 10000) % text_sum, :]
            video_feat = video_feat[torch.arange(video_feat.shape[0]),
                                    torch.randint_like(video_sum, 0, 10000) % video_sum, :]
            z_a_norm = self._standardize(text_feat)  # B x D
            z_b_norm = self._standardize(video_feat)  # B x D

            # cross-correlation matrix
            batch = z_a_norm.shape[0]
            c = torch.einsum("ac,ad->cd", z_a_norm, z_b_norm) / batch  # D x D
            cdcr_loss = self._decorrelation_loss(c)
            return retrieve_logits, retrieve_logits.T, cdcr_loss

        if self.cdcr in (2, 3):
            # select max: pair every token / frame with its best match inside the positive pair
            max_idx1 = max_idx1[torch.arange(max_idx1.shape[0]), torch.arange(max_idx1.shape[1])]
            max_idx2 = max_idx2[torch.arange(max_idx2.shape[0]), torch.arange(max_idx2.shape[1])]

            max_t_feat = text_feat[torch.arange(max_idx2.shape[0]).repeat_interleave(max_idx2.shape[1]),
                                   max_idx2.flatten()]
            max_v_feat = video_feat[torch.arange(max_idx1.shape[0]).repeat_interleave(max_idx1.shape[1]),
                                    max_idx1.flatten()]
            if self.cdcr == 3:
                max_t_feat = max_t_feat.squeeze(1)
                max_v_feat = max_v_feat.squeeze(1)

            t_feat = text_feat.reshape(-1, text_feat.shape[-1])
            t_mask = text_mask.flatten().type(torch.bool)
            v_feat = video_feat.reshape(-1, video_feat.shape[-1])
            v_mask = video_mask.flatten().type(torch.bool)
            # Drop padded positions on both sides of each pairing.
            t_feat = t_feat[t_mask]
            v_feat = v_feat[v_mask]
            max_t_feat = max_t_feat[v_mask]
            max_v_feat = max_v_feat[t_mask]

            z_a_norm = self._standardize(t_feat)  # (B x N_t) x D
            z_b_norm = self._standardize(max_v_feat)  # (B x N_t) x D
            x_a_norm = self._standardize(v_feat)  # (B x N_v) x D
            x_b_norm = self._standardize(max_t_feat)  # (B x N_v) x D

            # cross-correlation matrix
            if self.cdcr == 2:
                c1 = torch.einsum("ac,ad->cd", z_a_norm, z_b_norm) / z_a_norm.shape[0]  # D x D
                c2 = torch.einsum("ac,ad->cd", x_a_norm, x_b_norm) / x_a_norm.shape[0]  # D x D
            else:
                text_weight = text_weight.flatten()[t_mask]
                video_weight = video_weight.flatten()[v_mask]
                batch = text_feat.shape[0]
                # Same weighted sum as einsum("acd,a->cd", einsum("ac,ad->acd", ...), w) without
                # materialising the (N x D x D) intermediate.
                c1 = torch.einsum("ac,ad->cd", z_a_norm * text_weight.unsqueeze(-1), z_b_norm) / batch
                c2 = torch.einsum("ac,ad->cd", x_a_norm * video_weight.unsqueeze(-1), x_b_norm) / batch
            c = (c1 + c2) / 2.0
            cdcr_loss = self._decorrelation_loss(c)
            return retrieve_logits, retrieve_logits.T, cdcr_loss

        return retrieve_logits, retrieve_logits.T, 0.0

    def get_similarity_logits(self, text_feat, video_feat, text_mask, video_mask, shaped=False):
        """
        Dispatch to the configured interaction.

        Returns:
            `(t2v_logits, v2t_logits, cdcr_loss)`.

        Raises:
            NotImplementedError: if `self.interaction` is not one of `dp`, `xti`, `ti`, `wti`.
        """
        if shaped is False:
            text_mask = text_mask.view(-1, text_mask.shape[-1])
            video_mask = video_mask.view(-1, video_mask.shape[-1])

        if self.interaction == "dp":
            t2v_logits, v2t_logits, cdcr_loss = self.dp_interaction(text_feat, video_feat,
                                                                    text_mask, video_mask)
        elif self.interaction == "xti":
            t2v_logits, v2t_logits, cdcr_loss = self.xti_interaction(text_feat, video_feat,
                                                                     text_mask, video_mask)
        elif self.interaction in ["ti", "wti"]:
            t2v_logits, v2t_logits, cdcr_loss = self.wti_interaction(text_feat, video_feat,
                                                                     text_mask, video_mask)
        else:
            raise NotImplementedError
        return t2v_logits, v2t_logits, cdcr_loss

    @property
    def dtype(self):
        """
        :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        """
        try:
            return next(self.parameters()).dtype
        except StopIteration:
            # For nn.DataParallel compatibility in PyTorch 1.5
            def find_tensor_attributes(module: nn.Module):
                tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
                return tuples

            gen = self._named_members(get_members_fn=find_tensor_attributes)
            first_tuple = next(gen)
            return first_tuple[1].dtype

    def init_weights(self, module):
        """ Initialize the weights. """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, LayerNorm):
            if "beta" in dir(module) and "gamma" in dir(module):
                module.beta.data.zero_()
                module.gamma.data.fill_(1.0)
            else:
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
def create_model(
        base_encoder: str = "clip_vit_b32",
        agg_module: str = "seqTransf",
        interaction: str = "wti",
        wti_arch: int = 2,
        cdcr: int = 3,
        cdcr_alpha1: float = 1.0,
        cdcr_alpha2: float = 0.06,
        cdcr_lambda: float = 0.001,
        cross_num_hidden_layers: Optional[int] = None,
        pretrained: bool = False,
        weights_path: Optional[str] = None,
        device: Optional[str] = None
) -> "DRL":
    """
    Build a DRL model.

    Args:
        base_encoder (`str`):
            Base_encoder in DRL model, `clip_vit_b32` or `clip_vit_b16`.
        agg_module (`str`):
            Feature aggregation module for video. default: `seqTransf`,
            choices=[`ndone`, `seqLSTM`, `seqTransf`]
        interaction (`str`):
            Interaction type for retrieval. default: `wti`.
        wti_arch (`int`):
            Select an architecture for weight branch. default: 2.
        cdcr (`int`):
            Channel decorrelation regularization. default: 3.
        cdcr_alpha1 (`float`):
            Coefficient 1 for channel decorrelation regularization. default: 1.0.
        cdcr_alpha2 (`float`):
            Coefficient 2 for channel decorrelation regularization. default: 0.06.
        cdcr_lambda (`float`):
            Coefficient for channel decorrelation regularization. default: 0.001.
        cross_num_hidden_layers (`int`):
            Number of hidden layers for cross transformer interaction.
        pretrained (`bool`):
            Whether to load pretrained weights, default is False.
        weights_path (`str`):
            Pretrained model local path, default is None. Required when `pretrained` is True.
        device (`str`):
            `cpu` or `cuda`; used as `map_location` when loading `weights_path`.
            Defaults to `cuda` when available, otherwise `cpu`.

    Returns:
        (`DRL`): the constructed model.

    Raises:
        ValueError: if `pretrained` is True and `weights_path` is None.

    Example:
        >>> from towhee.models import drl
        >>> model = drl.create_model("clip_vit_b32")
        >>> model.__class__.__name__
        'DRL'
    """
    # Fail fast: there is no point in building the backbone if the weights cannot be loaded.
    if pretrained and weights_path is None:
        raise ValueError("weights_path is None")

    model = DRL(base_encoder=base_encoder,
                agg_module=agg_module,
                interaction=interaction,
                wti_arch=wti_arch,
                cdcr=cdcr,
                cdcr_alpha1=cdcr_alpha1,
                cdcr_alpha2=cdcr_alpha2,
                cdcr_lambda=cdcr_lambda,
                cross_num_hidden_layers=cross_num_hidden_layers)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE(review): `device` only controls where the checkpoint tensors are mapped; the model
    # itself is not moved here - confirm callers do `model.to(device)` themselves.

    if pretrained:
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        state_dict = torch.load(weights_path, map_location=device)
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # Preserve the version metadata that `_load_from_state_dict` consults, then work on a copy.
        metadata = getattr(state_dict, "_metadata", None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata  # pylint: disable=protected-access

        def load(module, prefix=""):
            # Recursive, non-raising variant of load_state_dict: problems are collected and
            # logged below instead of aborting the load.
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(  # pylint: disable=protected-access
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():  # pylint: disable=protected-access
                if child is not None:
                    load(child, prefix + name + ".")

        load(model, prefix="")

        if len(missing_keys) > 0:
            logger.info("Weights of %s not initialized from pretrained model: %s",
                        model.__class__.__name__, "\n " + "\n ".join(missing_keys))
        if len(unexpected_keys) > 0:
            logger.info("Weights from pretrained model not used in %s: %s",
                        model.__class__.__name__, "\n " + "\n ".join(unexpected_keys))
        if len(error_msgs) > 0:
            logger.error("Weights from pretrained model cause errors in %s: %s",
                         model.__class__.__name__, "\n " + "\n ".join(error_msgs))

    return model