Source code for towhee.models.clip4clip.clip4clip

# Built on top of the original implementation at
# Modifications by Copyright 2022 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import logging

from typing import Union
from torch import nn
from towhee.models.clip4clip.until_module import PreTrainedModel, CrossEn
from towhee.models.clip import CLIP, get_configs

logger = logging.getLogger(__name__)

class CLIP4ClipPreTrainedModel(PreTrainedModel, nn.Module):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading
    pretrained models.
    """
    def __init__(self):
        super().__init__()
        self.clip = None
class CLIP4Clip(CLIP4ClipPreTrainedModel):
    """
    CLIP4Clip model from the paper "CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval".
    Only the meanP similarity header and the 2d linear-patch type are retained because of their performance.

    Args:
        embed_dim (`int`):
            Embedding dimension.
        image_resolution (`int`):
            Image resolution.
        vision_layers (`int`):
            Number of layers for vision.
        vision_width (`int`):
            Width for vision.
        vision_patch_size (`int`):
            Patch size for vision.
        context_length (`int`):
            Length of context.
        vocab_size (`int`):
            Vocabulary size.
        transformer_width (`int`):
            Width for transformer.
        transformer_heads (`int`):
            Number of heads for transformer.
        transformer_layers (`int`):
            Number of layers for transformer.
    """
    def __init__(self,
                 embed_dim: int,
                 image_resolution: int,
                 vision_layers: int,
                 vision_width: int,
                 vision_patch_size: int,
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int):
        super().__init__()
        self.ignore_video_index = -1
        self.loose_type = True
        self.linear_patch = "2d"
        self.clip = CLIP(
            embed_dim=embed_dim,
            image_resolution=image_resolution,
            vision_layers=vision_layers,
            vision_width=vision_width,
            vision_patch_size=vision_patch_size,
            context_length=context_length,
            vocab_size=vocab_size,
            transformer_width=transformer_width,
            transformer_heads=transformer_heads,
            transformer_layers=transformer_layers,
            clip4clip=True
        ).float()
        self.sim_header = "meanP"
        self.loss_fct = CrossEn()
        self.apply(self.init_weights)
    def forward(self, input_ids, video, video_mask):
        input_ids = input_ids.view(-1, input_ids.shape[-1])
        video_mask = video_mask.view(-1, video_mask.shape[-1])

        # T x 3 x H x W
        video = torch.as_tensor(video).float()
        b, pair, bs, ts, channel, h, w = video.shape
        video = video.view(b * pair * bs * ts, channel, h, w)
        # video_frame = bs * ts

        sequence_output, visual_output = self.get_sequence_visual_output(input_ids, video, video_mask, shaped=True)

        if  # condition reconstructed; the source dropped it: compute the loss only in training mode
            loss = 0.
            sim_matrix, _ = self.get_similarity_logits(sequence_output, visual_output, video_mask, shaped=True)
            sim_loss1 = self.loss_fct(sim_matrix)
            sim_loss2 = self.loss_fct(sim_matrix.T)
            sim_loss = (sim_loss1 + sim_loss2) / 2
            loss += sim_loss
            return loss
        else:
            return None
    def get_sequence_output(self, input_ids, shaped=False):
        if shaped is False:
            input_ids = input_ids.view(-1, input_ids.shape[-1])

        bs_pair = input_ids.size(0)
        sequence_hidden = self.clip.encode_text(input_ids, clip4clip=True).float()
        sequence_hidden = sequence_hidden.view(bs_pair, -1, sequence_hidden.size(-1))
        return sequence_hidden

    def get_visual_output(self, video, video_mask, shaped=False):
        if shaped is False:
            video_mask = video_mask.view(-1, video_mask.shape[-1])
            video = torch.as_tensor(video).float()
            b, pair, bs, ts, channel, h, w = video.shape
            video = video.view(b * pair * bs * ts, channel, h, w)

        bs_pair = video_mask.size(0)
        visual_hidden = self.clip.encode_image(video).float()
        visual_hidden = visual_hidden.view(bs_pair, -1, visual_hidden.size(-1))
        return visual_hidden

    def get_sequence_visual_output(self, input_ids, video, video_mask, shaped=False):
        if shaped is False:
            input_ids = input_ids.view(-1, input_ids.shape[-1])
            video_mask = video_mask.view(-1, video_mask.shape[-1])
            video = torch.as_tensor(video).float()
            b, pair, bs, ts, channel, h, w = video.shape
            video = video.view(b * pair * bs * ts, channel, h, w)

        sequence_output = self.get_sequence_output(input_ids, shaped=True)
        visual_output = self.get_visual_output(video, video_mask, shaped=True)
        return sequence_output, visual_output

    def get_similarity_logits(self, sequence_output, visual_output, video_mask, shaped=False):
        if shaped is False:
            video_mask = video_mask.view(-1, video_mask.shape[-1])

        contrastive_direction = ()
        assert self.sim_header in ["meanP"]
        retrieve_logits = self._loose_similarity(sequence_output, visual_output, video_mask)
        return retrieve_logits, contrastive_direction

    def _loose_similarity(self, sequence_output, visual_output, video_mask):
        sequence_output, visual_output = sequence_output.contiguous(), visual_output.contiguous()

        visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
        visual_output = self._mean_pooling_for_similarity_visual(visual_output, video_mask)
        visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)

        sequence_output = sequence_output.squeeze(1)
        sequence_output = sequence_output / sequence_output.norm(dim=-1, keepdim=True)

        logit_scale = self.clip.logit_scale.exp()
        retrieve_logits = logit_scale * torch.matmul(sequence_output, visual_output.t())
        return retrieve_logits

    def _mean_pooling_for_similarity_visual(self, visual_output, video_mask):
        # Right-hand side reconstructed (missing in the source): the frame mask is expanded
        # along the feature dimension and cast to float before weighting the frame features.
        video_mask_un =, -1).float()
        visual_output = visual_output * video_mask_un
        video_mask_un_sum = torch.sum(video_mask_un, dim=1, dtype=torch.float)
        video_mask_un_sum[video_mask_un_sum == 0.] = 1.
        video_out = torch.sum(visual_output, dim=1) / video_mask_un_sum
        return video_out
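
# --- Illustrative sketch (not part of the original module) ---
# `forward` averages two CrossEn losses, one over the text-to-video similarity matrix and one
# over its transpose. Assuming CrossEn behaves like a standard InfoNCE-style cross-entropy with
# matched text/video pairs on the diagonal (an assumption; see
# towhee.models.clip4clip.until_module.CrossEn for the actual implementation), the training loss
# amounts to the hedged restatement below, which expects a square in-batch similarity matrix.
def _demo_symmetric_retrieval_loss(sim_matrix: torch.Tensor) -> torch.Tensor:
    # sim_matrix: (num_texts, num_videos) similarity logits; pair i is the matching pair.
    labels = torch.arange(sim_matrix.size(0), device=sim_matrix.device)
    loss_t2v = nn.functional.cross_entropy(sim_matrix, labels)      # text -> video direction
    loss_v2t = nn.functional.cross_entropy(sim_matrix.t(), labels)  # video -> text direction
    return (loss_t2v + loss_v2t) / 2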
def create_model(
        model_name: str = None,
        context_length: int = 77,
        pretrained: bool = False,
        weights_path: str = None,
        device: Union[str, torch.device] = None,
) -> CLIP4Clip:
    """
    Create a CLIP4Clip model.

    Args:
        model_name (`str`):
            Model name, `clip_vit_b16` or `clip_vit_b32`.
        context_length (`int`):
            Length of the context sequence, default is 77, the same as in CLIP.
        pretrained (`bool`):
            Whether to load pretrained weights.
        weights_path (`str`):
            Path to your locally trained weights, default is None.
        device (`str`):
            Model device, `cpu` or `cuda`.

    Returns:
        (`CLIP4Clip`):
            CLIP4Clip model.

    >>> from towhee.models import clip4clip
    >>> model = clip4clip.create_model(model_name="clip_vit_b32", context_length=32, pretrained=False, device='cpu')
    >>> model.__class__.__name__
    'CLIP4Clip'
    """
    configs = get_configs(model_name)
    if "multilingual_model" in configs:
        configs.pop("multilingual_model")
    if "url" in configs:
        configs.pop("url")
    configs["context_length"] = context_length
    model = CLIP4Clip(**configs)

    if pretrained and weights_path is not None:
        state_dict = torch.load(weights_path, map_location=device)
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        metadata = getattr(state_dict, "_metadata", None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata  # pylint: disable=protected-access

        def load(module, prefix=""):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(  # pylint: disable=protected-access
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():  # pylint: disable=protected-access
                if child is not None:
                    load(child, prefix + name + ".")

        load(model, prefix="")

        if len(missing_keys) > 0:
  "Weights of %s not initialized from pretrained model: %s",
                        model.__class__.__name__, "\n " + "\n ".join(missing_keys))
        if len(unexpected_keys) > 0:
  "Weights from pretrained model not used in %s: %s",
                        model.__class__.__name__, "\n " + "\n ".join(unexpected_keys))
        if len(error_msgs) > 0:
            logger.error("Weights from pretrained model cause errors in %s: %s",
                         model.__class__.__name__, "\n " + "\n ".join(error_msgs))
    if pretrained and weights_path is None:
        raise ValueError("weights_path is None")
    return model
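
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal, hedged example of running CLIP4Clip on dummy data. The tensor shapes below
# (batch=1, pair=1, bs=1, ts=12 frames of 3x224x224, 32-token captions) and the 49408
# vocabulary size are assumptions chosen to match the `b, pair, bs, ts, channel, h, w`
# unpacking above and the standard CLIP tokenizer; tokenization is skipped and random
# token ids are used instead.
if __name__ == "__main__":
    demo_model = create_model(model_name="clip_vit_b32", context_length=32,
                              pretrained=False, device="cpu")
    demo_model.eval()

    dummy_ids = torch.randint(0, 49408, (1, 1, 32))        # (batch, pair, context_length)
    dummy_video = torch.rand(1, 1, 1, 12, 3, 224, 224)     # (b, pair, bs, ts, c, h, w)
    dummy_mask = torch.ones(1, 1, 12, dtype=torch.long)    # one entry per sampled frame

    with torch.no_grad():
        seq_out, vis_out = demo_model.get_sequence_visual_output(dummy_ids, dummy_video, dummy_mask)
        logits, _ = demo_model.get_similarity_logits(seq_out, vis_out, dummy_mask)
    print(logits.shape)  # expected: (num_texts, num_videos), here (1, 1)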