Source code for towhee.models.omnivore.omnivore

# Adapted from: https://github.com/facebookresearch/omnivore
#
# All modifications are made by / Copyright 2022 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Optional, Union

import torch
from torch import nn
from torch.hub import load_state_dict_from_url

from towhee.models.video_swin_transformer.video_swin_transformer import VideoSwinTransformer



def get_all_heads(dim_in: int = 1024) -> nn.Module:
    heads = nn.ModuleDict(
        {
            "image": get_imagenet_head(dim_in),
            "rgbd": get_sunrgbd_head(dim_in),
            "video": get_kinetics_head(dim_in),
        }
    )
    return heads


def get_imagenet_head(dim_in: int = 1024) -> nn.Module:
    head = nn.Linear(in_features=dim_in, out_features=1000, bias=True)
    return head


def get_sunrgbd_head(dim_in: int = 1024) -> nn.Module:
    head = nn.Linear(in_features=dim_in, out_features=19, bias=True)
    return head


def get_kinetics_head(dim_in: int = 1024, num_classes: int = 400) -> nn.Module:
    head = nn.Linear(in_features=dim_in, out_features=num_classes, bias=True)
    return nn.Sequential(nn.Dropout(p=0.5), head)
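

# Illustrative sketch (not part of the original module): the default heads built by
# get_all_heads() form an nn.ModuleDict with one linear classifier per input type.
#
#     >>> heads = get_all_heads(dim_in=1024)
#     >>> sorted(heads.keys())
#     ['image', 'rgbd', 'video']
#     >>> heads["image"].out_features  # ImageNet-1k classes
#     1000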


class OmnivoreModel(nn.Module):
    """
    Omnivore model: a single shared trunk followed by one classification head per
    input type.

    Args:
        trunk: nn.Module of the SwinTransformer3D trunk
        heads: a single head module, or an nn.ModuleDict with one head for each of
            "image", "video" and "rgbd"
    """

    def __init__(self, trunk: nn.Module, heads: Union[nn.ModuleDict, nn.Module]):
        super().__init__()
        self.trunk = trunk
        self.heads = heads
        self.types = ["image", "video", "rgbd"]
        self.multimodal_model = False
        if isinstance(heads, nn.ModuleDict):
            self.multimodal_model = True
            assert all(n in heads for n in self.types), "All heads must be provided"

    def head(self, features: torch.Tensor, input_type: Optional[str] = None):
        head_in = self.heads
        if self.multimodal_model:
            assert input_type in self.types, "unsupported input type"
            head_in = head_in[input_type]
        return head_in(features)

    def forward_features(self, x: torch.Tensor):
        assert x.ndim == 5
        x = self.trunk(x)
        # Global average pooling over the temporal and spatial dimensions
        features = torch.mean(x, [-3, -2, -1])
        return features

    def forward(self, x: torch.Tensor, input_type: Optional[str] = None):
        """
        Args:
            x: input to the model of shape 1 x C x T x H x W
            input_type: Optional[str], one of ["image", "video", "rgbd"];
                required if self.multimodal_model is True
        Returns:
            preds: tensor of shape (1, num_classes)
        """
        features = self.forward_features(x)
        return self.head(features, input_type=input_type)
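

# Illustrative usage sketch (not part of the original module): a single clip is routed
# through the shared trunk and then through the head selected by input_type. The
# shapes below assume the Swin-B model built by omnivore_swinb() further down.
#
#     >>> model = omnivore_swinb(pretrained=False)
#     >>> video = torch.randn(1, 3, 32, 224, 224)    # 1 x C x T x H x W
#     >>> preds = model(video, input_type="video")   # Kinetics-400 head -> (1, 400)
#     >>> image = torch.randn(1, 3, 1, 224, 224)     # images are clips with T=1
#     >>> logits = model(image, input_type="image")  # ImageNet-1k head -> (1, 1000)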


CHECKPOINT_PATHS = {
    "omnivore_swinT": "https://dl.fbaipublicfiles.com/omnivore/models/swinT_checkpoint.torch",
    "omnivore_swinS": "https://dl.fbaipublicfiles.com/omnivore/models/swinS_checkpoint.torch",
    "omnivore_swinB": "https://dl.fbaipublicfiles.com/omnivore/models/swinB_checkpoint.torch",
    "omnivore_swinB_in21k": "https://dl.fbaipublicfiles.com/omnivore/models/swinB_In21k_checkpoint.torch",
    "omnivore_swinL_in21k": "https://dl.fbaipublicfiles.com/omnivore/models/swinL_In21k_checkpoint.torch",
    "omnivore_swinB_epic": "https://dl.fbaipublicfiles.com/omnivore/models/swinB_epic_checkpoint.torch",
}


def _omnivore_base(
    trunk: nn.Module,
    heads: Optional[Union[nn.Module, nn.ModuleDict]] = None,
    head_dim_in: int = 1024,
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    checkpoint_name: str = "omnivore_swinB",
) -> nn.Module:
    """
    Load and initialize the specified Omnivore model trunk (and optionally heads).

    Args:
        trunk: nn.Module of the SwinTransformer3D trunk
        heads: provide the heads module if using a custom model. If not provided,
            image/video/rgbd heads are added corresponding to the Omnivore base model.
        head_dim_in: only needs to be set if heads is None; used as the input dim
            of the default base model heads.
        pretrained: if True, loads the checkpoint selected by checkpoint_name
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False, loads only the trunk.
        checkpoint_name: key into CHECKPOINT_PATHS selecting the checkpoint to load

    Returns:
        model: nn.Module of the full Omnivore model
    """
    if load_heads and heads is None:
        # Get heads
        heads = get_all_heads(dim_in=head_dim_in)

    if pretrained:
        path = CHECKPOINT_PATHS[checkpoint_name]
        # All models are loaded onto CPU by default
        checkpoint = load_state_dict_from_url(
            path, progress=progress, map_location="cpu"
        )
        trunk.fc_cls = nn.Sequential()
        trunk.load_state_dict(checkpoint["trunk"])
        if load_heads:
            heads.load_state_dict(checkpoint["heads"])

    if load_heads:
        model = OmnivoreModel(trunk=trunk, heads=heads)
    else:
        model = trunk

    return model


def omnivore_swinb_epic(
    pretrained: bool = True,
    progress: bool = True,
    checkpoint_name: str = "omnivore_swinB_epic",
    **kwargs: Any,
) -> nn.Module:
    r"""
    Omnivore Swin B model trained on the EPIC-KITCHENS-100 dataset

    Args:
        pretrained: if True, loads weights from the EPIC-KITCHENS-100 checkpoint
        progress: print progress of loading checkpoint
    Returns:
        model: nn.Module of the omnivore model
    """
    # Only specify the non default values
    trunk = VideoSwinTransformer(
        pretrained2d=False,
        patch_size=(2, 4, 4),
        embed_dim=128,
        depths=(2, 2, 18, 2),
        num_heads=(4, 8, 16, 32),
        window_size=(16, 7, 7),
        drop_path_rate=0.4,
        patch_norm=True,
        depth_mode="summed_rgb_d_tokens",
        **kwargs,
    )
    heads = nn.Sequential(
        nn.Dropout(p=0.5),
        nn.Linear(in_features=1024, out_features=3806, bias=True),
    )
    return _omnivore_base(
        trunk=trunk,
        head_dim_in=1024,  # embed_dim * 8 = 128*8
        progress=progress,
        pretrained=pretrained,
        load_heads=True,
        checkpoint_name=checkpoint_name,
        heads=heads,
    )


def omnivore_swinb(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    checkpoint_name: str = "omnivore_swinB",
    **kwargs: Any,
) -> nn.Module:
    r"""
    Omnivore model trunk: Swin B patch (2,4,4) window (16,7,7)

    Args:
        pretrained: if True loads weights from model trained on Imagenet 1k,
            Kinetics 400, SUN RGBD.
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    # Only specify the non default values
    trunk = VideoSwinTransformer(
        pretrained2d=False,
        patch_size=(2, 4, 4),
        embed_dim=128,
        depths=(2, 2, 18, 2),
        num_heads=(4, 8, 16, 32),
        window_size=(16, 7, 7),
        drop_path_rate=0.3,  # TODO: set this based on the final models
        patch_norm=True,  # Make this the default value?
        depth_mode="summed_rgb_d_tokens",
        **kwargs,
    )
    return _omnivore_base(
        trunk=trunk,
        head_dim_in=1024,  # embed_dim * 8 = 128*8
        progress=progress,
        pretrained=pretrained,
        load_heads=load_heads,
        checkpoint_name=checkpoint_name,
    )
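

# Illustrative sketch (not part of the original module): with load_heads=False the
# helpers return only the SwinTransformer3D trunk, which can be used as a feature
# extractor with the same pooling done in OmnivoreModel.forward_features.
#
#     >>> trunk = omnivore_swinb(pretrained=False, load_heads=False)
#     >>> feats = trunk(torch.randn(1, 3, 32, 224, 224))  # B x 1024 x T' x H' x W'
#     >>> pooled = torch.mean(feats, [-3, -2, -1])         # B x 1024 feature vector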


def omnivore_swinb_imagenet21k(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    **kwargs: Any,
) -> nn.Module:
    r"""
    Omnivore Swin B model pretrained on Imagenet 1k, Imagenet 21k, Kinetics 400,
    SUN RGBD. By default the pretrained weights will be loaded.

    Args:
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    return omnivore_swinb(
        pretrained=pretrained,
        load_heads=load_heads,
        progress=progress,
        checkpoint_name="omnivore_swinB_in21k",
        **kwargs,
    )


def omnivore_swins(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    **kwargs: Any,
) -> nn.Module:
    r"""
    Omnivore model trunk: Swin S patch (2,4,4) window (8,7,7)

    Args:
        pretrained: if True loads weights from model trained on Imagenet 1k,
            Kinetics 400, SUN RGBD.
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    # Only specify the non default values
    trunk = VideoSwinTransformer(
        pretrained2d=False,
        patch_size=(2, 4, 4),
        embed_dim=96,
        depths=(2, 2, 18, 2),
        num_heads=(3, 6, 12, 24),
        window_size=(8, 7, 7),
        drop_path_rate=0.3,
        patch_norm=True,
        depth_mode="summed_rgb_d_tokens",
        **kwargs,
    )
    return _omnivore_base(
        trunk=trunk,
        head_dim_in=768,  # 96*8
        progress=progress,
        pretrained=pretrained,
        load_heads=load_heads,
        checkpoint_name="omnivore_swinS",
    )


def omnivore_swint(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    **kwargs: Any,
) -> nn.Module:
    r"""
    Omnivore model trunk: Swin T patch (2,4,4) window (8,7,7)

    Args:
        pretrained: if True loads weights from model trained on Imagenet 1k,
            Kinetics 400, SUN RGBD.
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    # Only specify the non default values
    trunk = VideoSwinTransformer(
        pretrained2d=False,
        patch_size=(2, 4, 4),
        embed_dim=96,
        depths=(2, 2, 6, 2),
        num_heads=(3, 6, 12, 24),
        window_size=(8, 7, 7),
        drop_path_rate=0.2,
        patch_norm=True,
        depth_mode="summed_rgb_d_tokens",
        **kwargs,
    )
    return _omnivore_base(
        trunk=trunk,
        head_dim_in=768,  # 96*8
        progress=progress,
        pretrained=pretrained,
        load_heads=load_heads,
        checkpoint_name="omnivore_swinT",
    )


def _omnivore_swinl(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    checkpoint_name: str = "",
    heads: Optional[nn.ModuleDict] = None,
    **kwargs: Any,
) -> nn.Module:
    """
    Omnivore model trunk: Swin L patch (2,4,4) window (8,7,7)

    Args:
        pretrained: if True loads weights from model trained on Imagenet 1k,
            Kinetics 400, SUN RGBD.
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    assert checkpoint_name != "", "checkpoint_name must be provided"
    # Only specify the non default values
    trunk = VideoSwinTransformer(
        pretrained2d=False,
        patch_size=(2, 4, 4),
        embed_dim=192,
        depths=(2, 2, 18, 2),
        num_heads=(6, 12, 24, 48),
        window_size=(8, 7, 7),
        drop_path_rate=0.3,
        patch_norm=True,
        depth_mode="summed_rgb_d_tokens",
        **kwargs,
    )
    return _omnivore_base(
        trunk=trunk,
        heads=heads,
        head_dim_in=1536,  # 192*8
        progress=progress,
        pretrained=pretrained,
        load_heads=load_heads,
        checkpoint_name=checkpoint_name,
    )


def omnivore_swinl_imagenet21k(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    **kwargs: Any,
) -> nn.Module:
    r"""
    Swin L patch (2,4,4) window (8,7,7) pretrained on Imagenet 1k, Imagenet 21k,
    Kinetics 400, SUN RGBD. By default the pretrained weights will be loaded.

    Args:
        pretrained: if True loads weights from model trained on Imagenet 1k,
            Imagenet 21k, Kinetics 400, SUN RGBD.
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    return _omnivore_swinl(
        pretrained=pretrained,
        progress=progress,
        load_heads=load_heads,
        checkpoint_name="omnivore_swinL_in21k",
        **kwargs,
    )


def omnivore_swinl_kinetics600(
    pretrained: bool = True,
    progress: bool = True,
    load_heads: bool = True,
    **kwargs: Any,
) -> nn.Module:
    r"""
    Swin L patch (2,4,4) window (8,7,7) trained with Kinetics 600

    Args:
        pretrained: if True loads weights from model trained on Imagenet 1k,
            Kinetics 600, SUN RGBD.
        progress: print progress of loading checkpoint
        load_heads: if True, loads the 3 heads, one each for image/video/rgbd
            prediction. If False loads only the trunk.
    Returns:
        model: nn.Module of the omnivore model
    """
    heads = nn.ModuleDict(
        {
            "image": get_imagenet_head(dim_in=1536),
            "rgbd": get_sunrgbd_head(dim_in=1536),
            "video": get_kinetics_head(dim_in=1536, num_classes=600),
        }
    )
    return _omnivore_swinl(
        pretrained=pretrained,
        progress=progress,
        load_heads=load_heads,
        checkpoint_name="omnivore_swinL_kinetics600",
        heads=heads,
        **kwargs,
    )


def create_model(
    model_name: Optional[str] = None,
    pretrained: bool = True,
    device: Optional[str] = None,
):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if model_name == "omnivore_swinT":
        model = omnivore_swint(pretrained=pretrained)
    elif model_name == "omnivore_swinS":
        model = omnivore_swins(pretrained=pretrained)
    elif model_name == "omnivore_swinB":
        model = omnivore_swinb(pretrained=pretrained)
    elif model_name == "omnivore_swinB_in21k":
        model = omnivore_swinb_imagenet21k(pretrained=pretrained)
    elif model_name == "omnivore_swinL_in21k":
        model = omnivore_swinl_imagenet21k(pretrained=pretrained)
    elif model_name == "omnivore_swinB_epic":
        model = omnivore_swinb_epic(pretrained=pretrained)
    else:
        raise AttributeError(f"Invalid model_name {model_name}.")
    model.to(device)
    return model
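

# Illustrative usage sketch (not part of the original module): create_model() is a
# convenience entry point; the supported model names are keys of CHECKPOINT_PATHS.
#
#     >>> model = create_model("omnivore_swinT", pretrained=False, device="cpu")
#     >>> clip = torch.randn(1, 3, 32, 224, 224)   # 1 x C x T x H x W
#     >>> model(clip, input_type="video").shape
#     torch.Size([1, 400])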