# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from towhee.runtime.factory import HubOp
class AudioEmbedding:
    """
    `Audio embedding <https://towhee.io/tasks/detail/operator?field_name=Audio&task_name=Audio-Embedding>`_
    is a task that encodes audio's semantics into a set of real vectors. It is a fundamental task type that can be used in a
    variety of applications, including but not limited to reverse audio search and audio deduplication.

    Each class attribute below is a ``HubOp`` handle for one audio-embedding operator; the string
    literal that follows an attribute is its attribute docstring, kept for documentation tooling.
    """

    nnfp: HubOp = HubOp('audio_embedding.nnfp')
    """
    The audio embedding operator converts an input audio into a dense vector which can be used
    to represent the audio clip's semantics. Each vector represents an audio clip with a fixed length
    of around 1s. This operator generates audio embeddings with the fingerprinting method introduced by
    `Neural Audio Fingerprint <https://arxiv.org/abs/2010.11910>`_. The model is implemented in PyTorch.
    We've also trained the nnfp model with the `FMA <https://github.com/mdeff/fma>`_ dataset (& some noise audio)
    and shared weights in this operator. The nnfp operator is suitable for audio fingerprinting.

    __init__(self, model_name: str = 'nnfp_default', model_path: str = None, framework: str = 'pytorch', device: str = None)
        model_name(`str`):
            Model name to create nnfp model with different parameters, available model names: "nnfp_default", "nnfp_hop25", "nnfp_distill".
        model_path(`str`):
            The path to model. If None, it will load default model weights.
        framework(`str`):
            Default value is pytorch. (Legacy parameter)
        device(`str`):
            Device id: cpu/cuda:{GPUID}, if not set, will try to find an available GPU device.

    __call__(self, data: List[towhee.types.AudioFrame]) -> numpy.ndarray
        data(`List[towhee.types.AudioFrame]`):
            Input audio data is a list of towhee audio frames. Created by
            `ops.audio_decode.ffmpeg() <https://towhee.io/audio-decode/ffmpeg>`_.
            The audio input should be at least 1s.

    Example:

    .. code-block:: python

        from towhee import pipe, ops, DataCollection

        p = (
            pipe.input('path')
                .map('path', 'frame', ops.audio_decode.ffmpeg())
                .map('frame', 'vecs', ops.audio_embedding.nnfp(device='cpu'))
                .output('path', 'vecs')
        )

        DataCollection(p('test.wav')).show()
    """

    # NOTE(review): this hub name is spelled with a hyphen ('audio-embedding') while the
    # sibling operators use 'audio_embedding'. Left unchanged on purpose -- confirm whether
    # HubOp treats both spellings identically before normalizing it.
    vggish: HubOp = HubOp('audio-embedding.vggish')
    """
    `vggish <https://towhee.io/audio-embedding/vggish>`_ converts an input audio into a dense vector which can be used to represent
    the audio clip's semantics. Each vector represents an audio clip with a fixed length of around 0.9s.
    This operator is built on top of `VGGish <https://github.com/tensorflow/models/tree/master/research/audioset/vggish>`_
    with PyTorch. The model is a `VGG <https://arxiv.org/abs/1409.1556>`_ variant pre-trained with a large scale of
    audio dataset AudioSet. As suggested, it is suitable to extract features at high level or warm up a larger model.

    __init__(self, weights_path: str = None, framework: str = 'pytorch')
        weights_path(`str`):
            The path to model weights. If None, it will load default model weights.
        framework(`str`):
            Default value is pytorch. (Legacy parameter)

    __call__(self, data: List[towhee.types.AudioFrame]) -> numpy.ndarray
        data(`List[towhee.types.AudioFrame]`):
            Input audio data is a list of towhee audio frames. Created by
            `ops.audio_decode.ffmpeg() <https://towhee.io/audio-decode/ffmpeg>`_.
            The input data should represent an audio longer than 0.9s.

    Example:

    .. code-block:: python

        from towhee import pipe, ops

        p = (
            pipe.input('path')
                .map('path', 'frame', ops.audio_decode.ffmpeg())
                .map('frame', 'vecs', ops.audio_embedding.vggish())
                .output('vecs')
        )

        p('test.wav').get()[0]
    """

    clmr: HubOp = HubOp('audio_embedding.clmr')
    """
    `clmr <https://towhee.io/audio-embedding/clmr>`_ converts an input audio into a dense vector which can be used
    to represent the audio clip's semantics. Each vector represents an audio clip with a
    fixed length of around 2s. This operator is built on top of the original implementation of
    `CLMR <https://github.com/Spijkervet/CLMR>`_. The default model weight provided is pretrained on
    `Magnatagatune Dataset <https://paperswithcode.com/dataset/magnatagatune>`_ with SampleCNN.

    __init__(self, framework="pytorch")
        framework(`str`):
            Default value is pytorch. (Legacy parameter)

    __call__(self, data: List[towhee.types.AudioFrame]) -> numpy.ndarray
        data(`List[towhee.types.AudioFrame]`):
            Input audio data is a list of towhee audio frames. Created by
            `ops.audio_decode.ffmpeg() <https://towhee.io/audio-decode/ffmpeg>`_.
            The input data should represent an audio longer than 3s.

        Returns:
            Audio embeddings in shape (num_clips, 512). Each embedding stands for features of an audio clip with length of 2.7s.

    Example:

    .. code-block:: python

        from towhee import pipe, ops, DataCollection

        p = (
            pipe.input('path')
                .map('path', 'frame', ops.audio_decode.ffmpeg())
                .map('frame', 'vecs', ops.audio_embedding.clmr())
                .output('path', 'vecs')
        )

        DataCollection(p('./test.wav')).show()
    """

    data2vec: HubOp = HubOp('audio_embedding.data2vec')
    """
    `data2vec <https://towhee.io/audio-embedding/data2vec>`_ extracts features for audio with data2vec.
    The core idea is to predict latent representations of the full input data based on a masked view of the
    input in a self-distillation setup using a standard Transformer architecture.

    __init__(self, model_name = "facebook/data2vec-audio-base-960h")
        model_name(`str`):
            Default value is "facebook/data2vec-audio-base-960h". Available models: facebook/data2vec-audio-base-960h,
            facebook/data2vec-audio-large-960h, facebook/data2vec-audio-base, facebook/data2vec-audio-base-100h,
            facebook/data2vec-audio-base-10m, facebook/data2vec-audio-large, facebook/data2vec-audio-large-100h,
            facebook/data2vec-audio-large-10m

    __call__(self, data: List[towhee.types.AudioFrame]) -> numpy.ndarray
        data(`List[towhee.types.AudioFrame]`):
            Input audio data is a list of towhee audio frames. Created by
            `ops.audio_decode.ffmpeg() <https://towhee.io/audio-decode/ffmpeg>`_.

    Example:

    .. code-block:: python

        from towhee import pipe, ops, DataCollection

        p = (
            pipe.input('path')
                .map('path', 'frame', ops.audio_decode.ffmpeg(sample_rate=16000))
                .map('frame', 'vecs', ops.audio_embedding.data2vec(model_name='facebook/data2vec-audio-base-960h'))
                .output('path', 'vecs')
        )

        DataCollection(p('test.wav')).show()
    """

    def __call__(self, *args, **kwargs):
        """
        Create a ``HubOp`` named ``'towhee.audio_embedding'`` and call it.

        Args:
            *args: Positional arguments passed through unchanged to the ``HubOp`` call.
            **kwargs: Keyword arguments passed through unchanged to the ``HubOp`` call.

        Returns:
            Whatever calling ``HubOp('towhee.audio_embedding')`` with the given arguments returns.
        """
        return HubOp('towhee.audio_embedding')(*args, **kwargs)