# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from typing import Any
from towhee.datacollection.entity import Entity
from towhee.datacollection.mixins.display import DisplayMixin
# pylint: disable=protected-access
class DataCollection(DisplayMixin):
    """
    A pythonic computation and processing framework.

    DataCollection is a pythonic computation and processing framework for unstructured
    data in machine learning and data science. It allows a data scientist or researcher
    to assemble data processing pipelines and do their model work (embedding,
    transforming, or classification) with a method-chaining style API.

    Internally the rows are held as a plain list of Entity objects, alongside the
    column names (``_schema``) and column types (``_type_schema``) read from the
    source queue.

    Args:
        data ('towhee.runtime.DataQueue'):
            The data to be stored in DataCollection in the form of DataQueue.

    Examples:
        >>> from towhee.runtime.data_queue import DataQueue, ColumnType
        >>> from towhee.datacollection.data_collection import DataCollection
        >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
        >>> dq.put(('a', 'b1'))
        True
        >>> DataCollection(dq)
        <DataCollection Schema[a: ColumnType.SCALAR, b: ColumnType.QUEUE] SIZE 1>
    """

    def __init__(self, data):
        """
        Build the collection from a DataQueue-like object.

        Reads ``data.schema`` and ``data.type_schema``, then calls ``data.get()``
        once per row reported by ``data.size`` and wraps each row in an Entity
        keyed by the schema names.

        Args:
            data ('towhee.runtime.DataQueue'):
                The source of the schema and of the rows.
        """
        self._schema = data.schema
        self._type_schema = data.type_schema
        # `data.size` is evaluated once, before the first `get()`, so the number of
        # rows read is fixed up front.
        self._iterable = [Entity.from_dict(dict(zip(self._schema, data.get()))) for _ in range(data.size)]

    def __iter__(self):
        """
        Iterate the DataCollection in the form of Entity.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq.put(('a', 'b1'))
            True
            >>> dq.put(('a', 'b2'))
            True
            >>> dc = DataCollection(dq)
            >>> [i for i in dc]
            [<Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>]
        """
        return iter(self._iterable)

    def __getitem__(self, index: int):
        """
        Get the item with given index.

        The lookup is delegated to the underlying list, so an out-of-range index
        raises IndexError.

        Args:
            index (int):
                Position of the item to return.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq.put(('a', 'b1'))
            True
            >>> dc = DataCollection(dq)
            >>> dc[0]
            <Entity dict_keys(['a', 'b'])>
        """
        return self._iterable[index]

    def __setitem__(self, index: int, value: Any) -> None:
        """
        Set the item to given value.

        The value is stored as-is; it is not validated against the schema or
        converted to an Entity.

        Args:
            index (int):
                Position of the item to replace.
            value (Any):
                The new item.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq.put(('a', 'b1'))
            True
            >>> dc = DataCollection(dq)
            >>> dc[0] = 'a'
            >>> dc[0]
            'a'
        """
        self._iterable[index] = value

    def __repr__(self) -> str:
        """
        String representation of the DataCollection.

        Shows the class name, each column as ``name: type`` and the number of items.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dc = DataCollection(dq)
            >>> repr(dc)
            '<DataCollection Schema[a: ColumnType.SCALAR, b: ColumnType.QUEUE] SIZE 0>'
        """
        # `!s` forces str() on the column type so the rendering does not depend on
        # the type's own __format__.
        content = ', '.join(
            f'{name}: {col_type!s}' for name, col_type in zip(self._schema, self._type_schema)
        )
        return f'<{self.__class__.__name__} Schema[{content}] SIZE {len(self)}>'

    def __len__(self) -> int:
        """
        Return the number of entities in the DataCollection.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dc = DataCollection(dq)
            >>> len(dc)
            0
        """
        return len(self._iterable)

    def __add__(self, another: 'DataCollection') -> 'DataCollection':
        """
        Concat two DataCollections with same Schema.

        Returns a new DataCollection that carries this collection's schema and
        holds the items of both operands, in order. Neither operand is modified;
        the items themselves are shared with the operands, not copied. The schemas
        are not compared, so passing a matching one is the caller's responsibility.

        Args:
            another ('DataCollection'):
                Another DataCollection to concat.

        Returns:
            DataCollection:
                The concatenated collection.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq1 = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq2 = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq1.put(('a', 'b1'))
            True
            >>> dq2.put(('a', 'b2'))
            True
            >>> dc1 = DataCollection(dq1)
            >>> dc2 = DataCollection(dq2)
            >>> len(dc1)
            1
            >>> len(dc2)
            1
            >>> len(dc1 + dc2)
            2
        """
        # A shallow copy is enough for the container: the item list is replaced
        # below, so deep-copying every entity first would be discarded work (and
        # would fail for entities holding objects that cannot be deep-copied).
        new = copy.copy(self)
        # Keep the result's schema independent of this collection's.
        new._schema = copy.deepcopy(self._schema)
        new._type_schema = copy.deepcopy(self._type_schema)
        new._iterable = self._iterable + another._iterable
        return new

    def to_list(self) -> list:
        """
        Convert DataCollection to list.

        Returns a new list containing the same items, in iteration order.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq.put(('a', 'b1'))
            True
            >>> dc = DataCollection(dq)
            >>> dc.to_list()
            [<Entity dict_keys(['a', 'b'])>]
        """
        return list(self)

    def copy(self, deep: bool = False):
        """
        Copy a DataCollection.

        Args:
            deep (bool):
                If True, return ``copy.deepcopy(self)``, whose items are independent
                copies. If False (the default), return ``copy.copy(self)``, whose
                items are the same objects as the original's.

        Returns:
            DataCollection:
                The copied collection.

        Examples:
            >>> from towhee.runtime.data_queue import DataQueue, ColumnType
            >>> from towhee.datacollection.data_collection import DataCollection
            >>> dq = DataQueue([('a', ColumnType.SCALAR), ('b', ColumnType.QUEUE)])
            >>> dq.put(('a', 'b1'))
            True
            >>> dc = DataCollection(dq)
            >>> dc_copy = dc.copy()
            >>> dc_dcopy = dc.copy(True)
            >>> id(dc) == id(dc_copy)
            False
            >>> id(dc[0]) == id(dc_copy[0])
            True
            >>> id(dc) == id(dc_dcopy)
            False
            >>> id(dc[0]) == id(dc_dcopy[0])
            False
        """
        if deep:
            return copy.deepcopy(self)
        # NOTE(review): with the default copy protocol the shallow copy also shares
        # the underlying item list, so item assignment on one collection is visible
        # through the other -- confirm DisplayMixin does not override __copy__.
        return copy.copy(self)