Source code for towhee.functional.data_collection

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Iterable, Iterator, Callable
import reprlib

from towhee.functional.mixins.dag import register_dag
from towhee.hparam import param_scope, dynamic_dispatch
from towhee.functional.option import Option, Some
from towhee.functional.entity import EntityView
from towhee.functional.mixins import DCMixins
from towhee.functional.mixins.dataframe import DataFrameMixin
from towhee.functional.mixins.column import ColumnMixin


[docs]class DataCollection(Iterable, DCMixins):
    """A pythonic computation and processing framework.

    DataCollection is a pythonic computation and processing framework for unstructured
    data in machine learning and data science. It allows a data scientist or researcher
    to assemble data processing pipelines and do their model work (embedding,
    transforming, or classification) with a method-chaining style API. It is also
    designed to behave as a python list or iterator. When created from a list,
    operations arent performed once all data has been stored from previous step. When
    created from an iterator, operations are performed streamwise, reading and operating
    on data one by one, and only progressing if its previous output has been consumed.

    Examples:
        1. Create a DataCollection from list or iterator::

        >>> dc = DataCollection([0, 1, 2, 3, 4])
        >>> dc = DataCollection(iter([0, 1, 2, 3, 4]))

        2. Chaining function invocations makes your code clean and fluent::

        >>> (
        ...    dc.map(lambda x: x+1)
        ...      .map(lambda x: x*2)
        ... ).to_list()
        [2, 4, 6, 8, 10]

        3. Multi-line closures are also supported via decorator syntax::

        >>> dc = DataCollection([1,2,3,4])
        >>> @dc.map
        ... def add1(x):
        ...     return x+1
        >>> @add1.map
        ... def mul2(x):
        ...     return x *2
        >>> @mul2.filter
        ... def ge3(x):
        ...     return x>=7
        >>> ge3.to_list()
        [8, 10]
    """

[docs]    def __init__(self, iterable: Iterable) -> None:
        """Initializes a new DataCollection instance.

        Args:
            iterable (Iterable): The iterable data that is stored in the DataCollection.
        """
        super().__init__()
        self._iterable = iterable

[docs]    def __iter__(self) -> iter:
        """Generate an iterator of the DataCollection.

        Returns:
            iter : iterator for the data.
        """
        if hasattr(self._iterable, 'iterrows'):
            return (x[1] for x in self._iterable.iterrows())
        return iter(self._iterable)

[docs]    def __getattr__(self, name) -> 'DataCollection':
        """Unknown method dispatcher.

        When a unknown method is invoked on a `DataCollection` object, the function call
        will be dispatched to a method resolver. By registering function to the
        resolver, you are able to extend `DataCollection`'s API at runtime without
        modifying its code.

        Args:
            name (str): The unkown attribute.

        Returns:
            DataCollection: Returns a new DataCollection for the output of attribute
                call.

        Examples:
            >>> from towhee import register
            >>> dc = DataCollection([1,2,3,4])
            >>> @register(name='test/add1')
            ... def add1(x):
            ...     return x+1
            >>> dc.test.add1().to_list()
            [2, 3, 4, 5]
        """
        if name.startswith('_'):
            return super().__getattribute__(name)

        @dynamic_dispatch
        def wrapper(*arg, **kws):
            with param_scope() as hp:
                # pylint: disable=protected-access
                path = hp._name
                index = hp._index
            if self.get_backend() == 'ray':
                return self.ray_resolve({}, path, index, *arg, **kws)
            if self._jit is not None:
                op = self.jit_resolve(path, index, *arg, **kws)
            else:
                op = self.resolve(path, index, *arg, **kws)
            return self.map(op)

        return getattr(wrapper, name)

[docs]    def __getitem__(self, index) -> any:
        """Index based access of element in DataCollection.

        Access the element at the given index, similar to accessing `list[at_index]`.
        Does not work with streamed DataCollections.

        Args:
            index (int): The index location of the element being accessed.

        Raises:
            TypeError: If function called on streamed DataCollection

        Returns:
            any: The object at index.

        Examples:
            1. Usage with non-streamed::

                >>> dc = DataCollection([0, 1, 2, 3, 4])
                >>> dc[2]
                2

            2. Usage with streamed::

                >>> dc.stream()[1] # doctest: +NORMALIZE_WHITESPACE
                Traceback (most recent call last):
                TypeError: indexing is only supported for DataCollection created from list
                    or pandas DataFrame.
        """
        if not hasattr(self._iterable, '__getitem__'):
            raise TypeError(
                'indexing is only supported for '
                'DataCollection created from list or pandas DataFrame.')
        if isinstance(index, int):
            return self._iterable[index]
        return DataCollection(self._iterable[index])

[docs]    def __setitem__(self, index, value):
        """Index based setting of element in DataCollection.

        Assign the value of the element at the given index, similar to
        `list[at_index]=val`. Does not work with streamed DataCollections.

        Args:
            index (int): The index location of the element being set.
            val (any): The value to be set.

        Raises:
            TypeError: If function called on streamed DataCollection

        Examples:
            1. Usage with non-streamed::

                >>> dc = DataCollection([0, 1, 2, 3, 4])
                >>> dc[2] = 3
                >>> dc.to_list()
                [0, 1, 3, 3, 4]

            2. Usage with streamed::

                >>> dc.stream()[1] # doctest: +NORMALIZE_WHITESPACE
                Traceback (most recent call last):
                TypeError: indexing is only supported for DataCollection created from list
                    or pandas DataFrame.
        """
        if not hasattr(self._iterable, '__setitem__'):
            raise TypeError(
                'indexing is only supported for '
                'DataCollection created from list or pandas DataFrame.')
        self._iterable[index] = value

[docs]    @register_dag
    def __add__(self, other) -> 'DataCollection':
        """Concat two DataCollections.

        Args:
            other (DataCollection): The DataCollection being appended to the calling
                DataFrame.

        Returns:
            DataCollection: A new DataCollection of the concated DataCollections.

        Examples:
            >>> dc0 = DataCollection.range(5)
            >>> dc1 = DataCollection.range(5)
            >>> dc2 = DataCollection.range(5)
            >>> (dc0 + dc1 + dc2)
            [0, 1, 2, 3, 4, 0, ...]
        """
        self.parent_ids.append(other.id)
        other.notify_consumed(self.id)

        def inner():
            for x in self:
                yield x
            for x in other:
                yield x

        return self._factory(inner())

[docs]    def __repr__(self) -> str:
        """String representation of the DataCollection

        Returns:
            str: String repersentation of the DataCollection.

        Examples:
            1. Usage with non-streamed::

                >>> DataCollection([1, 2, 3]).unstream()
                [1, 2, 3]

            2. Usage with streamed::

                >>> DataCollection([1, 2, 3]).stream() #doctest: +ELLIPSIS
                <list_iterator object at...>
        """
        if isinstance(self._iterable, list):
            return reprlib.repr(self._iterable)
        if hasattr(self._iterable, '__repr__'):
            return repr(self._iterable)
        return super().__repr__()

    # Generation Related Function
    def _factory(self, iterable, parent_stream=True) -> 'DataCollection':
        """Factory method for Creating new DataCollections.

        This factory method has been wrapped into a `param_scope()` which contains the
        parent DataCollection's information.

        Args:
            iterable (Iterable): The data being encapsulated by the DataCollection
            parent_stream (bool, optional): Whether to use the same format of parent
                DataCollection (streamed or unstreamed). Defaults to True.

        Returns:
            DataCollection: The newly created DataCollection.
        """
        if parent_stream is True:
            if self.is_stream:
                if not isinstance(iterable, Iterator):
                    iterable = iter(iterable)
            else:
                if isinstance(iterable, Iterator):
                    iterable = list(iterable)

        with param_scope() as hp:
            hp().data_collection.parent = self
            return DataCollection(iterable)

[docs]    @staticmethod
    @register_dag
    def range(*arg, **kws) -> 'DataCollection':
        """Generate DataCollection with range of values.

        Generate DataCollection with a range of numbers as the data. Functions in same
        way as Python `range()` function.

        Returns:
            DataCollection: Returns a new DataCollection.

        Examples:
            >>> DataCollection.range(5).to_list()
            [0, 1, 2, 3, 4]

        """
        return DataCollection(range(*arg, **kws))

[docs]    def to_list(self) -> list:
        """Convert DataCollection to list.

        Returns:
            list: List of values stored in DataCollection.

        Examples:
            >>> DataCollection.range(5).to_list()
            [0, 1, 2, 3, 4]
        """
        return self._iterable if isinstance(self._iterable, list) else list(self._iterable)

[docs]    @register_dag
    def map(self, *arg) -> 'DataCollection':
        """Apply a function across all values in a DataCollection.

        Can apply multiple functions to the DataCollection. If multiple functions
        supplied, the same amount of new DataCollections will be returend.

        Args:
            *arg (Callable): One or multiple functions to apply to the DataCollection.

        Returns:
            DataCollection: New DataCollection containing computation results.

        Examples:
            1. Single Function::

                >>> dc = DataCollection([1,2,3,4])
                >>> dc.map(lambda x: x+1).map(lambda x: x*2).to_list()
                [4, 6, 8, 10]

            2. Multiple Functions::

                >>> dc = DataCollection([1,2,3,4])
                >>> a, b = dc.map(lambda x: x+1, lambda x: x*2)
                >>> (a.to_list(), b.to_list())
                ([2, 3, 4, 5], [2, 4, 6, 8])
        """
        # mmap
        if len(arg) > 1:
            return self.mmap(list(arg))
        unary_op = arg[0]

        # smap map for stateful operator
        if hasattr(unary_op, 'is_stateful') and unary_op.is_stateful:
            return self.smap(unary_op)

        # pmap
        if self.get_executor() is not None:
            return self.pmap(unary_op)

        if hasattr(self._iterable, 'map'):
            return self._factory(self._iterable.map(unary_op))

        if hasattr(self._iterable, 'apply') and hasattr(unary_op, '__dataframe_apply__'):
            return self._factory(unary_op.__dataframe_apply__(self._iterable))

        # map
        def inner(x):
            if isinstance(x, Option):
                return x.map(unary_op)
            else:
                return unary_op(x)

        result = map(inner, self._iterable)
        return self._factory(result)

[docs]    @register_dag
    def filter(self, unary_op: Callable, drop_empty=False) -> 'DataCollection':
        """Filter the DataCollection data based on function.

        Filters the DataCollection based on the function provided. If data is stored
        as an Option (see towhee.functional.option.py), drop empty will decide whether
        to remove the element or set it to empty.

        Args:
            unary_op (Callable): Function that dictates filtering.
            drop_empty (bool, optional): Whether to drop empty fields. Defaults to False.

        Returns:
            DataCollection: Resulting DataCollection after filter.
        """
        def inner(x):
            if isinstance(x, Option):
                if isinstance(x, Some):
                    return unary_op(x.get())
                return not drop_empty
            return unary_op(x)

        if hasattr(self._iterable, 'filter'):
            return self._factory(self._iterable.filter(unary_op))

        if hasattr(self._iterable, 'apply') and hasattr(unary_op, '__dataframe_filter__'):
            return DataCollection(unary_op.__dataframe_apply__(self._iterable))

        return self._factory(filter(inner, self._iterable))

[docs]    def run(self):
        """Iterate through the DataCollections data.

        Stream-based DataCollections will not run if the data is not a datasink. This
        function is a datasink that consumes the data without any operations.
        """
        for _ in self._iterable:
            pass

[docs]    def to_df(self) -> 'DataFrame':
        """Turn a DataCollection into a DataFrame.

        Returns:
            DataFrame: Resulting converted DataFrame.

        Examples:
            >>> from towhee import DataCollection, Entity
            >>> e = [Entity(a=a, b=b) for a,b in zip(['abc', 'def', 'ghi'], [1,2,3])]
            >>> dc = DataCollection(e)
            >>> type(dc)
            <class 'towhee.functional.data_collection.DataCollection'>

            >>> type(dc.to_df())
            <class 'towhee.functional.data_collection.DataFrame'>
        """
        return DataFrame(self._iterable)


[docs]class DataFrame(DataCollection, DataFrameMixin, ColumnMixin):
    """Entity based DataCollection.

    Examples:
        >>> from towhee import Entity
        >>> DataFrame([Entity(id=a) for a in [1,2,3]])
        [<Entity dict_keys(['id'])>, <Entity dict_keys(['id'])>, <Entity dict_keys(['id'])>]
    """

[docs]    def __init__(self, iterable: Iterable = None, **kws) -> None:
        """Initializes a new DataFrame instance.

        Args:
            iterable (Iterable, optional): The data to be encapsualted by the DataFrame.
                Defaults to None.
        """
        if iterable is not None:
            super().__init__(iterable)
            self._mode = self.ModeFlag.ROWBASEDFLAG
        else:
            super().__init__(DataFrame.from_arrow_talbe(**kws))
            self._mode = self.ModeFlag.COLBASEDFLAG


    def _factory(self, iterable, parent_stream=True, mode=None) -> 'DataFrame':
        """Factory method for Creating new DataFrames.

        This factory method has been wrapped into a `param_scope()` which contains the
        parent DataFrames's information.

        Args:
            iterable (Iterable): The data being encapsulated by the DataFrame
            parent_stream (bool, optional): Whether to use the same format of parent
                DataFrame (streamed or unstreamed). Defaults to True.
            mode (ModeFlag): The storage mode of the Dataframe.

        Returns:
            DataFrame: The newly created DataFrame.
        """

        # pylint: disable=protected-access
        if parent_stream is True:
            if self.is_stream:
                if not isinstance(iterable, Iterator):
                    iterable = iter(iterable)
            else:
                if isinstance(iterable, Iterator):
                    iterable = list(iterable)

        with param_scope() as hp:
            hp().data_collection.parent = self
            df = DataFrame(iterable)
            df._mode = self._mode if mode is None else mode
            return df

[docs]    def to_dc(self) -> 'DataCollection':
        """Turn a DataFrame into a DataCollection.

        Returns:
            DataCollection: Resulting DataCollection from DataFrame

        Examples:
            >>> from towhee import DataFrame, Entity
            >>> e = [Entity(a=a, b=b) for a,b in zip(['abc', 'def', 'ghi'], [1,2,3])]
            >>> df = DataFrame(e)
            >>> type(df)
            <class 'towhee.functional.data_collection.DataFrame'>

            >>> type(df.to_dc())
            <class 'towhee.functional.data_collection.DataCollection'>
        """
        return DataCollection(self._iterable)

    @property
    def mode(self):
        """Storage mode of the DataFrame.

        Return the storage mode of the DataFrame.

        Returns:
            ModeFlag: The storage format of the Dataframe.

        Examples:
            >>> from towhee import Entity, DataFrame
            >>> e = [Entity(a=a, b=b) for a,b in zip(range(5), range(5))]
            >>> df = DataFrame(e)
            >>> df.mode
            <ModeFlag.ROWBASEDFLAG: 1>

            >>> df = df.to_column()
            >>> df.mode
            <ModeFlag.COLBASEDFLAG: 2>
        """
        return self._mode

    def __iter__(self) -> iter:
        """Generate an iterator of the DataFrame.

        Returns:
            iterator: The iterator for the DataFrame.

        Examples:
            1. Row Based::

                >>> from towhee import Entity, DataFrame
                >>> e = [Entity(a=a, b=b) for a,b in zip(range(3), range(3))]
                >>> df = DataFrame(e)
                >>> df.to_list()[0]
                <Entity dict_keys(['a', 'b'])>

            2. Column Based::

                >>> df = df.to_column()
                >>> df.to_list()[0]
                <EntityView dict_keys(['a', 'b'])>

            2. Chunk Bassed::

                >>> df = DataFrame(e)
                >>> df = df.set_chunksize(2)
                >>> df.to_list()[0]
                <EntityView dict_keys(['a', 'b'])>
        """
        if hasattr(self._iterable, 'iterrows'):
            return (x[1] for x in self._iterable.iterrows())
        if self._mode == self.ModeFlag.ROWBASEDFLAG:
            return iter(self._iterable)
        if self._mode == self.ModeFlag.COLBASEDFLAG:
            return (EntityView(i, self._iterable) for i in range(len((self._iterable))))
        if self._mode == self.ModeFlag.CHUNKBASEDFLAG:
            return (ev for wtable in self._iterable.chunks() for ev in wtable)

[docs]    def map(self, *arg) -> 'DataFrame':
        """Apply a function across all values in a DataFrame.

        Args:
            *arg (Callable): One function to apply to the DataFrame.

        Returns:
            DataFrame: New DataFrame containing computation results.
        """
        if hasattr(arg[0], '__check_init__'):
            arg[0].__check_init__()
        if self._mode == self.ModeFlag.COLBASEDFLAG or self._mode == self.ModeFlag.CHUNKBASEDFLAG:
            return self.cmap(arg[0])
        else:
            return super().map(*arg)