Source code for towhee.functional.mixins.dataframe

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

from typing import Dict, Any, Optional, Set, Union, List

from towhee.functional.entity import Entity
from towhee.hparam import dynamic_dispatch, param_scope
# pylint: disable=protected-access


class DataFrameMixin:
    """
    Mixin to help deal with Entity.

    The host class is expected to provide ``self._iterable`` (the wrapped
    data), ``self._factory`` (builds a new collection from an iterable) and
    ``self.map`` (applies a function to every element).

    Examples:

    1. define an operator with `register` decorator

    >>> from towhee import register
    >>> from towhee import DataFrame
    >>> @register
    ... def add_1(x):
    ...     return x+1

    2. apply the operator to named field of entity and save result to another named field

    >>> (
    ...     DataFrame([dict(a=1, b=2), dict(a=2, b=3)])
    ...         .as_entity()
    ...         .add_1['a', 'c']() # <-- use field `a` as input and field `c` as output
    ...         .as_str()
    ...         .to_list()
    ... )
    ["{'a': 1, 'b': 2, 'c': 2}", "{'a': 2, 'b': 3, 'c': 3}"]

    Select the entity on the specified fields.

    Examples:

    1. Select the entity on one specified field:

    >>> from towhee import Entity
    >>> from towhee import DataFrame
    >>> df = DataFrame([Entity(a=i, b=i, c=i) for i in range(2)])
    >>> df.select['a']().to_list()
    [<Entity dict_keys(['a'])>, <Entity dict_keys(['a'])>]

    2. Select multiple fields and unpack the entity:

    >>> (
    ...     DataFrame([Entity(a=i, b=i, c=i) for i in range(5)])
    ...         .select['a', 'b']()
    ...         .as_raw()
    ...         .to_list()
    ... )
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]

    3. Another field selection syntax (not suggested):

    >>> (
    ...     DataFrame([Entity(a=i, b=i, c=i) for i in range(5)])
    ...         .select('a', 'b')
    ...         .as_raw()
    ...         .to_list()
    ... )
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]
    """

    def __init__(self):  # pylint: disable=useless-super-delegation
        super().__init__()

    @property
    def select(self):
        """
        Select columns from a DC.

        The returned selector accepts the column names either through the
        index syntax (``select['a', 'b']()``) or as positional arguments
        (``select('a', 'b')``); the index syntax takes precedence.

        Examples:

        >>> from towhee import Entity, DataFrame
        >>> entities = [Entity(a=i, b=i, c=i) for i in range(3)]
        >>> dc = DataFrame(entities)
        >>> dc.select('a')
        [<Entity dict_keys(['a'])>, <Entity dict_keys(['a'])>, <Entity dict_keys(['a'])>]
        """

        @dynamic_dispatch
        def selector(*arg):
            index = param_scope()._index
            # A single name given as `select['a']` arrives as a bare string.
            if isinstance(index, str):
                index = (index, )
            # Fall back to positional names only when no index was supplied.
            if index is None and arg:
                index = arg

            def inner(entity: 'Entity'):
                if index is None:
                    return entity
                return Entity(**{col: getattr(entity, col) for col in index})

            return self.map(inner)

        return selector

    # pylint: disable=invalid-name
    def fill_entity(self, _DefaultKVs: Optional[Dict[str, Any]] = None, _ReplaceNoneValue: bool = False, **kws):
        """
        When DataFrame's iterable consists of Entities and some indexes are missing,
        fill default value for those indexes.

        Entities are updated in place. Pairs in `_DefaultKVs` override keyword
        arguments of the same name.

        Args:
            _ReplaceNoneValue (`bool`):
                Whether to replace None in Entity's value.
            _DefaultKVs (`Dict[str, Any]`):
                The key-value pairs stored in a dict.

        Examples:

        >>> from towhee import Entity, DataFrame
        >>> entities = [Entity(num=i) for i in range(3)]
        >>> df = DataFrame(entities)
        >>> df
        [<Entity dict_keys(['num'])>, <Entity dict_keys(['num'])>, <Entity dict_keys(['num'])>]

        >>> kvs = {'foo': 'bar'}
        >>> df.fill_entity(kvs).fill_entity(usage='test').to_list()
        [<Entity dict_keys(['num', 'foo', 'usage'])>, <Entity dict_keys(['num', 'foo', 'usage'])>, <Entity dict_keys(['num', 'foo', 'usage'])>]

        >>> kvs = {'FOO': None}
        >>> df.fill_entity(_ReplaceNoneValue=True, _DefaultKVs=kvs).to_list()[0].FOO
        0
        """
        if _DefaultKVs:
            kws.update(_DefaultKVs)

        def fill(entity: 'Entity'):
            for key, default in kws.items():
                if not hasattr(entity, key):
                    setattr(entity, key, default)
                # NOTE(review): a None default combined with _ReplaceNoneValue
                # writes 0 even when the entity already holds a value for the
                # key -- kept as-is; confirm whether that is intended.
                if _ReplaceNoneValue and default is None:
                    setattr(entity, key, 0)
            return entity

        return self._factory(map(fill, self._iterable))

    def as_entity(self, schema: Optional[List[str]] = None):
        """
        Convert elements into Entities.

        Args:
            schema (Optional[List[str]]):
                schema contains field names. When omitted, every element must
                be a mapping that can be unpacked into keyword arguments.

        Examples:

        1. convert dicts into entities:

        >>> from towhee import DataFrame
        >>> (
        ...     DataFrame([dict(a=1, b=2), dict(a=2, b=3)])
        ...         .as_entity()
        ...         .as_str()
        ...         .to_list()
        ... )
        ["{'a': 1, 'b': 2}", "{'a': 2, 'b': 3}"]

        2. convert tuples into entities:

        >>> from towhee import DataFrame
        >>> (
        ...     DataFrame([(1, 2), (2, 3)])
        ...         .as_entity(schema=['a', 'b'])
        ...         .as_str()
        ...         .to_list()
        ... )
        ["{'a': 1, 'b': 2}", "{'a': 2, 'b': 3}"]

        3. convert single value into entities:

        >>> from towhee import DataFrame
        >>> (
        ...     DataFrame([1, 2])
        ...         .as_entity(schema=['a'])
        ...         .as_str()
        ...         .to_list()
        ... )
        ["{'a': 1}", "{'a': 2}"]
        """
        if schema is None:

            def inner(x):
                return Entity(**x)
        else:

            def inner(x):
                # A one-field schema means each element is the bare value.
                if len(schema) == 1:
                    x = (x, )
                return Entity(**dict(zip(schema, x)))

        return self._factory(map(inner, self._iterable))

    def parse_json(self):
        """
        Parse string to entities.

        Examples:

        >>> from towhee import DataFrame
        >>> df = (
        ...     DataFrame(['{"x": 1}'])
        ...         .parse_json()
        ... )
        >>> df[0].x
        1
        """

        def inner(text):
            return Entity(**json.loads(text))

        return self.map(inner)

    def as_json(self):
        """
        Convert entities to json.

        Examples:

        >>> from towhee import DataFrame, Entity
        >>> (
        ...     DataFrame([Entity(x=1)])
        ...         .as_json()
        ... )
        ['{"x": 1}']
        """

        def inner(entity):
            return json.dumps(entity.__dict__)

        return self.map(inner)

    def as_raw(self):
        """
        Convert entities into raw python values.

        An entity with exactly one field becomes that bare value; any other
        entity becomes a tuple of its field values.

        Examples:

        1. unpack multiple values from entities:

        >>> from towhee import DataFrame
        >>> (
        ...     DataFrame([(1, 2), (2, 3)])
        ...         .as_entity(schema=['a', 'b'])
        ...         .as_raw()
        ...         .to_list()
        ... )
        [(1, 2), (2, 3)]

        2. unpack single value from entities:

        >>> (
        ...     DataFrame([1, 2])
        ...         .as_entity(schema=['a'])
        ...         .as_raw()
        ...         .to_list()
        ... )
        [1, 2]
        """

        def inner(entity):
            fields = entity.__dict__
            if len(fields) == 1:
                return next(iter(fields.values()))
            return tuple(getattr(entity, name) for name in fields)

        return self.map(inner)

    def replace(self, **kws):
        """
        Replace specific attributes with given values.

        Each keyword names an attribute and maps to a dict of
        ``{old_value: new_value}``. Entities are updated in place. Values that
        are not hashable cannot be keys of the mapping and are left unchanged.

        Raises:
            AttributeError: if an entity lacks one of the named attributes.

        Examples:

        >>> from towhee import Entity, DataFrame
        >>> entities = [Entity(num=i) for i in range(5)]
        >>> df = DataFrame(entities)
        >>> [i.num for i in df]
        [0, 1, 2, 3, 4]

        >>> df = df.replace(num={0: 1, 1: 2, 2: 3, 3: 4, 4: 5})
        >>> [i.num for i in df]
        [1, 2, 3, 4, 5]
        """

        def inner(entity: 'Entity'):
            for index, convert_dict in kws.items():
                origin_value = getattr(entity, index)
                try:
                    matched = origin_value in convert_dict
                except TypeError:
                    # Unhashable values (list, dict, ndarray, ...) can never
                    # be dict keys, so there is nothing to replace.
                    matched = False
                if matched:
                    setattr(entity, index, convert_dict[origin_value])
            return entity

        return self._factory(map(inner, self._iterable))

    def dropna(self, na: Set[str] = {'', None}) -> Union[bool, 'DataFrame']:  # pylint: disable=dangerous-default-value
        """
        Drop entities that contain some specific values.

        Field values that are not hashable (lists, dicts, arrays, ...) cannot
        be members of a set and are therefore never treated as NA.

        Args:
            na (`Set[str]`):
                Those entities contain values in na will be dropped.

        Examples:

        >>> from towhee import Entity, DataFrame
        >>> entities = [Entity(a=i, b=i + 1) for i in range(3)]
        >>> entities.append(Entity(a=3, b=''))
        >>> df = DataFrame(entities)
        >>> df
        [<Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>]

        >>> df.dropna()
        [<Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>]
        """

        # The default `na` set is shared between calls but is only ever read.
        def inner(entity: 'Entity'):
            for val in entity.__dict__.values():
                try:
                    if val in na:
                        return False
                except TypeError:
                    # Membership in a set hashes `val`; an unhashable value
                    # used to crash here. It cannot be in the set, so skip it.
                    continue
            return True

        return self._factory(filter(inner, self._iterable))

    def rename(self, column: Dict[str, str]):
        """
        Rename a column in DataFrame.

        All old values are read before any new name is written, so swaps
        (``{'a': 'b', 'b': 'a'}``) and chains (``{'a': 'b', 'b': 'c'}``) keep
        every value. Entities are updated in place.

        Args:
            column (`Dict[str, str]`):
                The columns to rename and their corresponding new name.

        Raises:
            KeyError: if an entity lacks one of the columns to rename; that
                entity is left unmodified.

        Examples:

        >>> from towhee import Entity, DataFrame
        >>> entities = [Entity(a=i, b=i + 1) for i in range(3)]
        >>> df = DataFrame(entities)
        >>> df
        [<Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>, <Entity dict_keys(['a', 'b'])>]

        >>> df.rename(column={'a': 'A', 'b': 'B'})
        [<Entity dict_keys(['A', 'B'])>, <Entity dict_keys(['A', 'B'])>, <Entity dict_keys(['A', 'B'])>]
        """

        def inner(x):
            fields = x.__dict__
            # Phase 1: look up every old value (raises before any mutation).
            moved = {new: fields[old] for old, new in column.items()}
            # Phase 2: remove the old names, then install the new ones.
            for old in column:
                del fields[old]
            fields.update(moved)
            return x

        return self._factory(map(inner, self._iterable))

    @property
    def df(self):
        """
        Return the underlying pandas DataFrame.

        Raises:
            TypeError: if the wrapped iterable is not a pandas DataFrame.
        """
        # pylint: disable=import-outside-toplevel
        import pandas as pd
        if isinstance(self._iterable, pd.DataFrame):
            return self._iterable
        raise TypeError(
            'data collection is not created from pandas DataFrame.')