Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of group_by+mutate #242

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion tidypolars/funs.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"is_nan", "is_not", "is_not_in", "is_not_null", "is_null",

# Type conversion
"as_boolean", "as_float", "as_integer", "as_string",
"as_boolean", "as_float", "as_integer", "as_string", "as_factor",
"cast"
]

Expand Down Expand Up @@ -131,6 +131,22 @@ def as_string(x):
x = _col_expr(x)
return x.cast(pl.Utf8)

def as_factor(x, levels):
    """
    Convert a column to an enumerated categorical (factor) with the
    given, ordered set of levels.

    Parameters
    ----------
    x : str, Expr
        Column to operate on
    levels : list
        Allowed category levels, in order; the column is cast to
        pl.Enum(levels)

    Examples
    --------
    >>> df.mutate(factor_x = tp.as_factor('x', ['a', 'b']))
    """
    x = _col_expr(x)
    return x.cast(pl.Enum(levels))

def abs(x):
"""
Absolute value
Expand Down
167 changes: 141 additions & 26 deletions tidypolars/tibble.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from .reexports import *
from .tidyselect import everything
from operator import not_
# to suppress polars' warning about dtype in the nested data operations
import warnings
warnings.filterwarnings("ignore", category=pl.exceptions.MapWithoutReturnDtypeWarning)

__all__ = [
"Tibble",
Expand Down Expand Up @@ -61,24 +64,24 @@ def __str__(self):
df = self.to_polars()
return df.__str__()

def __getattribute__(self, attr):
    # Hide selected inherited polars DataFrame methods from the Tibble
    # API: names listed in _polars_methods raise AttributeError instead
    # of resolving on the polars base class.
    if attr in _polars_methods:
        raise AttributeError
    return pl.DataFrame.__getattribute__(self, attr)

def __dir__(self):
    # Advertise only the tidypolars verbs during introspection /
    # tab-completion, hiding the inherited polars DataFrame surface.
    _tidypolars_methods = [
        'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
        'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
        'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
        'full_join', 'pivot_longer', 'pivot_wider',
        'pull', 'relocate', 'rename', 'replace_null', 'select',
        'separate', 'set_names',
        'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
        'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
    ]
    return _tidypolars_methods

# def __getattribute__(self, attr):
dafxy marked this conversation as resolved.
Show resolved Hide resolved
# if attr in _polars_methods:
# raise AttributeError
# return pl.DataFrame.__getattribute__(self, attr)

# def __dir__(self):
# _tidypolars_methods = [
# 'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
# 'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
# 'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
# 'full_join', 'pivot_longer', 'pivot_wider',
# 'pull', 'relocate', 'rename', 'replace_null', 'select',
# 'separate', 'set_names',
# 'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
# 'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
# ]
# return _tidypolars_methods
def arrange(self, *args):
"""
Arrange/sort rows
Expand Down Expand Up @@ -174,7 +177,7 @@ def count(self, *args, sort = False, name = 'n'):

return out

def distinct(self, *args, **kwargs):
    """
    Select distinct/unique rows

    Parameters
    ----------
    *args : str, Expr
        Columns to find distinct/unique rows

    **kwargs : dict
        keep_all : bool
            If True (default), keep all columns. Otherwise, return
            only the ones used to select the distinct rows.

    Examples
    --------
    >>> df = tp.Tibble({'a': range(3), 'b': ['a', 'a', 'b']})
    >>> df.distinct()
    >>> df.distinct('b')
    """
    args = _as_list(args)
    keep_all = kwargs.get("keep_all", True)
    if len(args) == 0:
        # no columns given: unique over every column
        df = super().unique()
    else:
        # unique over the named columns, optionally dropping the rest
        df = super().unique(args)
        if not keep_all:
            df = df.select(args)
    return df.pipe(from_polars)

def drop(self, *args):
Expand Down Expand Up @@ -374,9 +386,7 @@ def left_join(self, df, left_on = None, right_on = None, on = None, suffix = '_r
on = list(set(self.names) & set(df.names))
return super().join(df, on, 'left', left_on = left_on, right_on= right_on, suffix= suffix).pipe(from_polars)

def mutate(self, *args,
by = None,
**kwargs):
def mutate(self, *args, by = None, **kwargs):
"""
Add or modify columns

Expand Down Expand Up @@ -539,7 +549,8 @@ def pivot_wider(self,

out = (
super()
.pivot(values_from, id_cols, names_from, values_fn)
# .pivot(values_from, id_cols, names_from, values_fn)
.pivot(index=id_cols, on=names_from, values=values_from)
.pipe(from_polars)
)

Expand Down Expand Up @@ -949,6 +960,110 @@ def write_parquet(self,
"""Write a data frame to a parquet"""
return super().write_parquet(file, compression = compression, use_pyarrow = use_pyarrow, **kwargs)

def group_by(self, group, *args, **kwargs):
    """
    Group the Tibble by one or more columns.

    Parameters
    ----------
    group : str, list
        Column name(s) to group by

    Examples
    --------
    >>> df.group_by('a').mutate(total = tp.sum('b'))
    """
    # NOTE(review): *args/**kwargs are accepted for call-site flexibility
    # but are currently ignored — confirm whether they should be forwarded.
    return TibbleGroupBy(self, group, maintain_order=True)

def nest(self, by, select='all', nested_name="data", *args, **kwargs):
    """
    Nest rows into a list-column of data frames

    Parameters
    ----------
    by : list, str
        Columns to nest on
    select : str, list
        Columns to select for the nested data frame. If 'all' (default)
        all columns except those specified in 'by' will be selected.
    nested_name : str
        Name of the column to receive the nested data frame

    Examples
    --------
    >>> df.nest('id')
    """
    # Normalize 'by' to a list of column names. With a bare string,
    # `col not in by` below would test substring containment rather than
    # column-name membership and silently drop the wrong columns.
    by = [by] if isinstance(by, str) else list(by)
    if select == 'all':
        select = [col for col in self.names if col not in by]
    else:
        # make sure all columns in 'by' are removed from the nested data
        select = [col for col in select if col not in by]
    out = (
        self
        .group_by(by)
        .agg(**{
            # pack the selected columns into structs, then rebuild a
            # Tibble per group as the nested value
            nested_name: pl.struct(select).map_elements(
                lambda cols: from_polars(pl.DataFrame(cols.to_list()))
            )
        })
    )
    return out.pipe(from_polars)

def unnest(self, col, *args, **kwargs):
    """
    Unnest a list-column of data frames back into rows and columns.

    Parameters
    ----------
    col : str
        Name of the nested column to unnest

    Examples
    --------
    >>> df.nest('id').unnest('data')
    """
    assert isinstance(col, str), "'col' must be a string"
    out = (
        self
        # turn each nested frame into a list of structs so polars'
        # explode/unnest can flatten it
        .mutate(**{col: pl.col(col).map_elements(lambda d: d.to_struct())})
        .to_polars()
        .explode(col)
        .unnest(col)
    )
    return out.pipe(from_polars)

def crossing(self, *args, **kwargs):
    """
    Expand the data set using lists of values. Each keyword argument is
    added as a column via `mutate` and then exploded, so every existing
    row is repeated once per value in the list.

    Parameters
    ----------
    *args : Expr
        Expressions forwarded to `mutate` (added but not exploded)
    **kwargs : list
        Named lists of values to cross with the existing rows

    Examples
    --------
    >>> df.crossing(rep = [1, 2, 3])
    """
    out = self.mutate(*args, **kwargs).to_polars()
    # explode each keyword column so rows multiply per listed value
    for var in kwargs:
        out = out.explode(var)
    return out.pipe(from_polars)

# Not tidy functions, but useful from pandas/polars
# -------------------------------------------------
def replace(self, *args, **kwargs):
    """
    Replace values, delegating to pandas' `DataFrame.replace`.

    The data is round-tripped polars -> pandas -> polars, so dtypes may
    be altered by the conversion; all arguments are forwarded unchanged
    to `pandas.DataFrame.replace`.

    Examples
    --------
    >>> df.replace({'a': {1: 10}})
    """
    out = (
        self
        .to_polars()
        .to_pandas()
        .replace(*args, **kwargs)
    )
    return out.pipe(from_pandas)

def print(self, nrows=1000, str_lenght=1000):
    """
    Print the DataFrame with expanded display limits.

    Parameters
    ----------
    nrows : int
        Maximum number of rows to display (default 1000)
    str_lenght : int
        Maximum number of characters shown per string value (default
        1000). NOTE: the parameter name is misspelled but is kept to
        stay backward compatible with existing callers.
    """
    # Override polars' display limits only for the scope of this print.
    with pl.Config(set_tbl_rows=nrows, fmt_str_lengths=str_lenght):
        print(self)

class TibbleGroupBy(pl.dataframe.group_by.GroupBy):
    """
    A grouped Tibble, as returned by `Tibble.group_by`.

    Wraps polars' GroupBy so tidypolars verbs (`mutate`, `filter`) can be
    applied within each group via `map_groups`.
    """

    def __init__(self, df, by, *args, **kwargs):
        # Validate early so the error points at the user's group_by call.
        assert isinstance(by, (str, list)), "Use list or string to group by."
        super().__init__(df, by, *args, **kwargs)
        self.df = df
        # Store the grouping keys uniformly as a list.
        self.by = by if isinstance(by, list) else [by]

    @property
    def _constructor(self):
        # Keep derived objects in the TibbleGroupBy type.
        return TibbleGroupBy

    def mutate(self, *args, **kwargs):
        """Apply `Tibble.mutate` within each group and reassemble."""
        out = self.map_groups(lambda g: from_polars(g).mutate(*args, **kwargs))
        return out

    def filter(self, *args, **kwargs):
        """Apply `Tibble.filter` within each group and reassemble."""
        out = self.map_groups(lambda g: from_polars(g).filter(*args, **kwargs))
        return out

def desc(x):
"""Mark a column to order in descending"""
x = copy.copy(x)
Expand Down Expand Up @@ -1027,7 +1142,7 @@ def from_pandas(df):
'null_count',
'quantile',
'rechunk',
'replace',
# 'replace',
'replace_at_idx',
'row',
'rows'
Expand All @@ -1050,7 +1165,7 @@ def from_pandas(df):
'to_pandas'
'to_parquet',
'transpose',
'unnest',
# 'unnest',
'var',
'width',
'with_column',
Expand Down