From 47f8b807ab2c524a340345b49a7b84486b4f75f2 Mon Sep 17 00:00:00 2001 From: diogoferrari Date: Wed, 28 Aug 2024 12:32:49 -0700 Subject: [PATCH 1/7] Implementation of group_by+mutate --- tidypolars/tibble.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index c681fc1..1eefc34 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -949,6 +949,26 @@ def write_parquet(self, """Write a data frame to a parquet""" return super().write_parquet(file, compression = compression, use_pyarrow = use_pyarrow, **kwargs) + def group_by(self, group, *args, **kwargs): + res = TibbleGroupBy(self, group, maintain_order=True) + return res + +class TibbleGroupBy(pl.dataframe.group_by.GroupBy): + + def __init__(self, df, by, *args, **kwargs): + assert isinstance(by, str) or isinstance(by, list), "Use list or string to group by." + super().__init__(df, by, *args, **kwargs) + self.df = df + self.by = by if isinstance(by, list) else list(by) + + @property + def _constructor(self): + return TibbleGroupBy + + def mutate(self, *args, **kwargs): + out = self.map_groups(lambda x: from_polars(x).mutate(*args, **kwargs)) + return out + def desc(x): """Mark a column to order in descending""" x = copy.copy(x) From 2405f7665a1c129fc73b912f30172d3a21a2f372 Mon Sep 17 00:00:00 2001 From: diogoferrari Date: Wed, 28 Aug 2024 16:03:06 -0700 Subject: [PATCH 2/7] Nest, unnest, and crossing implemented --- tidypolars/tibble.py | 73 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index 1eefc34..f004c21 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -13,6 +13,9 @@ from .reexports import * from .tidyselect import everything from operator import not_ +# to suppress polars' warning of dtype in the nested data operations +import warnings +warnings.filterwarnings("ignore", 
category=pl.exceptions.MapWithoutReturnDtypeWarning) __all__ = [ "Tibble", @@ -78,7 +81,7 @@ def __dir__(self): 'to_pandas', 'to_polars', 'write_csv', 'write_parquet' ] return _tidypolars_methods - + def arrange(self, *args): """ Arrange/sort rows @@ -374,9 +377,7 @@ def left_join(self, df, left_on = None, right_on = None, on = None, suffix = '_r on = list(set(self.names) & set(df.names)) return super().join(df, on, 'left', left_on = left_on, right_on= right_on, suffix= suffix).pipe(from_polars) - def mutate(self, *args, - by = None, - **kwargs): + def mutate(self, *args, by = None, **kwargs): """ Add or modify columns @@ -953,13 +954,73 @@ def group_by(self, group, *args, **kwargs): res = TibbleGroupBy(self, group, maintain_order=True) return res + def nest(self, by, select='all', nested_name="data", *args, **kwargs): + """ + Nest rows into a list-column of dataframes + + Parameters + ---------- + by : list, str + Columns to nest on + select : str, list + Columns to select for the nested dataframe. If 'all' (default) + all columns except those specified in 'by' will be selected. 
+ nested_name : str + Name of the column to receive the nested dataframe + + Examples + -------- + """ + if select=='all': + select = [col for col in self.names if col not in by] + # make sure all columns in 'by' are removed from the nested data + select = [col for col in select if col not in by] + out = (self + .group_by(by) + .agg(**{ + nested_name : pl.struct(select).map_elements( + lambda cols: from_polars( pl.DataFrame(cols.to_list()) ) ) + }) + ) + return out.pipe(from_polars) + + def unnest(self, col, *args, **kwargs): + """ + Unnest a nested data frame + Parameters + ---------- + col : str + Columns to unnest + + """ + assert isinstance(col, str), "'col', must be a string" + out = (self + .mutate(**{ + col : pl.col(col).map_elements(lambda d: d.to_struct()) + }) + .to_polars() + .explode(col) + .unnest(col) + ) + return out.pipe(from_polars) + + def crossing(self, *args, **kwargs): + """ + Expand the data set using a list of values. Each value in the + list + """ + out = self.mutate(*args, **kwargs).to_polars() + for var,_ in kwargs.items(): + out = out.explode(var) + return out.pipe(from_polars) + class TibbleGroupBy(pl.dataframe.group_by.GroupBy): def __init__(self, df, by, *args, **kwargs): assert isinstance(by, str) or isinstance(by, list), "Use list or string to group by." 
super().__init__(df, by, *args, **kwargs) self.df = df - self.by = by if isinstance(by, list) else list(by) + self.by = by if isinstance(by, list) else [by] @property def _constructor(self): @@ -1070,7 +1131,7 @@ def from_pandas(df): 'to_pandas' 'to_parquet', 'transpose', - 'unnest', + # 'unnest', 'var', 'width', 'with_column', From 0b977f6520070078a3e8d557374e676ec430fa9e Mon Sep 17 00:00:00 2001 From: diogoferrari Date: Thu, 29 Aug 2024 09:14:29 -0700 Subject: [PATCH 3/7] Add option keep_all to distinct() to match tidyverse --- tidypolars/tibble.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index f004c21..8ae84ef 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -177,7 +177,7 @@ def count(self, *args, sort = False, name = 'n'): return out - def distinct(self, *args): + def distinct(self, *args, **kwargs): """ Select distinct/unique rows @@ -186,6 +186,11 @@ def distinct(self, *args): *args : str, Expr Columns to find distinct/unique rows + **kwargs : dict + keep_all : bool + If True (default), keep all columns. Otherwise, return + only the ones used to select the distinct rows. 
+ Examples -------- >>> df = tp.Tibble({'a': range(3), 'b': ['a', 'a', 'b']}) @@ -193,10 +198,14 @@ def distinct(self, *args): >>> df.distinct('b') """ args = _as_list(args) + keep_all = kwargs.get("keep_all", True) + # if len(args) == 0: df = super().unique() else: - df = super().select(args).unique() + df = super().unique(args) + if not keep_all: + df = df.select(args) return df.pipe(from_polars) def drop(self, *args): From 85f6353c5059337cb049e8d10fbdca24f760b81a Mon Sep 17 00:00:00 2001 From: diogoferrari Date: Fri, 30 Aug 2024 08:55:51 -0700 Subject: [PATCH 4/7] Added filter for grouped data, replace from pandas, and commented dir --- tidypolars/tibble.py | 62 ++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index 8ae84ef..a00d34d 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -64,23 +64,23 @@ def __str__(self): df = self.to_polars() return df.__str__() - def __getattribute__(self, attr): - if attr in _polars_methods: - raise AttributeError - return pl.DataFrame.__getattribute__(self, attr) - - def __dir__(self): - _tidypolars_methods = [ - 'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count', - 'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter', - 'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol', - 'full_join', 'pivot_longer', 'pivot_wider', - 'pull', 'relocate', 'rename', 'replace_null', 'select', - 'separate', 'set_names', - 'slice', 'slice_head', 'slice_tail', 'summarize', 'tail', - 'to_pandas', 'to_polars', 'write_csv', 'write_parquet' - ] - return _tidypolars_methods + # def __getattribute__(self, attr): + # if attr in _polars_methods: + # raise AttributeError + # return pl.DataFrame.__getattribute__(self, attr) + + # def __dir__(self): + # _tidypolars_methods = [ + # 'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count', + # 'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter', + # 
'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol', + # 'full_join', 'pivot_longer', 'pivot_wider', + # 'pull', 'relocate', 'rename', 'replace_null', 'select', + # 'separate', 'set_names', + # 'slice', 'slice_head', 'slice_tail', 'summarize', 'tail', + # 'to_pandas', 'to_polars', 'write_csv', 'write_parquet' + # ] + # return _tidypolars_methods def arrange(self, *args): """ @@ -1022,7 +1022,27 @@ def crossing(self, *args, **kwargs): for var,_ in kwargs.items(): out = out.explode(var) return out.pipe(from_polars) - + + # Not tidy functions, but useful from pandas/polars + # ------------------------------------------------- + def replace(self, *args, **kwargs): + """ + Replace method from pandas + """ + out = (self + .to_polars() + .to_pandas() + .replace(*args, **kwargs)) + return out.pipe(from_pandas) + + def print(self, nrows=1000, str_lenght=1000): + """ + Print the DataFrame + """ + with pl.Config(set_tbl_rows=nrows, + fmt_str_lengths=str_lenght): + print(self) + class TibbleGroupBy(pl.dataframe.group_by.GroupBy): def __init__(self, df, by, *args, **kwargs): @@ -1039,6 +1059,10 @@ def mutate(self, *args, **kwargs): out = self.map_groups(lambda x: from_polars(x).mutate(*args, **kwargs)) return out + def filter(self, *args, **kwargs): + out = self.map_groups(lambda x: from_polars(x).filter(*args, **kwargs)) + return out + def desc(x): """Mark a column to order in descending""" x = copy.copy(x) @@ -1117,7 +1141,7 @@ def from_pandas(df): 'null_count', 'quantile', 'rechunk', - 'replace', + # 'replace', 'replace_at_idx', 'row', 'rows' From 36ef96003633e5d6986cac8bac81491ef558ebba Mon Sep 17 00:00:00 2001 From: diogoferrari Date: Sun, 8 Sep 2024 18:29:23 -0700 Subject: [PATCH 5/7] Add as_factor function --- tidypolars/funs.py | 18 +++++++++++++++++- tidypolars/tibble.py | 3 ++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tidypolars/funs.py b/tidypolars/funs.py index 0f8147b..347ad61 100644 --- a/tidypolars/funs.py +++ b/tidypolars/funs.py 
@@ -37,7 +37,7 @@ "is_nan", "is_not", "is_not_in", "is_not_null", "is_null", # Type conversion - "as_boolean", "as_float", "as_integer", "as_string", + "as_boolean", "as_float", "as_integer", "as_string", "as_factor", "cast" ] @@ -131,6 +131,22 @@ def as_string(x): x = _col_expr(x) return x.cast(pl.Utf8) +def as_factor(x, levels): + """ + Convert to factor. Defaults to Utf8. + + Parameters + ---------- + x : Str + Column to operate on + + Examples + -------- + >>> df.mutate(string_x = tp.as_factor('x')) + """ + x = _col_expr(x) + return x.cast(pl.Enum(levels)) + def abs(x): """ Absolute value diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index a00d34d..dea092f 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -549,7 +549,8 @@ def pivot_wider(self, out = ( super() - .pivot(values_from, id_cols, names_from, values_fn) + # .pivot(values_from, id_cols, names_from, values_fn) + .pivot(index=id_cols, on=names_from, values=values_from) .pipe(from_polars) ) From 8d396f07dd14ec4f14530783f3b4478abb80f3f2 Mon Sep 17 00:00:00 2001 From: dafxy Date: Wed, 11 Sep 2024 20:10:42 -0700 Subject: [PATCH 6/7] First pass review with bulk changes --- tidypolars/funs.py | 55 ++++++++++++++++++----- tidypolars/tibble.py | 102 ++++++++++++++++++++++++++----------------- 2 files changed, 107 insertions(+), 50 deletions(-) diff --git a/tidypolars/funs.py b/tidypolars/funs.py index 347ad61..4629d6c 100644 --- a/tidypolars/funs.py +++ b/tidypolars/funs.py @@ -37,7 +37,9 @@ "is_nan", "is_not", "is_not_in", "is_not_null", "is_null", # Type conversion - "as_boolean", "as_float", "as_integer", "as_string", "as_factor", + "as_boolean", "as_logical", "as_float", "as_integer", + "as_string", "as_character", + "as_factor", "as_categorical", "cast" ] @@ -69,16 +71,24 @@ def across(cols, fn = lambda x: x, names_prefix = None): def as_boolean(x): """ - Convert to a boolean + Convert column to boolean. Alias to as_logical (R naming). 
+ """ + return as_logical(x) + +def as_logical(x): + """ + Convert to a boolean (polars) or 'logical' (R naming) Parameters ---------- - x : Expr + x : Str Column to operate on Examples -------- >>> df.mutate(bool_x = tp.as_boolean(col('x'))) + # or equivalently + >>> df.mutate(logical_x = tp.as_logical(col('x'))) """ x = _col_expr(x) return x.cast(pl.Boolean) @@ -116,36 +126,61 @@ def as_integer(x): return x.cast(pl.Int64) def as_string(x): + ''' + Convert column to string. Alias to as_character (R naming). + Equivalent to Utf8 type (polars) + ''' + return as_character(x) + +def as_character(x): """ Convert to string. Defaults to Utf8. Parameters ---------- - x : Expr + x : Str Column to operate on Examples -------- - >>> df.mutate(string_x = tp.as_string(col('x'))) + >>> df.mutate(string_x = tp.as_string('x')) + # or equivalently + >>> df.mutate(character_x = tp.as_character('x')) """ x = _col_expr(x) return x.cast(pl.Utf8) - -def as_factor(x, levels): + +def as_factor(x, levels = None): """ - Convert to factor. Defaults to Utf8. + Convert to factor (R naming), equivalent to Enum or + Categorical (polars), depending on whether 'levels' is provided. Parameters ---------- x : Str Column to operate on + levels : list of str + Categories to use in the factor. The categories will be ordered + as they appear in the list. If None (default), it will + create an unordered factor (polars Categorical). + Examples -------- - >>> df.mutate(string_x = tp.as_factor('x')) + >>> df.mutate(factor_x = tp.as_factor('x')) + # or equivalently + >>> df.mutate(categorical_x = tp.as_categorical('x')) """ x = _col_expr(x) - return x.cast(pl.Enum(levels)) + if levels is None: + x = x.cast(pl.Categorical) + else: + x = x.cast(pl.Enum(levels)) + return x + +def as_categorical(*args, **kwargs): + "Convert to factor. 
Alias for as_factor" + return as_factor(*args, **kwargs) def abs(x): """ diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index dea092f..0fd27b8 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -33,7 +33,7 @@ def __init__(self, _data = None, **kwargs): elif not_(isinstance(_data, dict)): raise ValueError("_data must be a dictionary or kwargs must be used") super().__init__(_data) - + def __repr__(self): """Printing method""" df = self.to_polars() @@ -64,23 +64,28 @@ def __str__(self): df = self.to_polars() return df.__str__() - # def __getattribute__(self, attr): - # if attr in _polars_methods: - # raise AttributeError - # return pl.DataFrame.__getattribute__(self, attr) - - # def __dir__(self): - # _tidypolars_methods = [ - # 'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count', - # 'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter', - # 'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol', - # 'full_join', 'pivot_longer', 'pivot_wider', - # 'pull', 'relocate', 'rename', 'replace_null', 'select', - # 'separate', 'set_names', - # 'slice', 'slice_head', 'slice_tail', 'summarize', 'tail', - # 'to_pandas', 'to_polars', 'write_csv', 'write_parquet' - # ] - # return _tidypolars_methods + def __getattribute__(self, attr): + if attr in _polars_methods: + raise AttributeError + return pl.DataFrame.__getattribute__(self, attr) + + def __dir__(self): + _tidypolars_methods = [ + 'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count', + 'crossing', + 'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter', + 'group_by', + 'inner_join', 'left_join', 'mutate', 'names', 'nest', + 'nrow', 'ncol', + 'full_join', 'pivot_longer', 'pivot_wider', 'print', + 'pull', 'relocate', 'rename', + 'replace', + 'replace_null', 'select', + 'separate', 'set_names', + 'slice', 'slice_head', 'slice_tail', 'summarize', 'tail', + 'to_pandas', 'to_polars', 'unnest', 'write_csv', 'write_parquet' + ] + return _tidypolars_methods def 
arrange(self, *args): """ @@ -177,7 +182,7 @@ def count(self, *args, sort = False, name = 'n'): return out - def distinct(self, *args, **kwargs): + def distinct(self, *args, keep_all = False): """ Select distinct/unique rows @@ -198,7 +203,6 @@ def distinct(self, *args, **kwargs): >>> df.distinct('b') """ args = _as_list(args) - keep_all = kwargs.get("keep_all", True) # if len(args) == 0: df = super().unique() @@ -963,8 +967,8 @@ def write_parquet(self, def group_by(self, group, *args, **kwargs): res = TibbleGroupBy(self, group, maintain_order=True) return res - - def nest(self, by, select='all', nested_name="data", *args, **kwargs): + + def nest(self, by, *args, **kwargs): """ Nest rows into a list-column of dataframes @@ -972,29 +976,47 @@ def nest(self, by, select='all', nested_name="data", *args, **kwargs): ---------- by : list, str Columns to nest on - select : str, list - Columns to select for the nested dataframe. If 'all' (default) - all columns except those specified in 'by' will be selected. - nested_name : str - Name of the column to receive the nested dataframe + + kwargs : + data : list of column names + columns to select to include in the nested data + If not provided, include all columns except the ones + used in 'by' + + key : str + name of the resulting nested column. + + names_sep : str + If not provided (default), the names in the nested + data will come from the former names. If a string, + the new inner names in the nested dataframe will use + the outer names with names_sep automatically stripped. + This makes names_sep roughly + symmetric between nesting and unnesting. 
Examples -------- """ - if select=='all': - select = [col for col in self.names if col not in by] - # make sure all columns in 'by' are removed from the nested data - select = [col for col in select if col not in by] + key = kwargs.get("key", 'data') + data = kwargs.get("data", [c for c in self.names if c not in by]) + names_sep = kwargs.get("names_sep", None) + out = (self .group_by(by) .agg(**{ - nested_name : pl.struct(select).map_elements( + key : pl.struct(data).map_elements( lambda cols: from_polars( pl.DataFrame(cols.to_list()) ) ) - }) + }) + .pipe(from_polars) ) - return out.pipe(from_polars) - - def unnest(self, col, *args, **kwargs): + + if names_sep is not None: + new_names = {col:f"{col}_{names_sep}" for col in data} + print(new_names) + out = out.mutate(**{key:col(key).map_elements(lambda row: row.rename(new_names))}) + return out + + def unnest(self, col): """ Unnest a nested data frame Parameters @@ -1036,14 +1058,14 @@ def replace(self, *args, **kwargs): .replace(*args, **kwargs)) return out.pipe(from_pandas) - def print(self, nrows=1000, str_lenght=1000): + def print(self, n=1000, str_length=1000): """ Print the DataFrame """ - with pl.Config(set_tbl_rows=nrows, - fmt_str_lengths=str_lenght): + with pl.Config(set_tbl_rows=n, + fmt_str_lengths=str_length): print(self) - + class TibbleGroupBy(pl.dataframe.group_by.GroupBy): def __init__(self, df, by, *args, **kwargs): From 6bf5a9ce90954d2d53d644ce9f1ae876ff4119a6 Mon Sep 17 00:00:00 2001 From: dafxy Date: Thu, 12 Sep 2024 15:42:59 -0700 Subject: [PATCH 7/7] Add summarize to group_by --- tidypolars/tibble.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py index 0fd27b8..a4bee15 100644 --- a/tidypolars/tibble.py +++ b/tidypolars/tibble.py @@ -1086,6 +1086,11 @@ def filter(self, *args, **kwargs): out = self.map_groups(lambda x: from_polars(x).filter(*args, **kwargs)) return out + def summarize(self, *args, **kwargs): + out = self.map_groups(lambda x: 
from_polars(x).summarise(by=self.by, *args, **kwargs)) + return out + + def desc(x): """Mark a column to order in descending""" x = copy.copy(x)