From 47f8b807ab2c524a340345b49a7b84486b4f75f2 Mon Sep 17 00:00:00 2001
From: diogoferrari <diogoferrari@gmail.com>
Date: Wed, 28 Aug 2024 12:32:49 -0700
Subject: [PATCH 1/7] Implementation of group_by+mutate

---
 tidypolars/tibble.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index c681fc1..1eefc34 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -949,6 +949,26 @@ def write_parquet(self,
         """Write a data frame to a parquet"""
         return super().write_parquet(file, compression = compression, use_pyarrow = use_pyarrow, **kwargs)
 
+    def group_by(self, group, *args, **kwargs):
+        res = TibbleGroupBy(self, group, maintain_order=True)
+        return res
+
+class TibbleGroupBy(pl.dataframe.group_by.GroupBy):
+
+    def __init__(self, df, by, *args, **kwargs):
+        assert isinstance(by, str) or isinstance(by, list), "Use list or string to group by."
+        super().__init__(df, by, *args, **kwargs)
+        self.df = df
+        self.by = by if isinstance(by, list) else list(by)
+
+    @property
+    def _constructor(self):
+        return TibbleGroupBy
+
+    def mutate(self, *args, **kwargs):
+        out = self.map_groups(lambda x: from_polars(x).mutate(*args, **kwargs))
+        return out
+
 def desc(x):
     """Mark a column to order in descending"""
     x = copy.copy(x)

From 2405f7665a1c129fc73b912f30172d3a21a2f372 Mon Sep 17 00:00:00 2001
From: diogoferrari <diogoferrari@gmail.com>
Date: Wed, 28 Aug 2024 16:03:06 -0700
Subject: [PATCH 2/7] Nest, unnest, and crossing implemented

---
 tidypolars/tibble.py | 73 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 6 deletions(-)

diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index 1eefc34..f004c21 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -13,6 +13,9 @@
 from .reexports import *
 from .tidyselect import everything
 from operator import not_
+# suppress polars' warning about missing return dtypes in the nested data operations
+import warnings
+warnings.filterwarnings("ignore", category=pl.exceptions.MapWithoutReturnDtypeWarning)
 
 __all__ = [
     "Tibble",
@@ -78,7 +81,7 @@ def __dir__(self):
             'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
         ]
         return _tidypolars_methods
-
+    
     def arrange(self, *args):
         """
         Arrange/sort rows
@@ -374,9 +377,7 @@ def left_join(self, df, left_on = None, right_on = None, on = None, suffix = '_r
             on = list(set(self.names) & set(df.names))
         return super().join(df, on, 'left',  left_on = left_on, right_on= right_on, suffix= suffix).pipe(from_polars)
 
-    def mutate(self, *args,
-               by = None,
-               **kwargs):
+    def mutate(self, *args, by = None, **kwargs):
         """
         Add or modify columns
 
@@ -953,13 +954,73 @@ def group_by(self, group, *args, **kwargs):
         res = TibbleGroupBy(self, group, maintain_order=True)
         return res
 
+    def nest(self, by, select='all', nested_name="data", *args, **kwargs):
+        """
+        Nest rows into a list-column of dataframes
+
+        Parameters
+        ----------
+        by : list, str
+            Columns to nest on
+        select : str, list
+            Columns to select for the nested dataframe. If 'all' (default)
+            all columns except those specified in 'by' will be selected.
+        nested_name : str
+            Name of the column to receive the nested dataframe
+
+        Examples
+        --------
+        """
+        if select=='all':
+            select = [col for col in self.names if col not in by]
+        # make sure all columns in 'by' are removed from the nested data
+        select = [col for col in select if col not in by]
+        out = (self
+               .group_by(by)
+               .agg(**{
+                   nested_name : pl.struct(select).map_elements(
+                       lambda cols: from_polars( pl.DataFrame(cols.to_list()) ) )
+                   })
+               )
+        return out.pipe(from_polars)
+    
+    def unnest(self, col, *args, **kwargs):
+        """
+        Unnest a nested data frame
+        Parameters
+        ----------
+        col : str
+            Columns to unnest
+
+        """
+        assert isinstance(col, str), "'col', must be a string"
+        out = (self
+               .mutate(**{
+                   col : pl.col(col).map_elements(lambda d: d.to_struct())
+               })
+               .to_polars()
+               .explode(col)
+               .unnest(col)
+               )
+        return out.pipe(from_polars)
+
+    def crossing(self, *args, **kwargs):
+        """
+        Expand the data set using lists of values. Each column passed as a
+        keyword argument is exploded so every list element gets its own row.
+        """
+        out = self.mutate(*args, **kwargs).to_polars()
+        for var,_ in kwargs.items():
+            out = out.explode(var)
+        return out.pipe(from_polars)
+            
 class TibbleGroupBy(pl.dataframe.group_by.GroupBy):
 
     def __init__(self, df, by, *args, **kwargs):
         assert isinstance(by, str) or isinstance(by, list), "Use list or string to group by."
         super().__init__(df, by, *args, **kwargs)
         self.df = df
-        self.by = by if isinstance(by, list) else list(by)
+        self.by = by if isinstance(by, list) else [by]
 
     @property
     def _constructor(self):
@@ -1070,7 +1131,7 @@ def from_pandas(df):
     'to_pandas'
     'to_parquet',
     'transpose',
-    'unnest',
+    # 'unnest',
     'var',
     'width',
     'with_column',

From 0b977f6520070078a3e8d557374e676ec430fa9e Mon Sep 17 00:00:00 2001
From: diogoferrari <diogoferrari@gmail.com>
Date: Thu, 29 Aug 2024 09:14:29 -0700
Subject: [PATCH 3/7] Add option keep_all to distinct() to match tidyverse

---
 tidypolars/tibble.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index f004c21..8ae84ef 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -177,7 +177,7 @@ def count(self, *args, sort = False, name = 'n'):
 
         return out
 
-    def distinct(self, *args):
+    def distinct(self, *args, **kwargs):
         """
         Select distinct/unique rows
 
@@ -186,6 +186,11 @@ def distinct(self, *args):
         *args : str, Expr
             Columns to find distinct/unique rows
 
+        **kwargs : dict
+            keep_all : bool
+              If True, keep all columns. Otherwise, return
+              only the ones used to select the distinct rows.
+
         Examples
         --------
         >>> df = tp.Tibble({'a': range(3), 'b': ['a', 'a', 'b']})
@@ -193,10 +198,14 @@ def distinct(self, *args):
         >>> df.distinct('b')
         """
         args = _as_list(args)
+        keep_all = kwargs.get("keep_all", True)
+        # 
         if len(args) == 0:
             df = super().unique()
         else:
-            df = super().select(args).unique()
+            df = super().unique(args)
+        if not keep_all and len(args) > 0:  # no columns named: keep the full unique frame
+            df = df.select(args)
         return df.pipe(from_polars)
 
     def drop(self, *args):

From 85f6353c5059337cb049e8d10fbdca24f760b81a Mon Sep 17 00:00:00 2001
From: diogoferrari <diogoferrari@gmail.com>
Date: Fri, 30 Aug 2024 08:55:51 -0700
Subject: [PATCH 4/7] Added filter for grouped data, replace from pandas, and
 commented dir

---
 tidypolars/tibble.py | 62 ++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 19 deletions(-)

diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index 8ae84ef..a00d34d 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -64,23 +64,23 @@ def __str__(self):
         df = self.to_polars()
         return df.__str__()
 
-    def __getattribute__(self, attr):
-        if attr in _polars_methods:
-            raise AttributeError
-        return pl.DataFrame.__getattribute__(self, attr)
-
-    def __dir__(self):
-        _tidypolars_methods = [
-            'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
-            'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
-            'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
-            'full_join', 'pivot_longer', 'pivot_wider',
-            'pull', 'relocate', 'rename', 'replace_null', 'select',
-            'separate', 'set_names',
-            'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
-            'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
-        ]
-        return _tidypolars_methods
+    # def __getattribute__(self, attr):
+    #     if attr in _polars_methods:
+    #         raise AttributeError
+    #     return pl.DataFrame.__getattribute__(self, attr)
+
+    # def __dir__(self):
+    #     _tidypolars_methods = [
+    #         'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
+    #         'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
+    #         'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
+    #         'full_join', 'pivot_longer', 'pivot_wider',
+    #         'pull', 'relocate', 'rename', 'replace_null', 'select',
+    #         'separate', 'set_names',
+    #         'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
+    #         'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
+    #     ]
+    #     return _tidypolars_methods
     
     def arrange(self, *args):
         """
@@ -1022,7 +1022,27 @@ def crossing(self, *args, **kwargs):
         for var,_ in kwargs.items():
             out = out.explode(var)
         return out.pipe(from_polars)
-            
+
+    # Not tidy functions, but useful from pandas/polars 
+    # -------------------------------------------------
+    def replace(self, *args, **kwargs):
+        """
+        Replace method from pandas
+        """
+        out = (self
+               .to_polars()
+               .to_pandas()
+               .replace(*args, **kwargs))
+        return out.pipe(from_pandas)
+        
+    def print(self, nrows=1000, str_lenght=1000):
+        """
+        Print the DataFrame
+        """
+        with pl.Config(set_tbl_rows=nrows,
+                       fmt_str_lengths=str_lenght):
+            print(self)
+        
 class TibbleGroupBy(pl.dataframe.group_by.GroupBy):
 
     def __init__(self, df, by, *args, **kwargs):
@@ -1039,6 +1059,10 @@ def mutate(self, *args, **kwargs):
         out = self.map_groups(lambda x: from_polars(x).mutate(*args, **kwargs))
         return out
 
+    def filter(self, *args, **kwargs):
+        out = self.map_groups(lambda x: from_polars(x).filter(*args, **kwargs))
+        return out
+
 def desc(x):
     """Mark a column to order in descending"""
     x = copy.copy(x)
@@ -1117,7 +1141,7 @@ def from_pandas(df):
     'null_count',
     'quantile',
     'rechunk',
-    'replace',
+    # 'replace',
     'replace_at_idx',
     'row',
     'rows'

From 36ef96003633e5d6986cac8bac81491ef558ebba Mon Sep 17 00:00:00 2001
From: diogoferrari <diogoferrari@gmail.com>
Date: Sun, 8 Sep 2024 18:29:23 -0700
Subject: [PATCH 5/7] Add as_factor function

---
 tidypolars/funs.py   | 18 +++++++++++++++++-
 tidypolars/tibble.py |  3 ++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tidypolars/funs.py b/tidypolars/funs.py
index 0f8147b..347ad61 100644
--- a/tidypolars/funs.py
+++ b/tidypolars/funs.py
@@ -37,7 +37,7 @@
     "is_nan", "is_not", "is_not_in", "is_not_null", "is_null",
 
     # Type conversion
-    "as_boolean", "as_float", "as_integer", "as_string",
+    "as_boolean", "as_float", "as_integer", "as_string", "as_factor",
     "cast"
 ]
 
@@ -131,6 +131,22 @@ def as_string(x):
     x = _col_expr(x)
     return x.cast(pl.Utf8)
 
+def as_factor(x, levels):
+    """
+    Convert to factor. Defaults to Utf8.
+
+    Parameters
+    ----------
+    x : Str
+        Column to operate on
+
+    Examples
+    --------
+    >>> df.mutate(string_x = tp.as_factor('x'))
+    """
+    x = _col_expr(x)
+    return x.cast(pl.Enum(levels))
+
 def abs(x):
     """
     Absolute value
diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index a00d34d..dea092f 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -549,7 +549,8 @@ def pivot_wider(self,
 
         out = (
             super()
-            .pivot(values_from, id_cols, names_from, values_fn)
+            # .pivot(values_from, id_cols, names_from, values_fn)
+            .pivot(index=id_cols, on=names_from, values=values_from)
             .pipe(from_polars)
         )
 

From 8d396f07dd14ec4f14530783f3b4478abb80f3f2 Mon Sep 17 00:00:00 2001
From: dafxy <diofxy@gmail.com>
Date: Wed, 11 Sep 2024 20:10:42 -0700
Subject: [PATCH 6/7] First pass review with bulk changes

---
 tidypolars/funs.py   |  55 ++++++++++++++++++-----
 tidypolars/tibble.py | 102 ++++++++++++++++++++++++++-----------------
 2 files changed, 107 insertions(+), 50 deletions(-)

diff --git a/tidypolars/funs.py b/tidypolars/funs.py
index 347ad61..4629d6c 100644
--- a/tidypolars/funs.py
+++ b/tidypolars/funs.py
@@ -37,7 +37,9 @@
     "is_nan", "is_not", "is_not_in", "is_not_null", "is_null",
 
     # Type conversion
-    "as_boolean", "as_float", "as_integer", "as_string", "as_factor",
+    "as_boolean", "as_logical", "as_float", "as_integer",
+    "as_string", "as_character",
+    "as_factor", "as_categorical",
     "cast"
 ]
 
@@ -69,16 +71,24 @@ def across(cols, fn = lambda x: x, names_prefix = None):
 
 def as_boolean(x):
     """
-    Convert to a boolean
+    Convert column to string. Alias to as_logical (R naming).
+    """
+    return as_logical(x)
+
+def as_logical(x):
+    """
+    Convert to a boolean (polars) or 'logical' (R naming)
 
     Parameters
     ----------
-    x : Expr
+    x : Str
         Column to operate on
 
     Examples
     --------
     >>> df.mutate(bool_x = tp.as_boolean(col('x')))
+    # or equivalently
+    >>> df.mutate(logical_x = tp.as_logical(col('x')))
     """
     x = _col_expr(x)
     return x.cast(pl.Boolean)
@@ -116,36 +126,61 @@ def as_integer(x):
     return x.cast(pl.Int64)
 
 def as_string(x):
+    '''
+    Convert column to string. Alias to as_character (R naming).
+    Equivalent to Utf8 type (polars)
+    '''
+    return as_character(x)
+
+def as_character(x):
     """
     Convert to string. Defaults to Utf8.
 
     Parameters
     ----------
-    x : Expr
+    x : Str 
         Column to operate on
 
     Examples
     --------
-    >>> df.mutate(string_x = tp.as_string(col('x')))
+    >>> df.mutate(string_x = tp.as_string('x'))
+    # or equivalently
+    >>> df.mutate(character_x = tp.as_character('x'))
     """
     x = _col_expr(x)
     return x.cast(pl.Utf8)
-
-def as_factor(x, levels):
+   
+def as_factor(x, levels = None):
     """
-    Convert to factor. Defaults to Utf8.
+    Convert to factor (R naming), equivalent to Enum or
+    Categorical (polars), depending on whether 'levels' is provided. 
 
     Parameters
     ----------
     x : Str
         Column to operate on
 
+    levels : list of str
+        Categories to use in the factor. The categories will be ordered
+        as they appear in the list. If None (default), it will
+        create an unordered factor (polars Categorical).
+
     Examples
     --------
-    >>> df.mutate(string_x = tp.as_factor('x'))
+    >>> df.mutate(factor_x = tp.as_factor('x'))
+    # or equivalently
+    >>> df.mutate(categorical_x = tp.as_categorical('x'))
     """
     x = _col_expr(x)
-    return x.cast(pl.Enum(levels))
+    if levels is None:
+        x = x.cast(pl.Categorical)
+    else:
+        x = x.cast(pl.Enum(levels))
+    return x
+
+def as_categorical(*args, **kwargs):
+    "Convert to factor. Alias for as_factor"
+    return as_factor(*args, **kwargs)
 
 def abs(x):
     """
diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index dea092f..0fd27b8 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -33,7 +33,7 @@ def __init__(self, _data = None, **kwargs):
         elif not_(isinstance(_data, dict)):
             raise ValueError("_data must be a dictionary or kwargs must be used")
         super().__init__(_data)
-    
+
     def __repr__(self):
         """Printing method"""
         df = self.to_polars()
@@ -64,23 +64,28 @@ def __str__(self):
         df = self.to_polars()
         return df.__str__()
 
-    # def __getattribute__(self, attr):
-    #     if attr in _polars_methods:
-    #         raise AttributeError
-    #     return pl.DataFrame.__getattribute__(self, attr)
-
-    # def __dir__(self):
-    #     _tidypolars_methods = [
-    #         'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
-    #         'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
-    #         'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
-    #         'full_join', 'pivot_longer', 'pivot_wider',
-    #         'pull', 'relocate', 'rename', 'replace_null', 'select',
-    #         'separate', 'set_names',
-    #         'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
-    #         'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
-    #     ]
-    #     return _tidypolars_methods
+    def __getattribute__(self, attr):
+        if attr in _polars_methods:
+            raise AttributeError
+        return pl.DataFrame.__getattribute__(self, attr)
+
+    def __dir__(self):
+        _tidypolars_methods = [
+            'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
+            'crossing',
+            'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
+            'group_by', 
+            'inner_join', 'left_join', 'mutate', 'names', 'nest',
+            'nrow', 'ncol',
+            'full_join', 'pivot_longer', 'pivot_wider', 'print',
+            'pull', 'relocate', 'rename',
+            'replace',
+            'replace_null', 'select',
+            'separate', 'set_names',
+            'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
+            'to_pandas', 'to_polars', 'unnest', 'write_csv', 'write_parquet'
+        ]
+        return _tidypolars_methods
     
     def arrange(self, *args):
         """
@@ -177,7 +182,7 @@ def count(self, *args, sort = False, name = 'n'):
 
         return out
 
-    def distinct(self, *args, **kwargs):
+    def distinct(self, *args, keep_all = False):
         """
         Select distinct/unique rows
 
@@ -198,7 +203,6 @@ def distinct(self, *args, **kwargs):
         >>> df.distinct('b')
         """
         args = _as_list(args)
-        keep_all = kwargs.get("keep_all", True)
         # 
         if len(args) == 0:
             df = super().unique()
@@ -963,8 +967,8 @@ def write_parquet(self,
     def group_by(self, group, *args, **kwargs):
         res = TibbleGroupBy(self, group, maintain_order=True)
         return res
-
-    def nest(self, by, select='all', nested_name="data", *args, **kwargs):
+    
+    def nest(self, by, *args, **kwargs):
         """
         Nest rows into a list-column of dataframes
 
@@ -972,29 +976,47 @@ def nest(self, by, select='all', nested_name="data", *args, **kwargs):
         ----------
         by : list, str
             Columns to nest on
-        select : str, list
-            Columns to select for the nested dataframe. If 'all' (default)
-            all columns except those specified in 'by' will be selected.
-        nested_name : str
-            Name of the column to receive the nested dataframe
+
+        kwargs :
+            data : list of column names
+               columns to select to include in the nested data
+               If not provided, include all columns except the ones
+               used in 'by'
+
+             key : str
+               name of the resulting nested column. 
+
+             names_sep : str
+                If not provided (default), the names in the nested
+                data will come from the former names. If a string,
+                the new inner names in the nested dataframe will use
+                the outer names with names_sep automatically stripped.
+                This makes names_sep roughly
+                symmetric between nesting and unnesting.
 
         Examples
         --------
         """
-        if select=='all':
-            select = [col for col in self.names if col not in by]
-        # make sure all columns in 'by' are removed from the nested data
-        select = [col for col in select if col not in by]
+        key  = kwargs.get("key", 'data')
+        data = kwargs.get("data", [c for c in self.names if c not in ([by] if isinstance(by, str) else by)])
+        names_sep = kwargs.get("names_sep", None)
+
         out = (self
                .group_by(by)
                .agg(**{
-                   nested_name : pl.struct(select).map_elements(
+                   key : pl.struct(data).map_elements(
                        lambda cols: from_polars( pl.DataFrame(cols.to_list()) ) )
-                   })
+               })
+               .pipe(from_polars)
                )
-        return out.pipe(from_polars)
-    
-    def unnest(self, col, *args, **kwargs):
+
+        if names_sep is not None:
+            # rename every nested column to "<name>_<names_sep>"
+            new_names = {c:f"{c}_{names_sep}" for c in data}
+            out = out.mutate(**{key:col(key).map_elements(lambda row: row.rename(new_names))})
+        return out
+
+    def unnest(self, col):
         """
         Unnest a nested data frame
         Parameters
@@ -1036,14 +1058,14 @@ def replace(self, *args, **kwargs):
                .replace(*args, **kwargs))
         return out.pipe(from_pandas)
         
-    def print(self, nrows=1000, str_lenght=1000):
+    def print(self, n=1000, str_length=1000):
         """
         Print the DataFrame
         """
-        with pl.Config(set_tbl_rows=nrows,
-                       fmt_str_lengths=str_lenght):
+        with pl.Config(set_tbl_rows=n,
+                       fmt_str_lengths=str_length):
             print(self)
-        
+            
 class TibbleGroupBy(pl.dataframe.group_by.GroupBy):
 
     def __init__(self, df, by, *args, **kwargs):

From 6bf5a9ce90954d2d53d644ce9f1ae876ff4119a6 Mon Sep 17 00:00:00 2001
From: dafxy <diofxy@gmail.com>
Date: Thu, 12 Sep 2024 15:42:59 -0700
Subject: [PATCH 7/7] Add summarize to group_by

---
 tidypolars/tibble.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tidypolars/tibble.py b/tidypolars/tibble.py
index 0fd27b8..a4bee15 100644
--- a/tidypolars/tibble.py
+++ b/tidypolars/tibble.py
@@ -1086,6 +1086,11 @@ def filter(self, *args, **kwargs):
         out = self.map_groups(lambda x: from_polars(x).filter(*args, **kwargs))
         return out
 
+    def summarize(self, *args, **kwargs):
+        out = self.map_groups(lambda x: from_polars(x).summarise(by=self.by, *args, **kwargs))
+        return out
+        
+
 def desc(x):
     """Mark a column to order in descending"""
     x = copy.copy(x)