Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of group_by+mutate #242

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion tidypolars/funs.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"is_nan", "is_not", "is_not_in", "is_not_null", "is_null",

# Type conversion
"as_boolean", "as_float", "as_integer", "as_string",
"as_boolean", "as_float", "as_integer", "as_string", "as_factor",
"cast"
]

Expand Down Expand Up @@ -131,6 +131,22 @@ def as_string(x):
x = _col_expr(x)
return x.cast(pl.Utf8)

def as_factor(x, levels):
    """
    Convert a column to an enumerated categorical (factor) with the
    given, ordered set of levels.

    Parameters
    ----------
    x : str, Expr
        Column to operate on
    levels : list
        Allowed category levels, in order; the column is cast to
        pl.Enum(levels)

    Examples
    --------
    >>> df.mutate(factor_x = tp.as_factor('x', ['a', 'b']))
    """
    x = _col_expr(x)
    return x.cast(pl.Enum(levels))

def abs(x):
"""
Absolute value
Expand Down
167 changes: 141 additions & 26 deletions tidypolars/tibble.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from .reexports import *
from .tidyselect import everything
from operator import not_
# to suppress polars' warning about dtype in the nested data operations
import warnings
warnings.filterwarnings("ignore", category=pl.exceptions.MapWithoutReturnDtypeWarning)

__all__ = [
"Tibble",
Expand Down Expand Up @@ -61,24 +64,24 @@ def __str__(self):
df = self.to_polars()
return df.__str__()

def __getattribute__(self, attr):
    # Hide selected inherited polars DataFrame methods from the Tibble
    # API: names listed in _polars_methods raise AttributeError instead
    # of resolving on the polars base class.
    if attr in _polars_methods:
        raise AttributeError
    return pl.DataFrame.__getattribute__(self, attr)

def __dir__(self):
    # Advertise only the tidypolars verbs during introspection /
    # tab-completion, hiding the inherited polars DataFrame surface.
    _tidypolars_methods = [
        'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
        'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
        'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
        'full_join', 'pivot_longer', 'pivot_wider',
        'pull', 'relocate', 'rename', 'replace_null', 'select',
        'separate', 'set_names',
        'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
        'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
    ]
    return _tidypolars_methods

# def __getattribute__(self, attr):
dafxy marked this conversation as resolved.
Show resolved Hide resolved
# if attr in _polars_methods:
# raise AttributeError
# return pl.DataFrame.__getattribute__(self, attr)

# def __dir__(self):
# _tidypolars_methods = [
# 'arrange', 'bind_cols', 'bind_rows', 'colnames', 'clone', 'count',
# 'distinct', 'drop', 'drop_null', 'head', 'fill', 'filter',
# 'inner_join', 'left_join', 'mutate', 'names', 'nrow', 'ncol',
# 'full_join', 'pivot_longer', 'pivot_wider',
# 'pull', 'relocate', 'rename', 'replace_null', 'select',
# 'separate', 'set_names',
# 'slice', 'slice_head', 'slice_tail', 'summarize', 'tail',
# 'to_pandas', 'to_polars', 'write_csv', 'write_parquet'
# ]
# return _tidypolars_methods
def arrange(self, *args):
"""
Arrange/sort rows
Expand Down Expand Up @@ -174,7 +177,7 @@ def count(self, *args, sort = False, name = 'n'):

return out

def distinct(self, *args, **kwargs):
    """
    Select distinct/unique rows

    Parameters
    ----------
    *args : str, Expr
        Columns to find distinct/unique rows

    **kwargs : dict
        keep_all : bool
            If True (default), keep all columns. Otherwise, return
            only the ones used to select the distinct rows.

    Examples
    --------
    >>> df = tp.Tibble({'a': range(3), 'b': ['a', 'a', 'b']})
    >>> df.distinct()
    >>> df.distinct('b')
    """
    args = _as_list(args)
    keep_all = kwargs.get("keep_all", True)
    if len(args) == 0:
        # no columns given: unique over every column
        df = super().unique()
    else:
        # unique over the named columns, optionally dropping the rest
        df = super().unique(args)
        if not keep_all:
            df = df.select(args)
    return df.pipe(from_polars)

def drop(self, *args):
Expand Down Expand Up @@ -374,9 +386,7 @@ def left_join(self, df, left_on = None, right_on = None, on = None, suffix = '_r
on = list(set(self.names) & set(df.names))
return super().join(df, on, 'left', left_on = left_on, right_on= right_on, suffix= suffix).pipe(from_polars)

def mutate(self, *args,
by = None,
**kwargs):
def mutate(self, *args, by = None, **kwargs):
"""
Add or modify columns

Expand Down Expand Up @@ -539,7 +549,8 @@ def pivot_wider(self,

out = (
super()
.pivot(values_from, id_cols, names_from, values_fn)
# .pivot(values_from, id_cols, names_from, values_fn)
.pivot(index=id_cols, on=names_from, values=values_from)
.pipe(from_polars)
)

Expand Down Expand Up @@ -949,6 +960,110 @@ def write_parquet(self,
"""Write a data frame to a parquet"""
return super().write_parquet(file, compression = compression, use_pyarrow = use_pyarrow, **kwargs)

def group_by(self, group, *args, **kwargs):
    """
    Group the Tibble by one or more columns.

    Parameters
    ----------
    group : str, list
        Column name(s) to group by

    Examples
    --------
    >>> df.group_by('a').mutate(total = tp.sum('b'))
    """
    # NOTE(review): *args/**kwargs are accepted for call-site flexibility
    # but are currently ignored — confirm whether they should be forwarded.
    return TibbleGroupBy(self, group, maintain_order=True)

def nest(self, by, select='all', nested_name="data", *args, **kwargs):
    """
    Nest rows into a list-column of data frames

    Parameters
    ----------
    by : list, str
        Columns to nest on
    select : str, list
        Columns to select for the nested data frame. If 'all' (default)
        all columns except those specified in 'by' will be selected.
    nested_name : str
        Name of the column to receive the nested data frame

    Examples
    --------
    >>> df.nest('id')
    """
    # Normalize 'by' to a list of column names. With a bare string,
    # `col not in by` below would test substring containment rather than
    # column-name membership and silently drop the wrong columns.
    by = [by] if isinstance(by, str) else list(by)
    if select == 'all':
        select = [col for col in self.names if col not in by]
    else:
        # make sure all columns in 'by' are removed from the nested data
        select = [col for col in select if col not in by]
    out = (
        self
        .group_by(by)
        .agg(**{
            # pack the selected columns into structs, then rebuild a
            # Tibble per group as the nested value
            nested_name: pl.struct(select).map_elements(
                lambda cols: from_polars(pl.DataFrame(cols.to_list()))
            )
        })
    )
    return out.pipe(from_polars)

def unnest(self, col, *args, **kwargs):
    """
    Unnest a list-column of data frames back into rows and columns.

    Parameters
    ----------
    col : str
        Name of the nested column to unnest

    Examples
    --------
    >>> df.nest('id').unnest('data')
    """
    assert isinstance(col, str), "'col' must be a string"
    out = (
        self
        # turn each nested frame into a list of structs so polars'
        # explode/unnest can flatten it
        .mutate(**{col: pl.col(col).map_elements(lambda d: d.to_struct())})
        .to_polars()
        .explode(col)
        .unnest(col)
    )
    return out.pipe(from_polars)

def crossing(self, *args, **kwargs):
    """
    Expand the data set using lists of values. Each keyword argument is
    added as a column via `mutate` and then exploded, so every existing
    row is repeated once per value in the list.

    Parameters
    ----------
    *args : Expr
        Expressions forwarded to `mutate` (added but not exploded)
    **kwargs : list
        Named lists of values to cross with the existing rows

    Examples
    --------
    >>> df.crossing(rep = [1, 2, 3])
    """
    out = self.mutate(*args, **kwargs).to_polars()
    # explode each keyword column so rows multiply per listed value
    for var in kwargs:
        out = out.explode(var)
    return out.pipe(from_polars)

# Not tidy functions, but useful from pandas/polars
# -------------------------------------------------
def replace(self, *args, **kwargs):
    """
    Replace values, delegating to pandas' `DataFrame.replace`.

    The data is round-tripped polars -> pandas -> polars, so dtypes may
    be altered by the conversion; all arguments are forwarded unchanged
    to `pandas.DataFrame.replace`.

    Examples
    --------
    >>> df.replace({'a': {1: 10}})
    """
    out = (
        self
        .to_polars()
        .to_pandas()
        .replace(*args, **kwargs)
    )
    return out.pipe(from_pandas)

def print(self, nrows=1000, str_lenght=1000):
    """
    Print the DataFrame with expanded display limits.

    Parameters
    ----------
    nrows : int
        Maximum number of rows to display (default 1000)
    str_lenght : int
        Maximum number of characters shown per string value (default
        1000). NOTE: the parameter name is misspelled but is kept to
        stay backward compatible with existing callers.
    """
    # Override polars' display limits only for the scope of this print.
    with pl.Config(set_tbl_rows=nrows, fmt_str_lengths=str_lenght):
        print(self)

class TibbleGroupBy(pl.dataframe.group_by.GroupBy):
    """
    A grouped Tibble, as returned by `Tibble.group_by`.

    Wraps polars' GroupBy so tidypolars verbs (`mutate`, `filter`) can be
    applied within each group via `map_groups`.
    """

    def __init__(self, df, by, *args, **kwargs):
        # Validate early so the error points at the user's group_by call.
        assert isinstance(by, (str, list)), "Use list or string to group by."
        super().__init__(df, by, *args, **kwargs)
        self.df = df
        # Store the grouping keys uniformly as a list.
        self.by = by if isinstance(by, list) else [by]

    @property
    def _constructor(self):
        # Keep derived objects in the TibbleGroupBy type.
        return TibbleGroupBy

    def mutate(self, *args, **kwargs):
        """Apply `Tibble.mutate` within each group and reassemble."""
        out = self.map_groups(lambda g: from_polars(g).mutate(*args, **kwargs))
        return out

    def filter(self, *args, **kwargs):
        """Apply `Tibble.filter` within each group and reassemble."""
        out = self.map_groups(lambda g: from_polars(g).filter(*args, **kwargs))
        return out

def desc(x):
"""Mark a column to order in descending"""
x = copy.copy(x)
Expand Down Expand Up @@ -1027,7 +1142,7 @@ def from_pandas(df):
'null_count',
'quantile',
'rechunk',
'replace',
# 'replace',
'replace_at_idx',
'row',
'rows'
Expand All @@ -1050,7 +1165,7 @@ def from_pandas(df):
'to_pandas'
'to_parquet',
'transpose',
'unnest',
# 'unnest',
'var',
'width',
'with_column',
Expand Down