Unifying the API parameter names (#323)

* Unify the API on `nbins` instead of `bins`, and add `relation_type` as an alias for `method` in `drop`
* nbins
vertica · May 6, 2022 · 7e674b9 · 7e674b9
1 parent b4aae69
commit 7e674b9
Show file tree

Hide file tree

Showing 8 changed files with 62 additions and 60 deletions.
diff --git a/verticapy/datasets.py b/verticapy/datasets.py
@@ -200,13 +200,13 @@ def gen_meshgrid(features_ranges: dict):
 
     for idx, param in enumerate(features_ranges):
 
-        bins = 100
+        nbins = 100
         if "nbins" in features_ranges[param]:
-            bins = features_ranges[param]["nbins"]
+            nbins = features_ranges[param]["nbins"]
         ts_table = (
             f"(SELECT DAY(tm - '03-11-1993'::TIMESTAMP) AS tm FROM "
             "(SELECT '03-11-1993'::TIMESTAMP AS t UNION ALL SELECT"
-            f" '03-11-1993'::TIMESTAMP + INTERVAL '{bins} days' AS t)"
+            f" '03-11-1993'::TIMESTAMP + INTERVAL '{nbins} days' AS t)"
             " x TIMESERIES tm AS '1 day' OVER(ORDER BY t)) y"
         )
 
@@ -226,7 +226,7 @@ def gen_meshgrid(features_ranges: dict):
         elif features_ranges[param]["type"] == float:
             val = features_ranges[param]["range"]
             lower, upper = val[0], val[1]
-            h = (upper - lower) / bins
+            h = (upper - lower) / nbins
             sql += [
                 f'(SELECT ({lower} + {h} * tm)::FLOAT AS "{param}" '
                 f"FROM {ts_table}) x{idx}"
@@ -235,7 +235,7 @@ def gen_meshgrid(features_ranges: dict):
         elif features_ranges[param]["type"] == int:
             val = features_ranges[param]["range"]
             lower, upper = val[0], val[1]
-            h = (upper - lower) / bins
+            h = (upper - lower) / nbins
             sql += [
                 f'(SELECT ({lower} + {h} * tm)::INT AS "{param}" '
                 f"FROM {ts_table}) x{idx}"
@@ -244,7 +244,7 @@ def gen_meshgrid(features_ranges: dict):
         elif features_ranges[param]["type"] == datetime.date:
             val = features_ranges[param]["range"]
             start_date, number_of_days = val[0], val[1]
-            h = number_of_days / bins
+            h = number_of_days / nbins
             sql += [
                 f"(SELECT ('{start_date}'::DATE + {h} * tm)::DATE"
                 f' AS "{param}" FROM {ts_table}) x{idx}'
@@ -253,7 +253,7 @@ def gen_meshgrid(features_ranges: dict):
         elif features_ranges[param]["type"] == datetime.datetime:
             val = features_ranges[param]["range"]
             start_date, number_of_days = val[0], val[1]
-            h = number_of_days / bins
+            h = number_of_days / nbins
             sql += [
                 f"(SELECT ('{start_date}'::DATE + {h} * tm)::TIMESTAMP "
                 f'AS "{param}" FROM {ts_table}) x{idx}'

diff --git a/verticapy/learn/delphi.py b/verticapy/learn/delphi.py
@@ -330,7 +330,7 @@ def fit(
                     ):
                         vdf[elem].discretize(
                             method=self.parameters["num_method"],
-                            bins=self.parameters["nbins"],
+                            nbins=self.parameters["nbins"],
                         )
                     elif vdf[elem].nunique() > self.parameters["cat_topk"] and not (
                         vdf[elem].isnum()

diff --git a/verticapy/plot.py b/verticapy/plot.py
@@ -808,13 +808,13 @@ def bar(
     method: str = "density",
     of=None,
     max_cardinality: int = 6,
-    bins: int = 0,
+    nbins: int = 0,
     h: float = 0,
     ax=None,
     **style_kwds,
 ):
     x, y, z, h, is_categorical = compute_plot_variables(
-        vdf, method=method, of=of, max_cardinality=max_cardinality, bins=bins, h=h
+        vdf, method=method, of=of, max_cardinality=max_cardinality, nbins=nbins, h=h
     )
     if not (ax):
         fig, ax = plt.subplots()
@@ -1852,7 +1852,7 @@ def compute_plot_variables(
     method: str = "density",
     of: str = "",
     max_cardinality: int = 6,
-    bins: int = 0,
+    nbins: int = 0,
     h: float = 0,
     pie: bool = False,
 ):
@@ -1974,16 +1974,16 @@ def compute_plot_variables(
         is_categorical = True
     # case when date
     elif is_date:
-        if (h <= 0) and (bins <= 0):
+        if (h <= 0) and (nbins <= 0):
             h = vdf.numh()
-        elif bins > 0:
+        elif nbins > 0:
             query = "SELECT DATEDIFF('second', MIN({}), MAX({})) FROM ".format(
                 vdf.alias, vdf.alias
             )
             query_result = executeSQL(
                 query=query, title="Computing the histogram interval", method="fetchrow"
             )
-            h = float(query_result[0]) / bins
+            h = float(query_result[0]) / nbins
         min_date = vdf.min()
         converted_date = "DATEDIFF('second', '{}', {})".format(min_date, vdf.alias)
         query = "SELECT FLOOR({} / {}) * {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY 1 ORDER BY 1".format(
@@ -2013,10 +2013,10 @@ def compute_plot_variables(
         is_categorical = True
     # case when numerical
     else:
-        if (h <= 0) and (bins <= 0):
+        if (h <= 0) and (nbins <= 0):
             h = vdf.numh()
-        elif bins > 0:
-            h = float(vdf.max() - vdf.min()) / bins
+        elif nbins > 0:
+            h = float(vdf.max() - vdf.min()) / nbins
         if (vdf.ctype == "int") or (h == 0):
             h = max(1.0, h)
         query = "SELECT FLOOR({} / {}) * {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY 1 ORDER BY 1"
@@ -2212,13 +2212,13 @@ def hist(
     method: str = "density",
     of=None,
     max_cardinality: int = 6,
-    bins: int = 0,
+    nbins: int = 0,
     h: float = 0,
     ax=None,
     **style_kwds,
 ):
     x, y, z, h, is_categorical = compute_plot_variables(
-        vdf, method, of, max_cardinality, bins, h
+        vdf, method, of, max_cardinality, nbins, h
     )
     is_numeric = vdf.isnum()
     if not (ax):

diff --git a/verticapy/tests/vDataFrame/test_vDF_plot.py b/verticapy/tests/vDataFrame/test_vDF_plot.py
@@ -219,8 +219,8 @@ def test_vDF_bar(self, titanic_vd, amazon_vd):
         assert result.get_yticks()[1] == pytest.approx(44705828.571428575)
         plt.close("all")
 
-        # method=sum of=survived and bins=5
-        result2 = titanic_vd["fare"].bar(method="sum", of="survived", bins=5, color="b")
+        # method=sum of=survived and nbins=5
+        result2 = titanic_vd["fare"].bar(method="sum", of="survived", nbins=5, color="b")
         assert result2.get_default_bbox_extra_artists()[0].get_width() == pytest.approx(
             391
         )

diff --git a/verticapy/tests/vDataFrame/test_vDF_preprocessing.py b/verticapy/tests/vDataFrame/test_vDF_preprocessing.py
@@ -106,14 +106,14 @@ def test_vDF_discretize(self, titanic_vd):
 
         # expected exception
         with pytest.raises(AssertionError) as exception_info:
-            titanic_copy["age"].discretize(method="same_freq", bins=1)
+            titanic_copy["age"].discretize(method="same_freq", nbins=1)
         # checking the error message
         assert exception_info.match(
-            "Parameter 'bins' must be greater or equals to 2 in case "
+            "Parameter 'nbins' must be greater or equals to 2 in case "
             "of discretization using the method 'same_freq'"
         )
 
-        titanic_copy["age"].discretize(method="same_freq", bins=5)
+        titanic_copy["age"].discretize(method="same_freq", nbins=5)
         assert titanic_copy["age"].distinct() == [
             "[0.330;19.000]",
             "[19.000;25.000]",
@@ -129,7 +129,7 @@ def test_vDF_discretize(self, titanic_vd):
         titanic_copy["age"].discretize(
             method="smart",
             response="survived",
-            bins=6,
+            nbins=6,
             RFmodel_params={"n_estimators": 100, "nbins": 100},
         )
         assert len(titanic_copy["age"].distinct()) == 6

diff --git a/verticapy/utilities.py b/verticapy/utilities.py
@@ -207,7 +207,7 @@ def drop(name: str = "", method: str = "auto", raise_error: bool = False, **kwds
 name: str, optional
     Relation name. If empty, it will drop all VerticaPy temporary 
     elements.
-method: str, optional
+method / relation_type: str, optional
     Method used to drop.
         auto   : identifies the table/view/index/model to drop. 
                  It will never drop an entire schema unless the 
@@ -226,6 +226,8 @@ def drop(name: str = "", method: str = "auto", raise_error: bool = False, **kwds
 bool
     True if the relation was dropped, False otherwise.
     """
+    if "relation_type" in kwds and method == "auto":
+        method = kwds["relation_type"]
     if isinstance(method, str):
         method = method.lower()
     check_types(

diff --git a/verticapy/vcolumn.py b/verticapy/vcolumn.py
@@ -611,7 +611,7 @@ def bar(
         method: str = "density",
         of: str = "",
         max_cardinality: int = 6,
-        bins: int = 0,
+        nbins: int = 0,
         h: float = 0,
         ax=None,
         **style_kwds,
@@ -637,8 +637,8 @@ def bar(
 	max_cardinality: int, optional
  		Maximum number of the vColumn distinct elements to be used as categorical 
  		(No h will be picked or computed)
- 	bins: int, optional
- 		Number of bins. If empty, an optimized number of bins will be computed.
+ 	nbins: int, optional
+ 		Number of bins. If empty, an optimized number of bins will be computed.
  	h: float, optional
  		Interval width of the bar. If empty, an optimized h will be computed.
     ax: Matplotlib axes object, optional
@@ -660,7 +660,7 @@ def bar(
                 ("method", method, [str]),
                 ("of", of, [str]),
                 ("max_cardinality", max_cardinality, [int, float]),
-                ("bins", bins, [int, float]),
+                ("nbins", nbins, [int, float]),
                 ("h", h, [int, float]),
             ]
         )
@@ -669,7 +669,7 @@ def bar(
             of = self.parent.format_colnames(of)
         from verticapy.plot import bar
 
-        return bar(self, method, of, max_cardinality, bins, h, ax=ax, **style_kwds)
+        return bar(self, method, of, max_cardinality, nbins, h, ax=ax, **style_kwds)
 
     # ---#
     def boxplot(
@@ -1251,7 +1251,7 @@ def discretize(
         self,
         method: str = "auto",
         h: float = 0,
-        bins: int = -1,
+        nbins: int = -1,
         k: int = 6,
         new_category: str = "Others",
         RFmodel_params: dict = {},
@@ -1277,7 +1277,7 @@ def discretize(
  	h: float, optional
  		The interval size to convert to use to convert the vColumn. If this parameter 
  		is equal to 0, an optimised interval will be computed.
- 	bins: int, optional
+ 	nbins: int, optional
  		Number of bins used for the discretization (must be > 1)
  	k: int, optional
  		The integer k of the 'topk' method.
@@ -1314,7 +1314,7 @@ def discretize(
                 ("return_enum_trans", return_enum_trans, [bool]),
                 ("h", h, [int, float]),
                 ("response", response, [str]),
-                ("bins", bins, [int, float]),
+                ("nbins", nbins, [int, float]),
                 (
                     "method",
                     method,
@@ -1330,8 +1330,8 @@ def discretize(
                 schema = "public"
             tmp_view_name = gen_tmp_name(schema=schema, name="view")
             tmp_model_name = gen_tmp_name(schema=schema, name="model")
-            assert bins >= 2, ParameterError(
-                "Parameter 'bins' must be greater or equals to 2 in case of discretization using the method 'smart'."
+            assert nbins >= 2, ParameterError(
+                "Parameter 'nbins' must be greater or equals to 2 in case of discretization using the method 'smart'."
             )
             assert response, ParameterError(
                 "Parameter 'response' can not be empty in case of discretization using the method 'smart'."
@@ -1362,11 +1362,11 @@ def discretize(
                     for i in range(parameters["n_estimators"])
                 ]
                 query = "SELECT split_value FROM (SELECT split_value, MAX(weighted_information_gain) FROM ({}) VERTICAPY_SUBTABLE WHERE split_value IS NOT NULL GROUP BY 1 ORDER BY 2 DESC LIMIT {}) VERTICAPY_SUBTABLE ORDER BY split_value::float".format(
-                    " UNION ALL ".join(query), bins - 1
+                    " UNION ALL ".join(query), nbins - 1
                 )
                 result = executeSQL(
                     query=query,
-                    title="Computing the optimized histogram bins using Random Forest.",
+                    title="Computing the optimized histogram nbins using Random Forest.",
                     method="fetchall",
                 )
                 result = [elem[0] for elem in result]
@@ -1398,16 +1398,16 @@ def discretize(
                 "text",
             )
         elif self.isnum() and method == "same_freq":
-            assert bins >= 2, ParameterError(
-                "Parameter 'bins' must be greater or equals to 2 in case of discretization using the method 'same_freq'"
+            assert nbins >= 2, ParameterError(
+                "Parameter 'nbins' must be greater or equals to 2 in case of discretization using the method 'same_freq'"
             )
             count = self.count()
-            nb = int(float(count / int(bins)))
+            nb = int(float(count / int(nbins)))
             assert nb != 0, Exception(
                 "Not enough values to compute the Equal Frequency discretization"
             )
             total, query, nth_elems = nb, [], []
-            while total < int(float(count / int(bins))) * int(bins):
+            while total < int(float(count / int(nbins))) * int(nbins):
                 nth_elems += [str(total)]
                 total += nb
             where = "WHERE _verticapy_row_nb_ IN ({})".format(
@@ -1429,10 +1429,10 @@ def discretize(
             result = [elem[0] for elem in result]
         elif self.isnum() and method in ("same_width", "auto"):
             if not (h) or h <= 0:
-                if bins <= 0:
+                if nbins <= 0:
                     h = self.numh()
                 else:
-                    h = (self.max() - self.min()) * 1.01 / bins
+                    h = (self.max() - self.min()) * 1.01 / nbins
                 if h > 0.01:
                     h = round(h, 2)
                 elif h > 0.0001:
@@ -2176,7 +2176,7 @@ def hist(
         method: str = "density",
         of: str = "",
         max_cardinality: int = 6,
-        bins: int = 0,
+        nbins: int = 0,
         h: float = 0,
         ax=None,
         **style_kwds,
@@ -2202,7 +2202,7 @@ def hist(
 	max_cardinality: int, optional
  		Maximum number of the vColumn distinct elements to be used as categorical 
  		(No h will be picked or computed)
- 	bins: int, optional
+ 	nbins: int, optional
  		Number of bins. If empty, an optimized number of bins will be computed.
  	h: float, optional
  		Interval width of the bar. If empty, an optimized h will be computed.
@@ -2226,15 +2226,15 @@ def hist(
                 ("of", of, [str]),
                 ("max_cardinality", max_cardinality, [int, float]),
                 ("h", h, [int, float]),
-                ("bins", bins, [int, float]),
+                ("nbins", nbins, [int, float]),
             ]
         )
         if of:
             self.parent.are_namecols_in(of)
             of = self.parent.format_colnames(of)
         from verticapy.plot import hist
 
-        return hist(self, method, of, max_cardinality, bins, h, ax=ax, **style_kwds)
+        return hist(self, method, of, max_cardinality, nbins, h, ax=ax, **style_kwds)
 
     # ---#
     def iloc(self, limit: int = 5, offset: int = 0):
@@ -2365,7 +2365,7 @@ def isnum(self):
         return self.category() in ("float", "int")
 
     # ---#
-    def iv_woe(self, y: str, bins: int = 10):
+    def iv_woe(self, y: str, nbins: int = 10):
         """
     ---------------------------------------------------------------------------
     Computes the Information Value (IV) / Weight Of Evidence (WOE) Table. It tells 
@@ -2376,8 +2376,8 @@ def iv_woe(self, y: str, bins: int = 10):
     ----------
     y: str
         Response vColumn.
-    bins: int, optional
-        Maximum number of bins used for the discretization (must be > 1)
+    nbins: int, optional
+        Maximum number of bins used for the discretization (must be > 1)
 
     Returns
     -------
@@ -2389,7 +2389,7 @@ def iv_woe(self, y: str, bins: int = 10):
     --------
     vDataFrame.iv_woe : Computes the Information Value (IV) Table.
         """
-        check_types([("y", y, [str]), ("bins", bins, [int])])
+        check_types([("y", y, [str]), ("nbins", nbins, [int])])
         self.parent.are_namecols_in(y)
         y = self.parent.format_colnames(y)
         assert self.parent[y].nunique() == 2, TypeError(
@@ -2403,8 +2403,8 @@ def iv_woe(self, y: str, bins: int = 10):
         self.parent[y].distinct()
         trans = self.discretize(
             method="same_width" if self.isnum() else "topk",
-            bins=bins,
-            k=bins,
+            nbins=nbins,
+            k=nbins,
             new_category="Others",
             return_enum_trans=True,
         )[0].replace("{}", self.alias)