Skip to content

Commit

Permalink
Unifying the API Parameters name (#323)
Browse files Browse the repository at this point in the history
* - unifying the API with nbins instead of bins + adding relation_type in drop

* nbins
  • Loading branch information
oualib authored May 6, 2022
1 parent b4aae69 commit 7e674b9
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 60 deletions.
14 changes: 7 additions & 7 deletions verticapy/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,13 +200,13 @@ def gen_meshgrid(features_ranges: dict):

for idx, param in enumerate(features_ranges):

bins = 100
nbins = 100
if "nbins" in features_ranges[param]:
bins = features_ranges[param]["nbins"]
nbins = features_ranges[param]["nbins"]
ts_table = (
f"(SELECT DAY(tm - '03-11-1993'::TIMESTAMP) AS tm FROM "
"(SELECT '03-11-1993'::TIMESTAMP AS t UNION ALL SELECT"
f" '03-11-1993'::TIMESTAMP + INTERVAL '{bins} days' AS t)"
f" '03-11-1993'::TIMESTAMP + INTERVAL '{nbins} days' AS t)"
" x TIMESERIES tm AS '1 day' OVER(ORDER BY t)) y"
)

Expand All @@ -226,7 +226,7 @@ def gen_meshgrid(features_ranges: dict):
elif features_ranges[param]["type"] == float:
val = features_ranges[param]["range"]
lower, upper = val[0], val[1]
h = (upper - lower) / bins
h = (upper - lower) / nbins
sql += [
f'(SELECT ({lower} + {h} * tm)::FLOAT AS "{param}" '
f"FROM {ts_table}) x{idx}"
Expand All @@ -235,7 +235,7 @@ def gen_meshgrid(features_ranges: dict):
elif features_ranges[param]["type"] == int:
val = features_ranges[param]["range"]
lower, upper = val[0], val[1]
h = (upper - lower) / bins
h = (upper - lower) / nbins
sql += [
f'(SELECT ({lower} + {h} * tm)::INT AS "{param}" '
f"FROM {ts_table}) x{idx}"
Expand All @@ -244,7 +244,7 @@ def gen_meshgrid(features_ranges: dict):
elif features_ranges[param]["type"] == datetime.date:
val = features_ranges[param]["range"]
start_date, number_of_days = val[0], val[1]
h = number_of_days / bins
h = number_of_days / nbins
sql += [
f"(SELECT ('{start_date}'::DATE + {h} * tm)::DATE"
f' AS "{param}" FROM {ts_table}) x{idx}'
Expand All @@ -253,7 +253,7 @@ def gen_meshgrid(features_ranges: dict):
elif features_ranges[param]["type"] == datetime.datetime:
val = features_ranges[param]["range"]
start_date, number_of_days = val[0], val[1]
h = number_of_days / bins
h = number_of_days / nbins
sql += [
f"(SELECT ('{start_date}'::DATE + {h} * tm)::TIMESTAMP "
f'AS "{param}" FROM {ts_table}) x{idx}'
Expand Down
2 changes: 1 addition & 1 deletion verticapy/learn/delphi.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ def fit(
):
vdf[elem].discretize(
method=self.parameters["num_method"],
bins=self.parameters["nbins"],
nbins=self.parameters["nbins"],
)
elif vdf[elem].nunique() > self.parameters["cat_topk"] and not (
vdf[elem].isnum()
Expand Down
22 changes: 11 additions & 11 deletions verticapy/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,13 +808,13 @@ def bar(
method: str = "density",
of=None,
max_cardinality: int = 6,
bins: int = 0,
nbins: int = 0,
h: float = 0,
ax=None,
**style_kwds,
):
x, y, z, h, is_categorical = compute_plot_variables(
vdf, method=method, of=of, max_cardinality=max_cardinality, bins=bins, h=h
vdf, method=method, of=of, max_cardinality=max_cardinality, nbins=nbins, h=h
)
if not (ax):
fig, ax = plt.subplots()
Expand Down Expand Up @@ -1852,7 +1852,7 @@ def compute_plot_variables(
method: str = "density",
of: str = "",
max_cardinality: int = 6,
bins: int = 0,
nbins: int = 0,
h: float = 0,
pie: bool = False,
):
Expand Down Expand Up @@ -1974,16 +1974,16 @@ def compute_plot_variables(
is_categorical = True
# case when date
elif is_date:
if (h <= 0) and (bins <= 0):
if (h <= 0) and (nbins <= 0):
h = vdf.numh()
elif bins > 0:
elif nbins > 0:
query = "SELECT DATEDIFF('second', MIN({}), MAX({})) FROM ".format(
vdf.alias, vdf.alias
)
query_result = executeSQL(
query=query, title="Computing the histogram interval", method="fetchrow"
)
h = float(query_result[0]) / bins
h = float(query_result[0]) / nbins
min_date = vdf.min()
converted_date = "DATEDIFF('second', '{}', {})".format(min_date, vdf.alias)
query = "SELECT FLOOR({} / {}) * {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY 1 ORDER BY 1".format(
Expand Down Expand Up @@ -2013,10 +2013,10 @@ def compute_plot_variables(
is_categorical = True
# case when numerical
else:
if (h <= 0) and (bins <= 0):
if (h <= 0) and (nbins <= 0):
h = vdf.numh()
elif bins > 0:
h = float(vdf.max() - vdf.min()) / bins
elif nbins > 0:
h = float(vdf.max() - vdf.min()) / nbins
if (vdf.ctype == "int") or (h == 0):
h = max(1.0, h)
query = "SELECT FLOOR({} / {}) * {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY 1 ORDER BY 1"
Expand Down Expand Up @@ -2212,13 +2212,13 @@ def hist(
method: str = "density",
of=None,
max_cardinality: int = 6,
bins: int = 0,
nbins: int = 0,
h: float = 0,
ax=None,
**style_kwds,
):
x, y, z, h, is_categorical = compute_plot_variables(
vdf, method, of, max_cardinality, bins, h
vdf, method, of, max_cardinality, nbins, h
)
is_numeric = vdf.isnum()
if not (ax):
Expand Down
4 changes: 2 additions & 2 deletions verticapy/tests/vDataFrame/test_vDF_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,8 @@ def test_vDF_bar(self, titanic_vd, amazon_vd):
assert result.get_yticks()[1] == pytest.approx(44705828.571428575)
plt.close("all")

# method=sum of=survived and bins=5
result2 = titanic_vd["fare"].bar(method="sum", of="survived", bins=5, color="b")
# method=sum of=survived and nbins=5
result2 = titanic_vd["fare"].bar(method="sum", of="survived", nbins=5, color="b")
assert result2.get_default_bbox_extra_artists()[0].get_width() == pytest.approx(
391
)
Expand Down
8 changes: 4 additions & 4 deletions verticapy/tests/vDataFrame/test_vDF_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,14 @@ def test_vDF_discretize(self, titanic_vd):

# expected exception
with pytest.raises(AssertionError) as exception_info:
titanic_copy["age"].discretize(method="same_freq", bins=1)
titanic_copy["age"].discretize(method="same_freq", nbins=1)
# checking the error message
assert exception_info.match(
"Parameter 'bins' must be greater or equals to 2 in case "
"Parameter 'nbins' must be greater or equals to 2 in case "
"of discretization using the method 'same_freq'"
)

titanic_copy["age"].discretize(method="same_freq", bins=5)
titanic_copy["age"].discretize(method="same_freq", nbins=5)
assert titanic_copy["age"].distinct() == [
"[0.330;19.000]",
"[19.000;25.000]",
Expand All @@ -129,7 +129,7 @@ def test_vDF_discretize(self, titanic_vd):
titanic_copy["age"].discretize(
method="smart",
response="survived",
bins=6,
nbins=6,
RFmodel_params={"n_estimators": 100, "nbins": 100},
)
assert len(titanic_copy["age"].distinct()) == 6
Expand Down
4 changes: 3 additions & 1 deletion verticapy/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def drop(name: str = "", method: str = "auto", raise_error: bool = False, **kwds
name: str, optional
Relation name. If empty, it will drop all VerticaPy temporary
elements.
method: str, optional
method / relation_type: str, optional
Method used to drop.
auto : identifies the table/view/index/model to drop.
It will never drop an entire schema unless the
Expand All @@ -226,6 +226,8 @@ def drop(name: str = "", method: str = "auto", raise_error: bool = False, **kwds
bool
True if the relation was dropped, False otherwise.
"""
if "relation_type" in kwds and method == "auto":
method = kwds["relation_type"]
if isinstance(method, str):
method = method.lower()
check_types(
Expand Down
56 changes: 28 additions & 28 deletions verticapy/vcolumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ def bar(
method: str = "density",
of: str = "",
max_cardinality: int = 6,
bins: int = 0,
nbins: int = 0,
h: float = 0,
ax=None,
**style_kwds,
Expand All @@ -637,8 +637,8 @@ def bar(
max_cardinality: int, optional
Maximum number of the vColumn distinct elements to be used as categorical
(No h will be picked or computed)
bins: int, optional
Number of bins. If empty, an optimized number of bins will be computed.
nbins: int, optional
Number of bins. If empty, an optimized number of bins will be computed.
h: float, optional
Interval width of the bar. If empty, an optimized h will be computed.
ax: Matplotlib axes object, optional
Expand All @@ -660,7 +660,7 @@ def bar(
("method", method, [str]),
("of", of, [str]),
("max_cardinality", max_cardinality, [int, float]),
("bins", bins, [int, float]),
("nbins", nbins, [int, float]),
("h", h, [int, float]),
]
)
Expand All @@ -669,7 +669,7 @@ def bar(
of = self.parent.format_colnames(of)
from verticapy.plot import bar

return bar(self, method, of, max_cardinality, bins, h, ax=ax, **style_kwds)
return bar(self, method, of, max_cardinality, nbins, h, ax=ax, **style_kwds)

# ---#
def boxplot(
Expand Down Expand Up @@ -1251,7 +1251,7 @@ def discretize(
self,
method: str = "auto",
h: float = 0,
bins: int = -1,
nbins: int = -1,
k: int = 6,
new_category: str = "Others",
RFmodel_params: dict = {},
Expand All @@ -1277,7 +1277,7 @@ def discretize(
h: float, optional
The interval size used to convert the vColumn. If this parameter
is equal to 0, an optimised interval will be computed.
bins: int, optional
nbins: int, optional
Number of bins used for the discretization (must be > 1)
k: int, optional
The integer k of the 'topk' method.
Expand Down Expand Up @@ -1314,7 +1314,7 @@ def discretize(
("return_enum_trans", return_enum_trans, [bool]),
("h", h, [int, float]),
("response", response, [str]),
("bins", bins, [int, float]),
("nbins", nbins, [int, float]),
(
"method",
method,
Expand All @@ -1330,8 +1330,8 @@ def discretize(
schema = "public"
tmp_view_name = gen_tmp_name(schema=schema, name="view")
tmp_model_name = gen_tmp_name(schema=schema, name="model")
assert bins >= 2, ParameterError(
"Parameter 'bins' must be greater or equals to 2 in case of discretization using the method 'smart'."
assert nbins >= 2, ParameterError(
"Parameter 'nbins' must be greater or equals to 2 in case of discretization using the method 'smart'."
)
assert response, ParameterError(
"Parameter 'response' can not be empty in case of discretization using the method 'smart'."
Expand Down Expand Up @@ -1362,11 +1362,11 @@ def discretize(
for i in range(parameters["n_estimators"])
]
query = "SELECT split_value FROM (SELECT split_value, MAX(weighted_information_gain) FROM ({}) VERTICAPY_SUBTABLE WHERE split_value IS NOT NULL GROUP BY 1 ORDER BY 2 DESC LIMIT {}) VERTICAPY_SUBTABLE ORDER BY split_value::float".format(
" UNION ALL ".join(query), bins - 1
" UNION ALL ".join(query), nbins - 1
)
result = executeSQL(
query=query,
title="Computing the optimized histogram bins using Random Forest.",
title="Computing the optimized histogram nbins using Random Forest.",
method="fetchall",
)
result = [elem[0] for elem in result]
Expand Down Expand Up @@ -1398,16 +1398,16 @@ def discretize(
"text",
)
elif self.isnum() and method == "same_freq":
assert bins >= 2, ParameterError(
"Parameter 'bins' must be greater or equals to 2 in case of discretization using the method 'same_freq'"
assert nbins >= 2, ParameterError(
"Parameter 'nbins' must be greater or equals to 2 in case of discretization using the method 'same_freq'"
)
count = self.count()
nb = int(float(count / int(bins)))
nb = int(float(count / int(nbins)))
assert nb != 0, Exception(
"Not enough values to compute the Equal Frequency discretization"
)
total, query, nth_elems = nb, [], []
while total < int(float(count / int(bins))) * int(bins):
while total < int(float(count / int(nbins))) * int(nbins):
nth_elems += [str(total)]
total += nb
where = "WHERE _verticapy_row_nb_ IN ({})".format(
Expand All @@ -1429,10 +1429,10 @@ def discretize(
result = [elem[0] for elem in result]
elif self.isnum() and method in ("same_width", "auto"):
if not (h) or h <= 0:
if bins <= 0:
if nbins <= 0:
h = self.numh()
else:
h = (self.max() - self.min()) * 1.01 / bins
h = (self.max() - self.min()) * 1.01 / nbins
if h > 0.01:
h = round(h, 2)
elif h > 0.0001:
Expand Down Expand Up @@ -2176,7 +2176,7 @@ def hist(
method: str = "density",
of: str = "",
max_cardinality: int = 6,
bins: int = 0,
nbins: int = 0,
h: float = 0,
ax=None,
**style_kwds,
Expand All @@ -2202,7 +2202,7 @@ def hist(
max_cardinality: int, optional
Maximum number of the vColumn distinct elements to be used as categorical
(No h will be picked or computed)
bins: int, optional
nbins: int, optional
Number of bins. If empty, an optimized number of bins will be computed.
h: float, optional
Interval width of the bar. If empty, an optimized h will be computed.
Expand All @@ -2226,15 +2226,15 @@ def hist(
("of", of, [str]),
("max_cardinality", max_cardinality, [int, float]),
("h", h, [int, float]),
("bins", bins, [int, float]),
("nbins", nbins, [int, float]),
]
)
if of:
self.parent.are_namecols_in(of)
of = self.parent.format_colnames(of)
from verticapy.plot import hist

return hist(self, method, of, max_cardinality, bins, h, ax=ax, **style_kwds)
return hist(self, method, of, max_cardinality, nbins, h, ax=ax, **style_kwds)

# ---#
def iloc(self, limit: int = 5, offset: int = 0):
Expand Down Expand Up @@ -2365,7 +2365,7 @@ def isnum(self):
return self.category() in ("float", "int")

# ---#
def iv_woe(self, y: str, bins: int = 10):
def iv_woe(self, y: str, nbins: int = 10):
"""
---------------------------------------------------------------------------
Computes the Information Value (IV) / Weight Of Evidence (WOE) Table. It tells
Expand All @@ -2376,8 +2376,8 @@ def iv_woe(self, y: str, bins: int = 10):
----------
y: str
Response vColumn.
bins: int, optional
Maximum number of bins used for the discretization (must be > 1)
nbins: int, optional
Maximum number of bins used for the discretization (must be > 1)
Returns
-------
Expand All @@ -2389,7 +2389,7 @@ def iv_woe(self, y: str, bins: int = 10):
--------
vDataFrame.iv_woe : Computes the Information Value (IV) Table.
"""
check_types([("y", y, [str]), ("bins", bins, [int])])
check_types([("y", y, [str]), ("nbins", nbins, [int])])
self.parent.are_namecols_in(y)
y = self.parent.format_colnames(y)
assert self.parent[y].nunique() == 2, TypeError(
Expand All @@ -2403,8 +2403,8 @@ def iv_woe(self, y: str, bins: int = 10):
self.parent[y].distinct()
trans = self.discretize(
method="same_width" if self.isnum() else "topk",
bins=bins,
k=bins,
nbins=nbins,
k=nbins,
new_category="Others",
return_enum_trans=True,
)[0].replace("{}", self.alias)
Expand Down
Loading

0 comments on commit 7e674b9

Please sign in to comment.