diff --git a/verticapy/machine_learning/memmodel/naive_bayes.py b/verticapy/machine_learning/memmodel/naive_bayes.py index 56519dc56..e497e5df4 100755 --- a/verticapy/machine_learning/memmodel/naive_bayes.py +++ b/verticapy/machine_learning/memmodel/naive_bayes.py @@ -84,7 +84,7 @@ class NaiveBayes(MulticlassClassifier): 'male': 0.583333333333333}, 'S': {'female': 0.311212814645309, 'male': 0.688787185354691}} - + prior: ArrayLike The model's classes probabilities. classes: ArrayLike diff --git a/verticapy/tests_new/__init__.py b/verticapy/tests_new/__init__.py index 62eb0aeaa..8b80cd90b 100755 --- a/verticapy/tests_new/__init__.py +++ b/verticapy/tests_new/__init__.py @@ -14,3 +14,74 @@ See the License for the specific language governing permissions and limitations under the License. """ +from collections import namedtuple + +AggregateFun = namedtuple("AggregateFun", ["vpy", "py"]) +functions = { + "aad": [ + "vpy_data.aad()", + "np.absolute(py_data - py_data.mean(numeric_only=True)).mean(numeric_only=True)", + ], + "count": ["vpy_data.count()", "py_data.count()"], + "cvar": [ + "vpy_data.cvar()", + "py_data[py_data >= py_data.quantile(0.95, numeric_only=True)].mean(numeric_only=True)", + ], + "iqr": [ + "vpy_data.iqr()", + "py_data.quantile(0.75, numeric_only=True) - py_data.quantile(0.25, numeric_only=True)", + ], + "kurt": ["vpy_data.kurt()", "py_data.kurt(numeric_only=True)"], + "kurtosis": [ + "vpy_data.kurtosis()", + "py_data.kurtosis(numeric_only=True)", + ], + "jb": ["vpy_data.jb()", "jarque_bera(py_data, nan_policy='omit').statistic"], + "mad": [ + "vpy_data.mad()", + "median_abs_deviation(py_data, nan_policy='omit')", + ], + "max": ["vpy_data.max()", "py_data.max(numeric_only=True)"], + "mean": ["vpy_data.mean()", "py_data.mean(numeric_only=True)"], + "avg": ["vpy_data.avg()", "py_data.mean(numeric_only=True)"], + "median": ["vpy_data.median()", "py_data.median(numeric_only=True)"], + "min": ["vpy_data.min()", "py_data.min(numeric_only=True)"], + "mode": 
["vpy_data.mode()", "py_data.mode(numeric_only=True, dropna=False).values"], + "percent": ["vpy_data.percent()", "py_data.count()/len(py_data)*100"], + "quantile": [ + "vpy_data.quantile(q=[0.2, 0.5])", + "py_data.quantile(q=[0.2, 0.5],numeric_only=True).values", + ], + "10%": ["vpy_data.q10()", "py_data.quantile(0.1, numeric_only=True)"], + "90%": ["vpy_data.q90()", "py_data.quantile(0.9, numeric_only=True)"], + "prod": ["vpy_data.prod()", "py_data.prod(numeric_only=True)"], + "product": ["vpy_data.product()", "py_data.product(numeric_only=True)"], + "range": [ + "vpy_data.range()", + "py_data.max(numeric_only=True) - py_data.min(numeric_only=True)", + ], + "sem": ["vpy_data.sem()", "py_data.sem(numeric_only=True)"], + "skew": ["vpy_data.skew()", "py_data.skew(numeric_only=True)"], + "skewness": ["vpy_data.skewness()", "py_data.skew(numeric_only=True)"], + "sum": ["vpy_data.sum()", "py_data.sum(numeric_only=True)"], + "std": ["vpy_data.std()", "py_data.std(numeric_only=True)"], + "stddev": ["vpy_data.stddev()", "py_data.std(numeric_only=True)"], + "topk": ["vpy_data.topk(k=3)", "py_data.value_counts(dropna=False)"], + "top1": ["vpy_data.topk(k=1)", "py_data.value_counts(dropna=False).index[0]"], + "top1_percent": [ + "vpy_data.top1_percent()", + "py_data.value_counts(dropna=False).iloc[0]/len(py_data)*100", + ], + "nunique": ["vpy_data.nunique(approx=False)", "py_data.nunique()"], + "unique": ["vpy_data.nunique(approx=False)", "py_data.nunique()"], + "var": ["vpy_data.var()", "py_data.var(numeric_only=True)"], + "variance": ["vpy_data.variance()", "py_data.var(numeric_only=True)"], + "value_counts": [ + "vpy_data.value_counts()", + "py_data.value_counts(dropna=False)", + ], + "distinct": [ + "vpy_data.distinct()", + "py_data.unique()", + ], +} diff --git a/verticapy/tests_new/core/vdataframe/test_agg.py b/verticapy/tests_new/core/vdataframe/test_agg.py index 8eb312f63..1c4b1d081 100644 --- a/verticapy/tests_new/core/vdataframe/test_agg.py +++ 
b/verticapy/tests_new/core/vdataframe/test_agg.py @@ -14,84 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. """ -from collections import namedtuple from contextlib import nullcontext as does_not_raise import pytest import numpy as np from scipy.stats import median_abs_deviation, jarque_bera from verticapy.errors import MissingColumn -import verticapy as vp from verticapy.tests_new.core.vdataframe import REL_TOLERANCE, ABS_TOLERANCE - -AggregateFun = namedtuple("AggregateFun", ["vpy", "py"]) -functions = { - "aad": [ - "vpy_data.aad()", - "np.absolute(py_data - py_data.mean(numeric_only=True)).mean(numeric_only=True)", - ], - "count": ["vpy_data.count()", "py_data.count()"], - "cvar": [ - "vpy_data.cvar()", - "py_data[py_data >= py_data.quantile(0.95, numeric_only=True)].mean(numeric_only=True)", - ], - "iqr": [ - "vpy_data.iqr()", - "py_data.quantile(0.75, numeric_only=True) - py_data.quantile(0.25, numeric_only=True)", - ], - "kurt": ["vpy_data.kurt()", "py_data.kurt(numeric_only=True)"], - "kurtosis": [ - "vpy_data.kurtosis()", - "py_data.kurtosis(numeric_only=True)", - ], - "jb": ["vpy_data.jb()", "jarque_bera(py_data, nan_policy='omit').statistic"], - "mad": [ - "vpy_data.mad()", - "median_abs_deviation(py_data, nan_policy='omit')", - ], - "max": ["vpy_data.max()", "py_data.max(numeric_only=True)"], - "mean": ["vpy_data.mean()", "py_data.mean(numeric_only=True)"], - "avg": ["vpy_data.avg()", "py_data.mean(numeric_only=True)"], - "median": ["vpy_data.median()", "py_data.median(numeric_only=True)"], - "min": ["vpy_data.min()", "py_data.min(numeric_only=True)"], - "mode": ["vpy_data.mode()", "py_data.mode(numeric_only=True, dropna=False).values"], - "percent": ["vpy_data.percent()", "py_data.count()/len(py_data)*100"], - "quantile": [ - "vpy_data.quantile(q=[0.2, 0.5])", - "py_data.quantile(q=[0.2, 0.5],numeric_only=True).values", - ], - "10%": ["vpy_data.q10()", "py_data.quantile(0.1, 
numeric_only=True)"], - "90%": ["vpy_data.q90", "py_data.quantile(0.9, numeric_only=True)"], - "prod": ["vpy_data.prod()", "py_data.prod(numeric_only=True)"], - "product": ["vpy_data.product()", "py_data.product(numeric_only=True)"], - "range": [ - "vpy_data.range()", - "py_data.max(numeric_only=True) - py_data.min(numeric_only=True)", - ], - "sem": ["vpy_data.sem()", "py_data.sem(numeric_only=True)"], - "skew": ["vpy_data.skew()", "py_data.skew(numeric_only=True)"], - "skewness": ["vpy_data.skewness()", "py_data.skew(numeric_only=True)"], - "sum": ["vpy_data.sum()", "py_data.sum(numeric_only=True)"], - "std": ["vpy_data.std()", "py_data.std(numeric_only=True)"], - "stddev": ["vpy_data.stddev()", "py_data.std(numeric_only=True)"], - "topk": ["vpy_data.topk(k=3)", "py_data.value_counts(dropna=False)"], - "top1": ["vpy_data.topk(k=1)", "py_data.value_counts(dropna=False).index[0]"], - "top1_percent": [ - "vpy_data.top1_percent()", - "py_data.value_counts(dropna=False).iloc[0]/len(py_data)*100", - ], - "nunique": ["vpy_data.nunique(approx=False)", "py_data.nunique()"], - "unique": ["vpy_data.nunique(approx=False)", "py_data.nunique()"], - "var": ["vpy_data.var()", "py_data.var(numeric_only=True)"], - "variance": ["vpy_data.variance()", "py_data.var(numeric_only=True)"], - "value_counts": [ - "vpy_data.value_counts()", - "py_data.value_counts(dropna=False)", - ], - "distinct": [ - "vpy_data.distinct()", - "py_data.unique()", - ], -} +from verticapy.tests_new import functions, AggregateFun class TestAgg: diff --git a/verticapy/tests_new/core/vdataframe/test_math.py b/verticapy/tests_new/core/vdataframe/test_math.py new file mode 100644 index 000000000..df02afcb8 --- /dev/null +++ b/verticapy/tests_new/core/vdataframe/test_math.py @@ -0,0 +1,885 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. 
+ +You may obtain a copy of the License at: +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in +writing, software distributed under the License is +distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing +permissions and limitations under the License. +""" +import re +import datetime +import pandas as pd +import numpy as np # pylint: disable=unused-import +from scipy.stats import ( + median_abs_deviation, + jarque_bera, +) # pylint: disable=unused-import +import pytest +from verticapy.core.tablesample.base import TableSample +from verticapy.tests_new.core.vdataframe import REL_TOLERANCE, ABS_TOLERANCE +from verticapy.tests_new import functions, AggregateFun + + +class TestMath: + """ + test class for math functions test + """ + + @pytest.mark.parametrize( + "input_type, columns", + [ + ("vDataFrame", "age"), + ("vDataFrame_column", "age"), + ("vDataFrame_column", ["age", "fare", "pclass", "survived"]), + ("vcolumn", "age"), + ("vcolumn", ["age", "fare", "pclass", "survived"]), + ], + ) + def test_abs(self, titanic_vd_fun, input_type, columns): + """ + test function - absolute + """ + focus_columns = [ + "pclass", + "survived", + "age", + "sibsp", + "parch", + "fare", + "body", + ] + + titanic_vdf = titanic_vd_fun[focus_columns].normalize().fillna(0) + titanic_pdf = titanic_vdf.to_pandas() + titanic_pdf[columns] = titanic_pdf[columns].astype(float) + + if input_type == "vDataFrame": + titanic_vdf.abs() + vpy_res = titanic_vdf[columns].sum() + py_res = titanic_pdf.abs()[columns].sum() + elif input_type == "vDataFrame_column": + if isinstance(columns, list) and len(columns) > 1: + _vpy_res = titanic_vdf.abs(columns=columns)[columns].sum() + vpy_res = dict(zip(_vpy_res["index"], _vpy_res["sum"]))['"age"'] + py_res = dict(titanic_pdf[columns].abs().sum())["age"] + else: + vpy_res = 
titanic_vdf.abs(columns=columns)[columns].sum() + py_res = titanic_pdf[columns].abs().sum() + else: + if isinstance(columns, list) and len(columns) > 1: + _vpy_res = titanic_vdf[columns].abs()[columns].sum() + vpy_res = dict(zip(_vpy_res["index"], _vpy_res["sum"]))['"age"'] + py_res = dict(titanic_pdf[columns].abs().sum())["age"] + else: + vpy_res = titanic_vdf[columns].abs()[columns].sum() + py_res = titanic_pdf[columns].abs().sum() + + print( + f"Input Type: {input_type} \ncolumns: {columns} \nVerticaPy Result: {vpy_res} \nPython Result :{py_res}\n" + ) + assert vpy_res == pytest.approx(py_res) + + @pytest.mark.parametrize( + "func, columns, scalar", + [ + ("add", "age", 2.643), + ("div", "age", 2.12), + ("mul", "age", 2), + ("sub", "age", 2), + ], + ) + def test_binary_operator(self, titanic_vd_fun, func, columns, scalar): + """ + test function - binary operators (add, div, mul, sub) + """ + titanic_pdf = titanic_vd_fun.to_pandas() + titanic_pdf[columns] = titanic_pdf[columns].astype(float) + + vpy_res = getattr(titanic_vd_fun[columns], func)(scalar)[columns].sum() + + py_res = getattr(titanic_pdf[columns], func)(scalar).sum() + + print( + f"Function Name: {func} \nVerticaPy Result: {vpy_res} \nPython Result :{py_res}\n" + ) + + assert vpy_res == pytest.approx(py_res) + + @pytest.mark.parametrize( + "columns, input_type, func, copy_name", + [ + ( + ["age", "boat", "name"], + "vDataFrame", + { + "age": "COALESCE(age, AVG({}) OVER (PARTITION BY pclass, sex))", + "boat": "DECODE({}, NULL, 0, 1)", + "name": r"REGEXP_SUBSTR({}, '([A-Za-z])+\.')", + }, + None, + ), + (["age"], "vcolumn", "POWER({}, 2)", None), + (["age"], "vcolumn", "POWER({}, 2)", "age_pow2"), + ], + ) + def test_apply(self, titanic_vd_fun, columns, input_type, func, copy_name): + """ + test function - apply + """ + titanic_pdf = titanic_vd_fun.to_pandas() + titanic_pdf[columns[0]] = titanic_pdf[columns[0]].astype(float) + + if input_type == "vDataFrame": + titanic_vd_fun.apply(func=func) + + vpy_res = [ + 
titanic_vd_fun[columns[0]].sum(), + titanic_vd_fun[columns[1]].sum(), + len(titanic_vd_fun[columns[2]].distinct()), + ] + + titanic_pdf[columns[0]] = titanic_pdf.groupby(by=["pclass", "sex"])[ + columns[0] + ].transform(lambda x: x.fillna(x.mean())) + titanic_pdf[columns[1]] = titanic_pdf[columns[1]].apply( + lambda x: 1 if x else 0 + ) + titanic_pdf[columns[2]] = titanic_pdf[columns[2]].apply( + lambda x: re.search(r"([A-Za-z]+\.)", x)[0] + ) + py_res = [ + titanic_pdf[columns[0]].sum(), + titanic_pdf[columns[1]].sum(), + len(titanic_pdf[columns[2]].unique()), + ] + else: + apply_column_name = copy_name if copy_name else columns[0] + titanic_vd_fun[columns[0]].apply(func=func, copy_name=copy_name) + vpy_res = ( + titanic_vd_fun[copy_name].sum() + if copy_name + else titanic_vd_fun[columns[0]].sum() + ) + + titanic_pdf[apply_column_name] = titanic_pdf[columns[0]].apply( + lambda x: x**2 + ) + py_res = titanic_pdf[apply_column_name].sum() + + print(f"VerticaPy Result: {vpy_res} \nPython Result :{py_res}\n") + assert vpy_res == pytest.approx(py_res) + + @pytest.mark.parametrize( + "columns, data, vpy_func, py_func", + [ + ("age", None, "abs", "np.absolute(x)"), + ("survived", None, "acos", "np.arccos(x)"), + ("survived", None, "asin", "np.arcsin(x)"), + ("survived", None, "atan", "np.arctan(x)"), + ("album_cost", "sample_data", "avg", "np.mean(x)"), + ("album_cost", "sample_data", "mean", "np.mean(x)"), + ("age", None, "cbrt", "np.cbrt(x)"), + ("age", None, "ceil", "np.ceil(x)"), + ("album_cost", "sample_data", "contain", "1 if 2 in x else 0"), + ("age", None, "cos", "np.cos(x)"), + ("age", None, "cosh", "np.cosh(x)"), + ("age", None, "cot", "np.cos(x)/np.sin(x)"), + ("album_cost", "sample_data", "dim", "np.ndim(x)"), + ("age", None, "exp", "np.exp(x)"), + ("album_cost", "sample_data", "find", "1 if 2 in x else -1"), + ("age", None, "floor", "np.floor(x)"), + ("album_cost", "sample_data", "len", "np.size(x)"), + ("album_cost", "sample_data", "length", "np.size(x)"), 
+ ("age", None, "ln", "np.log(x)"), + # ( + # "fare", + # None, + # "log", + # "np.log2(x, where=x != 0)", + # ), # Looks like bug in code. Expected log arg from vertica doc is log(base, expression). However, Verticapy is considering log(expression, base) i.e. in reverse order. + ("age", None, "log10", "np.log10(x)"), + ("album_cost", "sample_data", "max", "np.max(x)"), + ("album_cost", "sample_data", "min", "np.min(x)"), + ("age", None, "mod", "np.mod(x, 2)"), + ("age", None, "pow", "np.power(x, 2)"), + ("age", None, "round", "np.round(x, 2)"), + ("sign_num", "sample_data", "sign", "np.sign(x)"), + ("age", None, "sin", "np.sin(x)"), + ("age", None, "sinh", "np.sinh(x)"), + ("age", None, "sqrt", "np.sqrt(x)"), + ("album_cost", "sample_data", "sum", "np.sum(x)"), + ("age", None, "tan", "np.tan(x)"), + ("age", None, "tanh", "np.tanh(x)"), + ], + ) + def test_apply_fun(self, titanic_vd_fun, data, columns, vpy_func, py_func): + """ + test function - apply_fun + """ + titanic_pdf = titanic_vd_fun.to_pandas() + + sample_data = TableSample( + values={ + "index": [0, 1, 2], + "name": ["Bernard", "Fred", "Cassandra"], + "fav_album": [ + ["Inna", "Connect R"], + ["Majda Roumi"], + ["Beyonce", "Alicia Keys", "Dr Dre"], + ], + "album_cost": [ + [65, 50, 90.11, 25, 71], + [40, 50, 90.11, 35], + [56, 50, 90.11, 55, 213], + ], + "sign_num": [0, -1, 2], + } + ).to_vdf() + vpy_data = sample_data if data == "sample_data" else titanic_vd_fun + + sample_data_pdf = sample_data.to_pandas() + py_data = sample_data_pdf if data == "sample_data" else titanic_pdf + py_data[columns] = ( + py_data[columns].astype(float) + if vpy_data[columns].isnum() and not vpy_data[columns].isarray() + else py_data[columns] + ) + + vpy_data[columns].apply_fun(func=vpy_func) + vpy_res = vpy_data[columns].sum() + + py_data[columns] = py_data[columns].apply(lambda x: eval(py_func)) + py_res = float(py_data[columns].sum()) + + print( + f"Function Name: {vpy_func}, \nVerticaPy Result: {vpy_res} \nPython Result 
:{py_res}\n" + ) + assert vpy_res == pytest.approx(py_res) + + @pytest.mark.parametrize( + "part, columns", + [ + ("hour", "time"), + ("minute", "time"), + ("second", "time"), + ("microsecond", "time"), + ("day", "time"), + ("month", "time"), + ("year", "time"), + ("quarter", "time"), + ], + ) + def test_date_part(self, smart_meters_vd, part, columns): + """ + test function - date_part + """ + smart_meters_copy = smart_meters_vd.copy() + smart_meters_pdf = smart_meters_vd.to_pandas() + + vpy_res = smart_meters_copy[columns].date_part(part)[columns].sum() + + py_res = getattr(smart_meters_pdf[columns].dt, part).sum() + + print( + f"Date Part: {part} \nVerticaPy Result: {vpy_res} \nPython Result :{py_res}\n" + ) + + assert vpy_res == pytest.approx(py_res) + + @pytest.mark.parametrize("col_type", (["complex", "string"])) + def test_get_len(self, titanic_vd_fun, laliga_vd, col_type): + """ + test function - get_len + """ + titanic_pdf = titanic_vd_fun.to_pandas() + laliga_pdf = laliga_vd.to_pandas() + + if col_type == "complex": + vpy_res = laliga_vd["away_team"]["managers"][0]["name"].get_len().sum() + py_res = ( + laliga_pdf["away_team"] + .apply(lambda x: len(x["managers"][0]["name"]) if x["managers"] else 0) + .sum() + ) + else: + vpy_res = titanic_vd_fun["name"].get_len().sum() + py_res = titanic_pdf["name"].apply(len).sum() + + print(f"VerticaPy Result: {vpy_res} \nPython Result :{py_res}\n") + + assert vpy_res == pytest.approx(py_res) + + @pytest.mark.parametrize("column, n", ([("age", 4), ("fare", 2)])) + def test_round(self, titanic_vd_fun, column, n): + """ + test function - round + """ + titanic_pdf = titanic_vd_fun.to_pandas() + titanic_pdf[column] = titanic_pdf[column].astype(float) + + vpy_res = titanic_vd_fun[column].round(n)[column].sum() + + py_res = titanic_pdf[column].round(n).sum() + + print(f"VerticaPy Result: {vpy_res} \nPython Result :{py_res}\n") + + assert vpy_res == pytest.approx(py_res, rel=1e-04) + + @pytest.mark.parametrize( + "length, 
unit, start, column, expected", + ( + [ + (30, "minute", False, "time", datetime.datetime(2014, 1, 1, 1, 30)), + (1, "hour", True, "time", datetime.datetime(2014, 1, 1, 1, 00)), + ] + ), + ) + def test_slice(self, smart_meters_vd, length, unit, start, column, expected): + """ + test function - slice + """ + vpy_res = ( + smart_meters_vd[column] + .slice(length=length, unit=unit, start=start)[column] + .min() + ) + + print(f"VerticaPy Result: {vpy_res} \n") + + assert vpy_res == pytest.approx(expected) + + @pytest.mark.parametrize( + "func, columns, by, order_by, name, offset, x_smoothing, add_count, _rel_tol, _abs_tol", + [ + ( + "aad", + "age", + "pclass", + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "beta", + ["age", "fare"], + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "count", + "age", + "pclass", + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "corr", + ["age", "fare"], + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "cov", + ["age", "fare"], + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "ema", + "age", + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), # Passed. 
vpy is returning nulls from the row when it gets 1st null + ( + "first_value", + "age", + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "iqr", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "dense_rank", + None, + None, + {"pclass": "desc", "sex": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "kurtosis", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ("jb", "age", None, None, "new_colm", 1, 0.5, True, 1e-02, ABS_TOLERANCE), + ( + "lead", + "age", + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 5, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "lag", + "age", + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 5, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "last_value", + "age", + "home.dest", + {"name": "asc", "ticket": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + # ( + # "mad", + # "age", + # None, + # None, + # "new_colm", + # 1, + # 0.5, + # True, + # REL_TOLERANCE, + # ABS_TOLERANCE, + # ), # Fail. looks like bug in code. vpy code has mean instead of median. 
+ ( + "max", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "mean", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "median", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "min", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "mode", + "embarked", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "10%", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "pct_change", + "age", + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "percent_rank", + None, + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "prod", + "body", + "pclass", + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "range", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "rank", + None, + None, + {"pclass": "desc", "sex": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "row_number", + None, + None, + {"name": "asc", "ticket": "desc"}, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "sem", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "skewness", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "sum", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "std", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "unique", + "pclass", + None, + None, + "new_colm", + 1, 
+ 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ( + "var", + "age", + None, + None, + "new_colm", + 1, + 0.5, + True, + REL_TOLERANCE, + ABS_TOLERANCE, + ), + ], + ) + def test_analytic( + self, + titanic_vd_fun, + func, + columns, + by, + order_by, + name, + offset, + x_smoothing, + add_count, # pylint: disable=unused-argument + _rel_tol, + _abs_tol, + ): + """ + test function - analytic + """ + titanic_pdf = titanic_vd_fun.to_pandas() + titanic_pdf["age"] = titanic_pdf["age"].astype(float) + titanic_pdf["fare"] = titanic_pdf["fare"].astype(float) + + vpy_func, py_func = ( + (AggregateFun(*functions[func]).vpy, AggregateFun(*functions[func]).py) + if func in functions + else (func, func) + ) + + if order_by: + titanic_pdf.sort_values( + by=list(order_by.keys()), + ascending=[i == "asc" for i in list(order_by.values())], + inplace=True, + ) + + if func in ["aad", "count"]: + titanic_vd_fun.analytic(func=func, columns=columns, by=[by], name=name) + vpy_res = titanic_vd_fun[name][0] + + py_grp_data = titanic_pdf.groupby([by])[columns] + py_res = py_grp_data.transform(lambda py_data: eval(py_func))[0] + elif func in ["ema", "first_value", "last_value", "pct_change"]: + titanic_vd_fun.analytic( + func=func, columns=columns, by=by, order_by=order_by, name=name + ) + + if func == "first_value": + vpy_res = titanic_vd_fun[name].max() + py_res = titanic_pdf[columns].iloc[0] + elif func == "last_value": + # vpy_res = titanic_vd_fun.analytic(func=func, columns=columns, by=by, order_by=order_by, name=name)[by].isin("Belfast, NI")[name] + vpy_res = titanic_vd_fun[by].isin("Belfast, NI")[name] + py_res = titanic_pdf.groupby(by).last(columns).loc["Belfast, NI"] + elif func == "ema": + vpy_res = titanic_vd_fun[:10][name].sum() + py_res = ( + titanic_pdf["age"] + .ewm(adjust=False, alpha=x_smoothing) + .mean()[:10] + .sum() + ) + else: + vpy_res = titanic_vd_fun[name].max() + py_res = ( + titanic_pdf[columns] / titanic_pdf[columns].shift(periods=1) + ).max() + elif func in 
["dense_rank", "percent_rank", "rank", "row_number"]: + titanic_vd_fun.analytic(func=func, order_by=order_by, name=name) + vpy_res = titanic_vd_fun[name].max() + + if func in ["dense_rank", "percent_rank", "rank"]: + col1, col2 = list(order_by.keys())[0], list(order_by.keys())[1] + py_res = ( + (titanic_pdf[col1].astype(str) + titanic_pdf[col2]) + .rank( + method="dense" if func == "dense_rank" else "min", + ascending=False, + pct=func == "percent_rank", + ) + .max() + ) + else: + titanic_pdf[func] = titanic_pdf.index + 1 + py_res = titanic_pdf[func].max() + + elif func in ["lead", "lag"]: + titanic_vd_fun.analytic( + func=func, columns=columns, order_by=order_by, offset=offset, name=name + ) + vpy_res = titanic_vd_fun[name].sum() + + py_res = ( + titanic_pdf[columns].shift(offset if func == "lag" else -offset).sum() + ) + else: + titanic_vd_fun.analytic(func=func, columns=columns, name=name) + vpy_res = titanic_vd_fun[name][0] + + if py_func in ["cov", "corr", "beta"]: + py_cov = (titanic_pdf[columns[0]] * titanic_pdf[columns[1]]).mean() - ( + titanic_pdf[columns[0]].mean() * titanic_pdf[columns[1]].mean() + ) + + if func in ["cov"]: + py_res = py_cov + elif func in ["corr"]: + py_res = py_cov / ( + titanic_pdf[columns[0]].std() * titanic_pdf[columns[1]].std() + ) + elif func == "beta": + py_var = titanic_pdf[columns[1]].var() + py_res = py_cov / py_var + else: + py_data = ( + titanic_pdf[columns].to_frame() + if func in ["iqr", "10%", "mode"] + else titanic_pdf[columns] + ) + py_res = eval(py_func) + + print( + f"Function name: {vpy_func} \ncolumns: {columns} \nVerticaPy Result: {vpy_res} \nPython Result :{py_res}\n" + ) + assert vpy_res == pytest.approx( + py_res[0] if func == "quantile" else py_res, rel=_rel_tol, abs=_abs_tol + ) + + @pytest.mark.parametrize("column, func", [("sex", "DECODE({}, NULL, 0, 1)")]) + def test_applymap(self, titanic_vd_fun, column, func): + """ + test function - applymap + """ + titanic_pdf = titanic_vd_fun.to_pandas() + vpy_res = 
titanic_vd_fun.applymap(func=func, numeric_only=False)[column].sum() + py_res = titanic_pdf[column].map(lambda x: 0 if pd.isnull(x) else 1).sum() + + print(f"VerticaPy Result: {vpy_res} \nPython Result :{py_res}\n") + assert vpy_res == pytest.approx(py_res) diff --git a/verticapy/tests_new/core/vdataframe/test_rolling.py b/verticapy/tests_new/core/vdataframe/test_rolling.py new file mode 100644 index 000000000..e50ad71c7 --- /dev/null +++ b/verticapy/tests_new/core/vdataframe/test_rolling.py @@ -0,0 +1,293 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. + +You may obtain a copy of the License at: +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in +writing, software distributed under the License is +distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing +permissions and limitations under the License. +""" +import numpy as np +import pytest +import verticapy.stats as st + + +class TestRolling: + """ + test class for rolling and cumulative functions test + """ + + @pytest.mark.parametrize( + "vpy_func, py_func, window, columns, by, order_by", + [ + ("aad", "aad", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ( + "beta", + "beta", + (-1, 1), + ["age", "fare"], + None, + {"name": "asc", "ticket": "desc"}, + ), # few rows values are not matching with verticapy. comparing sample records + ( + "count", + "count", + (-1, 1), + ["age"], + None, + {"name": "asc", "ticket": "desc"}, + ), + ( + "corr", + "corr", + (-1, 1), + ["age", "fare"], + None, + {"name": "asc", "ticket": "desc"}, + ), # all rows value for corr are not matching with verticapy. 
comparing sample records + ( + "cov", + "cov", + (-1, 1), + ["age", "fare"], + None, + {"name": "asc", "ticket": "desc"}, + ), # all rows value for cov are not matching with verticapy. comparing sample records + ( + "kurtosis", + "kurt", + (-2, 1), + ["age"], + None, + {"name": "asc", "ticket": "desc"}, + ), # few rows values are not matching with verticapy. comparing sample records + ( + "jb", + "jb", + (-2, 1), + ["age"], + None, + {"name": "asc", "ticket": "desc"}, + ), # few rows values are not matching with verticapy. comparing sample records + ("max", "max", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ("mean", "mean", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ("min", "min", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ("prod", "prod", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ( + "range", + "range", + (-1, 1), + ["age"], + None, + {"name": "asc", "ticket": "desc"}, + ), + ("sem", "sem", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ( + "skewness", + "skew", + (-1, 1), + ["age"], + None, + {"name": "asc", "ticket": "desc"}, + ), # few rows values are not matching with verticapy. 
comparing sample records + ("sum", "sum", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ("std", "std", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ("var", "var", (-1, 1), ["age"], None, {"name": "asc", "ticket": "desc"}), + ], + ) + def test_rolling( + self, + titanic_vd_fun, + vpy_func, + py_func, + window, + columns, + by, + order_by, + ): + """ + test function - rolling + """ + titanic_pdf = titanic_vd_fun.to_pandas() + name = f"{vpy_func}_val" + py_window = sum(-1 * w if w < 0 else w for w in window) + 1 + + titanic_vd_fun.rolling( + func=vpy_func, + window=window, + columns=columns, + name=name, + order_by=order_by, + ) + # filling nan/inf/zero + titanic_vd_fun[name] = st.case_when( + st.isnan(titanic_vd_fun[name]), + 0, + st.isinf(titanic_vd_fun[name]), + 0, + st.zeroifnull(titanic_vd_fun[name]), + ) + titanic_vd_fun = ( + titanic_vd_fun[:50] + if vpy_func in ["corr", "cov", "beta", "skewness", "kurtosis", "jb"] + else titanic_vd_fun + ) + vpy_res = titanic_vd_fun[name].sum() + + pdf_new = titanic_pdf.sort_values( + by=list(order_by.keys()), + ascending=[i == "asc" for i in list(order_by.values())], + ).reset_index(drop=True) + + # casting + for idx, column in enumerate(columns): + pdf_new[column] = pdf_new[column].astype(float) + if idx == 0: + titanic_pdf_roll_col0 = pdf_new[column].rolling( + py_window, min_periods=1, center=True + ) + else: + titanic_pdf_roll_col1 = pdf_new[column].rolling( + py_window, min_periods=1, center=True + ) + + if py_func == "aad": + pdf_new["abs_mean"] = abs( + pdf_new[columns[0]] - titanic_pdf_roll_col0.mean() + ) + pdf_new[name] = ( + pdf_new["abs_mean"] + .rolling(py_window, min_periods=1, center=True) + .mean() + ) + elif py_func in ["cov", "corr", "beta"]: + pdf_new["cov"] = titanic_pdf_roll_col0.cov(pdf_new[columns[1]], ddof=0) + if py_func == "cov": + pdf_new[name] = pdf_new["cov"] + elif py_func == "corr": + pdf_new[f"std_{columns[0]}"] = titanic_pdf_roll_col0.std() + 
pdf_new[f"std_{columns[1]}"] = titanic_pdf_roll_col1.std() + pdf_new[name] = pdf_new["cov"] / ( + pdf_new[f"std_{columns[0]}"].replace(0, np.nan) + * pdf_new[f"std_{columns[1]}"].replace(0, np.nan) + ) + elif py_func == "beta": + pdf_new[f"var_{columns[0]}"] = titanic_pdf_roll_col1.var().replace( + 0, np.nan + ) + pdf_new[name] = pdf_new["cov"] / pdf_new[f"var_{columns[0]}"] + pdf_new = pdf_new[:50] + elif py_func == "prod": + pdf_new[name] = titanic_pdf_roll_col0.apply(np.prod) + elif py_func == "sem": + pdf_new[name] = getattr(titanic_pdf_roll_col0, py_func)(ddof=0) + elif py_func in ["skew", "kurt", "jb"]: + # skew + pdf_new["skew1"] = pow( + (pdf_new[columns[0]] - titanic_pdf_roll_col0.mean()) + / titanic_pdf_roll_col0.std(), + 3, + ) + pdf_new["skew1_mean"] = ( + pdf_new["skew1"].rolling(py_window, min_periods=1, center=True).mean() + ) + pdf_new["skew2"] = pow(titanic_pdf_roll_col0.count(), 2) / ( + (titanic_pdf_roll_col0.count() - 1) + * (titanic_pdf_roll_col0.count() - 2) + ).replace(0, np.nan) + + # kurt + pdf_new["kurt1"] = ( + pow(titanic_pdf_roll_col0.count(), 1) + * (titanic_pdf_roll_col0.count() + 1) + ) / ( + (titanic_pdf_roll_col0.count() - 1) + * (titanic_pdf_roll_col0.count() - 2) + * (titanic_pdf_roll_col0.count() - 3) + ).replace( + 0, np.nan + ) + pdf_new["kurt2"] = pow( + (pdf_new[columns[0]] - titanic_pdf_roll_col0.mean()) + / titanic_pdf_roll_col0.std(), + 4, + ) + pdf_new["kurt2_mean"] = ( + pdf_new["kurt2"].rolling(4, min_periods=1, center=True).sum() + ) + pdf_new["kurt3"] = ( + 3 + * pow((titanic_pdf_roll_col0.count() - 1), 2) + / ( + (titanic_pdf_roll_col0.count() - 2) + * (titanic_pdf_roll_col0.count() - 3) + ).replace(0, np.nan) + ) + + if py_func == "skew": + pdf_new[name] = pdf_new["skew1_mean"] * pdf_new["skew2"] + pdf_new[name] = pdf_new[name].fillna(0) + elif py_func == "kurt": + pdf_new[name] = (pdf_new["kurt1"] * pdf_new["kurt2_mean"]) - pdf_new[ + "kurt3" + ] + pdf_new[name] = pdf_new[name].fillna(0) + else: + pdf_new["skew"] 
= pdf_new["skew1_mean"] * pdf_new["skew2"] + pdf_new["skew"] = pdf_new["skew"].fillna(0) + + pdf_new["kurt"] = (pdf_new["kurt1"] * pdf_new["kurt2_mean"]) - pdf_new[ + "kurt3" + ] + pdf_new["kurt"] = pdf_new["kurt"].fillna(0) + + pdf_new[name] = (titanic_pdf_roll_col0.count() / 6) * ( + pow(pdf_new["skew"], 2) + pow((pdf_new["kurt"]), 2) / 4 + ) + pdf_new = pdf_new[:50] + elif py_func == "range": + pdf_new[name] = titanic_pdf_roll_col0.max() - titanic_pdf_roll_col0.min() + else: + pdf_new[name] = getattr( + pdf_new[columns].rolling(py_window, center=True, min_periods=1), py_func + )() + + py_res = pdf_new[name].sum() + + print( + f"Rolling Function : {vpy_func} \nVerticaPy Result: {vpy_res} \nPython Result :{py_res}\n" + ) + + assert vpy_res == pytest.approx(py_res, rel=1e-02) + + @pytest.mark.parametrize( + "func, columns, by, order_by, name", + [ + ("cummax", "number", "state", "date", "cummax_num"), + ("cummin", "number", "state", "date", "cummin_num"), + ("cumprod", "number", "state", "date", "cumprod_num"), + ("cumsum", "number", "state", "date", "cumsum_num"), + ], + ) + def test_cum_func(self, amazon_vd, func, columns, by, order_by, name): + """ + test function - cumulative functions + """ + amazon_pdf = amazon_vd.to_pandas() + getattr(amazon_vd, func)( + column=columns, by=[by], order_by=[order_by], name=name + ).sort([by, order_by]) + vpy_res = amazon_vd[name].sum() + + py_res = getattr(amazon_pdf.groupby(by=[by])[columns], func)().sum() + + print(f"VerticaPy Result: {vpy_res} \nPython Result :{py_res}\n") + + assert vpy_res == pytest.approx(py_res)