diff --git a/docs/source/stats.rst b/docs/source/stats.rst index e4fe415fa..f49e1f3fc 100644 --- a/docs/source/stats.rst +++ b/docs/source/stats.rst @@ -21,18 +21,6 @@ Heteroscedascity ols.het_goldfeldquandt ols.het_white -____ - -Endogeneity -------------- - -.. currentmodule:: verticapy.machine_learning.model_selection.statistical_tests - -.. autosummary:: - :toctree: api/ - - ols.endogtest - ____ diff --git a/verticapy/core/vdataframe/_corr.py b/verticapy/core/vdataframe/_corr.py index 504011855..82323b239 100755 --- a/verticapy/core/vdataframe/_corr.py +++ b/verticapy/core/vdataframe/_corr.py @@ -1993,6 +1993,7 @@ def pacf( input_relation=tmp_view_name, X=[f"lag_{i}_{gen_name([column])}" for i in range(1, p)], y=column, + return_report=True, ) model.predict(vdf, name="prediction_0") drop(tmp_lr1_name, method="model") @@ -2001,6 +2002,7 @@ def pacf( input_relation=tmp_view_name, X=[f"lag_{i}_{gen_name([column])}" for i in range(1, p)], y=f"lag_{p}_{gen_name([column])}", + return_report=True, ) model.predict(vdf, name="prediction_p") vdf.eval(expr=f"{column} - prediction_0", name="eps_0") diff --git a/verticapy/core/vdataframe/_encoding.py b/verticapy/core/vdataframe/_encoding.py index 77615c42b..997443ec4 100755 --- a/verticapy/core/vdataframe/_encoding.py +++ b/verticapy/core/vdataframe/_encoding.py @@ -301,7 +301,12 @@ def discretize( model.set_params(RFmodel_params) parameters = model.get_params() try: - model.fit(tmp_view_name, [self._alias], response) + model.fit( + tmp_view_name, + [self._alias], + response, + return_report=True, + ) query = [ f""" (SELECT diff --git a/verticapy/machine_learning/model_selection/hp_tuning/cv.py b/verticapy/machine_learning/model_selection/hp_tuning/cv.py index f7003ebf0..1e9a338af 100755 --- a/verticapy/machine_learning/model_selection/hp_tuning/cv.py +++ b/verticapy/machine_learning/model_selection/hp_tuning/cv.py @@ -719,7 +719,12 @@ def bayesian_search_cv( hyper_param_estimator = vml.RandomForestRegressor( name=estimator.model_name, **RFmodel_params ) - hyper_param_estimator.fit(relation, all_params, "score") + hyper_param_estimator.fit( + relation, + all_params, + "score", + return_report=True, + ) if random_grid: vdf = gen_dataset(model_grid, nrows=nrows) else: diff --git a/verticapy/machine_learning/model_selection/kmeans.py b/verticapy/machine_learning/model_selection/kmeans.py index 20c65bbc4..217e56b91 100755 --- a/verticapy/machine_learning/model_selection/kmeans.py +++ b/verticapy/machine_learning/model_selection/kmeans.py @@ -139,7 +139,11 @@ def best_k( max_iter=max_iter, tol=tol, ) - model.fit(input_relation, X) + model.fit( + input_relation, + X, + return_report=True, + ) score = model.elbow_score_ if score > elbow_score_stop: return i @@ -266,7 +270,11 @@ def elbow( max_iter=max_iter, tol=tol, ) - model.fit(input_relation, X) + model.fit( + input_relation, + X, + return_report=True, + ) elbow_score += [float(model.elbow_score_)] between_cluster_ss += [float(model.between_cluster_ss_)] total_ss += [float(model.total_ss_)] diff --git a/verticapy/machine_learning/model_selection/model_validation.py b/verticapy/machine_learning/model_selection/model_validation.py index cbc43a901..23f1d1379 100755 --- a/verticapy/machine_learning/model_selection/model_validation.py +++ b/verticapy/machine_learning/model_selection/model_validation.py @@ -229,7 +229,13 @@ def cross_validate( test_size=float(1 / cv), order_by=[X[0]], random_state=random_state ) start_time = time.time() - estimator.fit(train, X, y, test) + estimator.fit( + train, + X, + y, + test, + return_report=True, + ) total_time += [time.time() - start_time] fun = estimator.report kwargs = {"metrics": final_metrics} diff --git a/verticapy/machine_learning/model_selection/statistical_tests/__init__.py b/verticapy/machine_learning/model_selection/statistical_tests/__init__.py index 18af5d64c..c3a165640 100755 --- a/verticapy/machine_learning/model_selection/statistical_tests/__init__.py +++ b/verticapy/machine_learning/model_selection/statistical_tests/__init__.py @@ -24,7 +24,6 @@ seasonal_decompose, ) from verticapy.machine_learning.model_selection.statistical_tests.ols import ( - endogtest, het_breuschpagan, het_goldfeldquandt, het_white, diff --git a/verticapy/machine_learning/model_selection/statistical_tests/ols.py b/verticapy/machine_learning/model_selection/statistical_tests/ols.py index 9c2056428..b8ea59e42 100755 --- a/verticapy/machine_learning/model_selection/statistical_tests/ols.py +++ b/verticapy/machine_learning/model_selection/statistical_tests/ols.py @@ -296,16 +296,25 @@ def het_breuschpagan( vdf = vDataFrame(input_relation) X = format_type(X, dtype=list) eps, X = vdf.format_colnames(eps, X) - name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") - model = LinearRegression(name) + model = LinearRegression() vdf_copy = vdf.copy() vdf_copy["v_eps2"] = vdf_copy[eps] ** 2 try: - model.fit(vdf_copy, X, "v_eps2") + model.fit( + vdf_copy, + X, + "v_eps2", + return_report=True, + ) R2 = model.score(metric="r2") except QueryError: model.set_params({"solver": "bfgs"}) - model.fit(vdf_copy, X, "v_eps2") + model.fit( + vdf_copy, + X, + "v_eps2", + return_report=True, + ) R2 = model.score(metric="r2") finally: model.drop() @@ -537,7 +546,12 @@ def model_fit( mse = [] for vdf_tmp in input_relation: model.drop() - model.fit(vdf_tmp, X, y) + model.fit( + vdf_tmp, + X, + y, + return_report=True, + ) mse += [model.score(metric="mse")] model.drop() return mse @@ -551,8 +565,7 @@ def model_fit( split_value = vdf[X[idx]].quantile(split) vdf_0_half = vdf.search(vdf[X[idx]] < split_value) vdf_1_half = vdf.search(vdf[X[idx]] > split_value) - name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") - model = LinearRegression(name) + model = LinearRegression() try: mse0, mse1 = model_fit([vdf_0_half, vdf_1_half], X, y, model) except QueryError: @@ -843,14 +856,23 @@ def het_white( POWER({eps}, 2) AS v_eps2 FROM {vdf}""" vdf_white = vDataFrame(query) - name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") - model = LinearRegression(name) + model = LinearRegression() try: - model.fit(vdf_white, variables_names, "v_eps2") + model.fit( + vdf_white, + variables_names, + "v_eps2", + return_report=True, + ) R2 = model.score(metric="r2") except QueryError: model.set_params({"solver": "bfgs"}) - model.fit(vdf_white, variables_names, "v_eps2") + model.fit( + vdf_white, + variables_names, + "v_eps2", + return_report=True, + ) R2 = model.score(metric="r2") finally: model.drop() @@ -866,322 +888,6 @@ def het_white( return LM, lm_pvalue, F, f_pvalue -""" -OLS Tests: Endogeneity. -""" - - -@save_verticapy_logs -def endogtest( - input_relation: SQLRelation, eps: str, X: SQLColumns -) -> tuple[float, float, float, float]: - """ - Endogeneity test. - - Parameters - ---------- - input_relation: SQLRelation - Input relation. - eps: str - Input residual vDataColumn. - X: list - Input Variables to test the endogeneity on. - - Returns - ------- - tuple - Lagrange Multiplier statistic, LM pvalue, - F statistic, F pvalue - - Examples - --------- - - Initialization - ^^^^^^^^^^^^^^^ - - Let's try this test on a dummy dataset that has the - following elements: - - - x (a predictor) - - y (the response) - - Random noise - - .. note:: - - This metric requires ``eps``, which represents - the difference between the predicted value - and the true value. If you already have ``eps`` - available, you can directly use it instead of - recomputing it, as demonstrated in the example - below. - - Before we begin we can import the necessary libraries: - - .. ipython:: python - - import verticapy as vp - import numpy as np - from verticapy.machine_learning.vertica.linear_model import LinearRegression - - Example 1: Homoscedasticity - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - Next, we can create some values with random - noise: - - .. ipython:: python - :suppress: - - x_vals = list(range(N)) - y_vals = [2 * x + np.random.normal(3) for x in x_vals] - - .. code-block:: python - - x_vals = list(range(N)) - y_vals = [2 * x + np.random.normal(3) for x in x_vals] - - We can use those values to create the ``vDataFrame``: - - .. ipython:: python - - N = 50 - vdf = vp.vDataFrame( - { - "x": x_vals, - "y": y_vals, - } - ) - - We can initialize a regression model: - - .. ipython:: python - - model = LinearRegression() - - Fit that model on the dataset: - - .. ipython:: python - - model.fit(input_relation = vdf, X = "x", y = "y") - - We can create a column in the ``vDataFrame`` that - has the predictions: - - .. code-block:: python - - model.predict(vdf, X = "x", name = "y_pred") - - .. ipython:: python - :suppress: - - result = model.predict(vdf, X = "x", name = "y_pred") - html_file = open("figures/machine_learning_model_selection_statistical_tests_endogtest_1.html", "w") - html_file.write(result._repr_html_()) - html_file.close() - - .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_model_selection_statistical_tests_endogtest_1.html - - Then we can calculate the residuals i.e. ``eps``: - - .. ipython:: python - - vdf["eps"] = vdf["y"] - vdf["y_pred"] - - We can plot the residuals to see the trend: - - .. code-block:: python - - vdf.scatter(["x", "eps"]) - - .. ipython:: python - :suppress: - - vp.set_option("plotting_lib", "plotly") - fig = vdf.scatter(["x", "eps"], width = 550) - fig.write_html("figures/plotting_machine_learning_model_selection_ols_endogtest.html") - - .. raw:: html - :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_ols_endogtest.html - - Notice the randomness of the residuals with respect to x. - This shows that the noise is homoscedestic. - - To test its score, we can import the test function: - - .. ipython:: python - - from verticapy.machine_learning.model_selection.statistical_tests import endogtest - - And simply apply it on the ``vDataFrame``: - - .. ipython:: python - - lm_statistic, lm_pvalue, f_statistic, f_pvalue = endogtest(vdf, eps = "eps", X = "x") - - .. ipython:: python - - print(lm_statistic, lm_pvalue, f_statistic, f_pvalue) - - As the noise was not heteroscedestic, we got higher - p_value scores and lower statistics score. - - .. note:: - - A ``p_value`` in statistics represents the - probability of obtaining results as extreme - as, or more extreme than, the observed data, - assuming the null hypothesis is true. - A *smaller* p-value typically suggests - stronger evidence against the null hypothesis - i.e. the test data does not have - a heteroscedestic noise in the current case. - - However, *small* is a relative term. And - the choice for the threshold value which - determines a "small" should be made before - analyzing the data. - - Generally a ``p-value`` less than 0.05 - is considered the threshold to reject the - null hypothesis. But it is not always - the case - - `read more `_ - - .. note:: - - F-statistics tests the overall significance - of a model, while LM statistics tests the - validity of linear restrictions on model - parameters. High values indicate heterescedestic - noise in this case. - - Example 2: Heteroscedasticity - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - We can contrast the above result with a dataset that - has **heteroscedestic noise** below: - - .. ipython:: python - :suppress: - - y_vals = [2 * x + np.random.normal(scale=10 * x * x + 1) for x in x_vals] - - .. code-block:: python - - # x values - x_vals = list(range(N)) - - # Adding some heteroscedestic noise - y_vals = [2 * x + np.random.normal(scale=10 * x * x + 1) for x in x_vals] - - .. ipython:: python - - vdf = vp.vDataFrame( - { - "x": x_vals, - "y": y_vals, - } - ) - - We can intialize a regression model: - - .. ipython:: python - - model = LinearRegression() - - Fit that model on the dataset: - - .. ipython:: python - - model.fit(input_relation = vdf, X = "x", y = "y") - - We can create a column in the ``vDataFrame`` that - has the predictions: - - .. code-block:: python - - model.predict(vdf, X = "x", name = "y_pred") - - .. ipython:: python - :suppress: - - result = model.predict(vdf, X = "x", name = "y_pred") - html_file = open("figures/machine_learning_model_selection_statistical_tests_endogtest_1.html", "w") - html_file.write(result._repr_html_()) - html_file.close() - - .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_model_selection_statistical_tests_endogtest_1.html - - Then we can calculate the residual i.e. ``eps``: - - .. ipython:: python - - vdf["eps"] = vdf["y"] - vdf["y_pred"] - - We can plot the residuals to see the trend: - - .. code-block:: python - - vdf.scatter(["x", "eps"]) - - .. ipython:: python - :suppress: - - fig = vdf.scatter(["x", "eps"], width = 550) - fig.write_html("figures/plotting_machine_learning_model_selection_ols_endogtest_2.html") - - .. raw:: html - :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_ols_endogtest_2.html - - Notice the relationship of the residuals with - respect to x. This shows that the noise is - heteroscedestic. - - Now we can perform the test on this dataset: - - .. ipython:: python - - lm_statistic, lm_pvalue, f_statistic, f_pvalue = endogtest(vdf, eps = "eps", X = "x") - - .. ipython:: python - - print(lm_statistic, lm_pvalue, f_statistic, f_pvalue) - - .. note:: - - Notice the contrast of the two test results. In this - dataset, the noise was heteroscedestic so we got very low - p_value scores and higher statistics score. Thus confirming - that the noise was in fact heteroscedestic. - """ - if isinstance(input_relation, vDataFrame): - vdf = input_relation.copy() - else: - vdf = vDataFrame(input_relation) - X = format_type(X, dtype=list) - eps, X = vdf.format_colnames(eps, X) - name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") - model = LinearRegression(name) - try: - model.fit(vdf, X, eps) - R2 = model.score(metric="r2") - except QueryError: - model.set_params({"solver": "bfgs"}) - model.fit(vdf, X, eps) - R2 = model.score(metric="r2") - finally: - model.drop() - n = vdf.shape()[0] - k = len(X) - LM = n * R2 - lm_pvalue = chi2.sf(LM, k) - F = (n - k - 1) * R2 / (1 - R2) / k - f_pvalue = f.sf(F, k, n - k - 1) - return LM, lm_pvalue, F, f_pvalue - - """ OLS Tests: Multicollinearity. """ @@ -1349,14 +1055,23 @@ def variance_inflation_factor( if i != X_idx: X_r += [X[i]] y_r = X[X_idx] - name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") - model = LinearRegression(name) + model = LinearRegression() try: - model.fit(vdf, X_r, y_r) + model.fit( + vdf, + X_r, + y_r, + return_report=True, + ) R2 = model.score(metric="r2") except QueryError: model.set_params({"solver": "bfgs"}) - model.fit(vdf, X_r, y_r) + model.fit( + vdf, + X_r, + y_r, + return_report=True, + ) R2 = model.score(metric="r2") finally: model.drop() diff --git a/verticapy/machine_learning/model_selection/statistical_tests/tsa.py b/verticapy/machine_learning/model_selection/statistical_tests/tsa.py index 201789027..42e90a130 100755 --- a/verticapy/machine_learning/model_selection/statistical_tests/tsa.py +++ b/verticapy/machine_learning/model_selection/statistical_tests/tsa.py @@ -358,10 +358,20 @@ def adfuller( if with_trend: predictors += ["ts"] try: - model.fit(relation_name, predictors, "delta") + model.fit( + relation_name, + predictors, + "delta", + return_report=True, + ) except QueryError: model.set_params({"solver": "bfgs"}) - model.fit(relation_name, predictors, "delta") + model.fit( + relation_name, + predictors, + "delta", + return_report=True, + ) finally: drop(relation_name, method="view") coef = model.get_vertica_attributes("details") @@ -695,9 +705,8 @@ def cochrane_orcutt( else: vdf = vDataFrame(input_relation) ts = vdf.format_colnames(ts) - name = gen_tmp_name(schema=schema_relation(model.model_name)[0], name="linear") param = model.get_params() - model_tmp = type(model)(name) + model_tmp = type(model)() model_tmp.set_params(param) X, y = model.X, model.y print_info = conf.get_option("print_info") @@ -727,7 +736,12 @@ def cochrane_orcutt( new_val = f"COALESCE({new_val}, {predictor} * {(1 - pho ** 2) ** (0.5)})" vdf[predictor] = new_val model_tmp.drop() - model_tmp.fit(vdf, X, y) + model_tmp.fit( + vdf, + X, + y, + return_report=True, + ) model_tmp.pho_ = pho model_tmp.anova_table_ = model.regression_report(metrics="anova") model_tmp.r2_ = model.score(metric="r2") @@ -912,14 +926,23 @@ def het_arch( X_names += [f"lag_{i}"] query = f"SELECT {', '.join(X)} FROM {vdf}" vdf_lags = vDataFrame(query) - name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") - model = LinearRegression(name) + model = LinearRegression() try: - model.fit(vdf_lags, X_names[1:], X_names[0]) + model.fit( + vdf_lags, + X_names[1:], + X_names[0], + return_report=True, + ) R2 = model.score(metric="r2") except QueryError: model.set_params({"solver": "bfgs"}) - model.fit(vdf_lags, X_names[1:], X_names[0]) + model.fit( + vdf_lags, + X_names[1:], + X_names[0], + return_report=True, + ) R2 = model.score(metric="r2") finally: model.drop() @@ -1040,7 +1063,12 @@ def seasonal_decompose( name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") model = LinearRegression(name=name, solver="bfgs", max_iter=100, tol=1e-6) model.drop() - model.fit(vdf_poly, X, column) + model.fit( + vdf_poly, + X, + column, + return_report=True, + ) coefficients = [str(model.intercept_)] + [ f"{model.coef_[i-1]} * POWER(ROW_NUMBER() OVER({by_str}ORDER BY {ts}), {i})" if i != 1 @@ -1083,7 +1111,12 @@ def seasonal_decompose( name = gen_tmp_name(schema=conf.get_option("temp_schema"), name="linear_reg") model = LinearRegression(name=name, solver="bfgs", max_iter=100, tol=1e-6) model.drop() - model.fit(vdf_seasonality, X, seasonal_name) + model.fit( + vdf_seasonality, + X, + seasonal_name, + return_report=True, + ) vdf[ seasonal_name ] = f""" diff --git a/verticapy/machine_learning/model_selection/variables_selection.py b/verticapy/machine_learning/model_selection/variables_selection.py index 2e26d0a4a..747b04cb7 100755 --- a/verticapy/machine_learning/model_selection/variables_selection.py +++ b/verticapy/machine_learning/model_selection/variables_selection.py @@ -445,7 +445,12 @@ def stepwise( model_id, res, current_step = 0, [], 0 if direction == "backward": estimator.drop() - estimator.fit(input_relation, X, y) + estimator.fit( + input_relation, + X, + y, + return_report=True, + ) current_score = estimator.score(metric=criterion) res += [(copy.deepcopy(X), current_score, None, None, 0, None)] X_current = copy.deepcopy(X) @@ -461,7 +466,12 @@ def stepwise( X_test.remove(X[idx]) if len(X_test) != 0: estimator.drop() - estimator.fit(input_relation, X_test, y) + estimator.fit( + input_relation, + X_test, + y, + return_report=True, + ) test_score = estimator.score(metric=criterion) else: test_score = fun(y, str(avg), input_relation, 0) @@ -494,7 +504,12 @@ def stepwise( break X_test = copy.deepcopy(X_current) + [X[idx]] estimator.drop() - estimator.fit(input_relation, X_test, y) + estimator.fit( + input_relation, + X_test, + y, + return_report=True, + ) test_score = estimator.score(metric=criterion) score_diff = current_score - test_score if current_score - test_score > criterion_threshold: diff --git a/verticapy/machine_learning/vertica/automl/clustering.py b/verticapy/machine_learning/vertica/automl/clustering.py index 4402bb40c..0c1fa5cb3 100755 --- a/verticapy/machine_learning/vertica/automl/clustering.py +++ b/verticapy/machine_learning/vertica/automl/clustering.py @@ -155,7 +155,12 @@ def __init__( # Model Fitting Method. - def fit(self, input_relation: SQLRelation, X: Optional[SQLColumns] = None) -> None: + def fit( + self, + input_relation: SQLRelation, + X: Optional[SQLColumns] = None, + return_report: bool = False, + ) -> None: """ Trains the model. diff --git a/verticapy/machine_learning/vertica/automl/dataprep.py b/verticapy/machine_learning/vertica/automl/dataprep.py index 8c3041711..8ba0b512f 100755 --- a/verticapy/machine_learning/vertica/automl/dataprep.py +++ b/verticapy/machine_learning/vertica/automl/dataprep.py @@ -210,6 +210,7 @@ def fit( X: Optional[SQLColumns] = None, ts: Optional[str] = None, by: Optional[SQLColumns] = None, + return_report: bool = False, ) -> None: """ Trains the model. @@ -391,7 +392,11 @@ def fit( if self.parameters["apply_pca"] and not ts: model_pca = PCA(self.model_name + "_pca") model_pca.drop() - model_pca.fit(vdf, self.X_out_) + model_pca.fit( + vdf, + self.X_out_, + return_report=True, + ) vdf = model_pca.transform() self.X_out_ = vdf.get_columns( exclude_columns=by + [ts] + X_diff if ts else by + X_diff diff --git a/verticapy/machine_learning/vertica/automl/supervised.py b/verticapy/machine_learning/vertica/automl/supervised.py index b863c2bb9..cb7343cf3 100755 --- a/verticapy/machine_learning/vertica/automl/supervised.py +++ b/verticapy/machine_learning/vertica/automl/supervised.py @@ -349,6 +349,7 @@ def fit( input_relation: SQLRelation, X: Optional[SQLColumns] = None, y: Optional[str] = None, + return_report: bool = False, ) -> None: """ Trains the model. @@ -532,7 +533,11 @@ def fit( model_preprocess = AutoDataPrep( name=name, **self.parameters["preprocess_dict"] ) - model_preprocess.fit(input_relation, X=X) + model_preprocess.fit( + input_relation, + X=X, + return_report=True, + ) input_relation = model_preprocess.final_relation_ X = copy.deepcopy(model_preprocess.X_out_) self.preprocess_ = model_preprocess @@ -657,7 +662,12 @@ def fit( criterion_threshold=2, ) else: - best_model.fit(input_relation, X, y) + best_model.fit( + input_relation, + X, + y, + return_report=True, + ) self.best_model_ = best_model self.model_grid_ = result self.parameters["reverse"] = not reverse diff --git a/verticapy/machine_learning/vertica/cluster.py b/verticapy/machine_learning/vertica/cluster.py index 0f8e65151..bc3a911f5 100755 --- a/verticapy/machine_learning/vertica/cluster.py +++ b/verticapy/machine_learning/vertica/cluster.py @@ -2231,6 +2231,7 @@ def fit( X: Optional[SQLColumns] = None, key_columns: Optional[SQLColumns] = None, index: Optional[str] = None, + return_report: bool = False, ) -> None: """ Trains the model. @@ -3038,8 +3039,10 @@ def _attributes(self) -> list[str]: # System & Special Methods. @save_verticapy_logs - def __init__(self, name: str = None, p: int = 2) -> None: - super().__init__(name) + def __init__( + self, name: str = None, overwrite_model: bool = False, p: int = 2 + ) -> None: + super().__init__(name, overwrite_model) self.parameters = {"p": p} def drop(self) -> bool: diff --git a/verticapy/machine_learning/vertica/feature_extraction/text.py b/verticapy/machine_learning/vertica/feature_extraction/text.py index 7086c7ab6..b7dec23f6 100644 --- a/verticapy/machine_learning/vertica/feature_extraction/text.py +++ b/verticapy/machine_learning/vertica/feature_extraction/text.py @@ -370,7 +370,13 @@ def _get_filter_df(self): # Model Fitting Method. - def fit(self, input_relation: SQLRelation, index: str, x: str) -> None: + def fit( + self, + input_relation: SQLRelation, + index: str, + x: str, + return_report: bool = False, + ) -> None: """ Applies basic pre-processing. Creates table with fitted vocabulary and idf values. diff --git a/verticapy/machine_learning/vertica/neighbors.py b/verticapy/machine_learning/vertica/neighbors.py index aa2733544..154cfc7c8 100755 --- a/verticapy/machine_learning/vertica/neighbors.py +++ b/verticapy/machine_learning/vertica/neighbors.py @@ -144,8 +144,14 @@ def _attributes(self) -> list[str]: # System & Special Methods. @save_verticapy_logs - def __init__(self, name: str = None, n_neighbors: int = 5, p: int = 2) -> None: - super().__init__(name) + def __init__( + self, + name: str = None, + overwrite_model: bool = False, + n_neighbors: int = 5, + p: int = 2, + ) -> None: + super().__init__(name, overwrite_model) self.parameters = {"n_neighbors": n_neighbors, "p": p} def drop(self) -> bool: @@ -873,8 +879,14 @@ def _attributes(self) -> list[str]: # System & Special Methods. @save_verticapy_logs - def __init__(self, name: str = None, n_neighbors: int = 5, p: int = 2) -> None: - super().__init__(name) + def __init__( + self, + name: str = None, + overwrite_model: bool = False, + n_neighbors: int = 5, + p: int = 2, + ) -> None: + super().__init__(name, overwrite_model) self.parameters = {"n_neighbors": n_neighbors, "p": p} def drop(self) -> bool: @@ -1519,7 +1531,12 @@ def _density_compute( # Model Fitting Method. - def fit(self, input_relation: SQLRelation, X: Optional[SQLColumns] = None) -> None: + def fit( + self, + input_relation: SQLRelation, + X: Optional[SQLColumns] = None, + return_report: bool = False, + ) -> None: """ Trains the model. @@ -2042,6 +2059,7 @@ def fit( X: Optional[SQLColumns] = None, key_columns: Optional[SQLColumns] = None, index: Optional[str] = None, + return_report: bool = False, ) -> None: """ Trains the model. diff --git a/verticapy/machine_learning/vertica/pipeline.py b/verticapy/machine_learning/vertica/pipeline.py index cb3b26e39..e88be286d 100755 --- a/verticapy/machine_learning/vertica/pipeline.py +++ b/verticapy/machine_learning/vertica/pipeline.py @@ -173,6 +173,7 @@ def fit( X: list, y: Optional[str] = None, test_relation: SQLRelation = "", + return_report: bool = False, ) -> None: """ Trains the model. diff --git a/verticapy/machine_learning/vertica/preprocessing.py b/verticapy/machine_learning/vertica/preprocessing.py index 01d355207..85f28de48 100755 --- a/verticapy/machine_learning/vertica/preprocessing.py +++ b/verticapy/machine_learning/vertica/preprocessing.py @@ -688,7 +688,12 @@ def deploySQL(self, _return_main_table: bool = False) -> str: # Model Fitting Method. - def fit(self, input_relation: SQLRelation, X: Optional[SQLColumns] = None) -> None: + def fit( + self, + input_relation: SQLRelation, + X: Optional[SQLColumns] = None, + return_report: bool = False, + ) -> None: """ Trains the model. diff --git a/verticapy/stats/__init__.py b/verticapy/stats/__init__.py index cc0c90b48..375a1ff4f 100755 --- a/verticapy/stats/__init__.py +++ b/verticapy/stats/__init__.py @@ -130,7 +130,6 @@ ) from verticapy.machine_learning.model_selection.statistical_tests.ols import ( - endogtest, het_breuschpagan, het_goldfeldquandt, het_white, diff --git a/verticapy/tests/stats/test_stats.py b/verticapy/tests/stats/test_stats.py index 34a5c68b7..b9a1eb015 100755 --- a/verticapy/tests/stats/test_stats.py +++ b/verticapy/tests/stats/test_stats.py @@ -98,17 +98,6 @@ def test_durbin_watson(self, amazon_vd): result = st.durbin_watson(amazon_vd, eps="number", ts="date", by=["state"]) assert result == pytest.approx(0.583991056156811, 1e-2) - def test_endogtest(self, amazon_vd): - result = amazon_vd.groupby(["date"], ["AVG(number) AS number"]) - result["lag_number"] = "LAG(number) OVER (ORDER BY date)" - result = st.endogtest(result, eps="number", X=["lag_number"]) - assert result == ( - pytest.approx(110.77336789258061), - pytest.approx(6.633693190527767e-26), - pytest.approx(204.74130653722867), - pytest.approx(6.827786109983712e-34), - ) - def test_het_arch(self, amazon_vd): result = st.het_arch(amazon_vd, eps="number", ts="date", by=["state"], p=2) assert result == (