From 12ba3a99a6e2d43a9a5a08e05edba22a4f7223ee Mon Sep 17 00:00:00 2001 From: Badr Ouali <32390048+oualib@users.noreply.github.com> Date: Mon, 6 Nov 2023 12:45:31 -0500 Subject: [PATCH] Adding TIME SERIES models (AR, MA, ARMA, ARIMA) (#830) * Adding ARIMA model * Multiple support - AR, MA, ARMA - load_model - improving code quality * corrections - fi * Possibility to draw TS * correction: start parameter * corrections * corrections * corrections * Correcting plots + adding metrics * Supporting estimation of "ts" * supporting highchart * correction + plotly * Update tsa.py * Update tsa.py * Update tsa.py * Docstring for AR * Added Docstring for ARIMA * Docstring for ARMA * multiple corrections --------- Co-authored-by: umar <46414488+mail4umar@users.noreply.github.com> --- verticapy/_utils/_sql/_vertica_version.py | 4 + .../machine_learning/vertica/__init__.py | 1 + verticapy/machine_learning/vertica/base.py | 38 +- .../machine_learning/vertica/linear_model.py | 8 + .../vertica/model_management.py | 26 + verticapy/machine_learning/vertica/tsa.py | 2282 +++++++++++++++++ verticapy/plotting/_highcharts/__init__.py | 1 + .../_highcharts/machine_learning/tsa.py | 120 + verticapy/plotting/_highcharts/range.py | 1 - verticapy/plotting/_matplotlib/__init__.py | 1 + .../_matplotlib/machine_learning/tsa.py | 112 + verticapy/plotting/_plotly/__init__.py | 1 + verticapy/plotting/_plotly/line.py | 2 +- .../plotting/_plotly/machine_learning/tsa.py | 124 + verticapy/plotting/base.py | 52 + .../vertica/test_linear_model.py | 5 +- .../vertica/test_tree_model.py | 5 +- 17 files changed, 2761 insertions(+), 22 deletions(-) create mode 100755 verticapy/machine_learning/vertica/tsa.py create mode 100755 verticapy/plotting/_highcharts/machine_learning/tsa.py create mode 100755 verticapy/plotting/_matplotlib/machine_learning/tsa.py create mode 100644 verticapy/plotting/_plotly/machine_learning/tsa.py diff --git a/verticapy/_utils/_sql/_vertica_version.py b/verticapy/_utils/_sql/_vertica_version.py index 87c07c422..19d155e07 100755 --- a/verticapy/_utils/_sql/_vertica_version.py +++ b/verticapy/_utils/_sql/_vertica_version.py @@ -22,6 +22,9 @@ from verticapy.errors import VersionError MINIMUM_VERTICA_VERSION = { + "ARIMA": [23, 3, 0], + "AR": [11, 0, 0], + "ARMA": [12, 0, 3], "Balance": [8, 1, 1], "BernoulliNB": [8, 0, 0], "BisectingKMeans": [9, 3, 1], @@ -47,6 +50,7 @@ "LogisticRegression": [8, 0, 0], "KMeans": [8, 0, 0], "KPrototypes": [12, 0, 3], + "MA": [11, 0, 0], "MCA": [9, 1, 0], "MinMaxScaler": [8, 1, 0], "MultinomialNB": [8, 0, 0], diff --git a/verticapy/machine_learning/vertica/__init__.py b/verticapy/machine_learning/vertica/__init__.py index 47ca13459..7eed7ff33 100755 --- a/verticapy/machine_learning/vertica/__init__.py +++ b/verticapy/machine_learning/vertica/__init__.py @@ -68,3 +68,4 @@ DummyTreeClassifier, DummyTreeRegressor, ) +from verticapy.machine_learning.vertica.tsa import ARIMA, ARMA, AR, MA diff --git a/verticapy/machine_learning/vertica/base.py b/verticapy/machine_learning/vertica/base.py index 6a7525e44..e490b2261 100755 --- a/verticapy/machine_learning/vertica/base.py +++ b/verticapy/machine_learning/vertica/base.py @@ -783,6 +783,8 @@ def fit( str model's summary. 
""" + + # Initialization if self.overwrite_model: self.drop() else: @@ -819,7 +821,7 @@ def fit( ROW_NUMBER() OVER (ORDER BY {', '.join(X)}) AS {id_column_name}""" - tmp_view = False + tmp_view = False if isinstance(input_relation, vDataFrame) or (id_column): tmp_view = True if isinstance(input_relation, vDataFrame): @@ -830,10 +832,9 @@ def fit( relation = gen_tmp_name( schema=schema_relation(self.model_name)[0], name="view" ) - drop(relation, method="view") _executeSQL( query=f""" - CREATE VIEW {relation} AS + CREATE OR REPLACE VIEW {relation} AS SELECT /*+LABEL('learn.VerticaModel.fit')*/ *{id_column} @@ -849,6 +850,7 @@ def fit( self.test_relation = test_relation else: self.test_relation = self.input_relation + # Fitting if self._is_native: parameters = self._get_vertica_param_dict() if ( @@ -917,15 +919,6 @@ def _attributes(self) -> list: def __init__(self) -> None: """Must be overridden in the child class""" self.features_importance_trees_ = {} - return None - # self.input_relation = None - # self.test_relation = None - # self.X = None - # self.y = None - # self.parameters = {} - # self.classes_ = None - # for att in self._attributes: - # setattr(self, att, None) def _compute_trees_arrays( self, tree: TableSample, X: list, return_probability: bool = False @@ -1265,6 +1258,10 @@ def plot_tree( class BinaryClassifier(Supervised): + """ + Base Class for Vertica Binary Classifier. + """ + # Properties. @property @@ -1805,13 +1802,16 @@ def roc_curve( class MulticlassClassifier(Supervised): + """ + Base Class for Vertica Multiclass Classifiers. + """ + # System & Special Methods. @abstractmethod def __init__(self, name: str, overwrite_model: bool = False) -> None: """Must be overridden in the child class""" super().__init__(name, overwrite_model) - # self.classes_ = None def _check_pos_label(self, pos_label: PythonScalar) -> PythonScalar: """ @@ -2645,6 +2645,10 @@ def roc_curve( class Regressor(Supervised): + """ + Base Class for Vertica Regressors. + """ + # System & Special Methods. @abstractmethod @@ -2845,7 +2849,7 @@ def predict( Returns ------- vDataFrame - the input object. + the input object. """ if hasattr(self, "_predict"): return self._predict(vdf=vdf, X=X, name=name, inplace=inplace) @@ -2904,6 +2908,8 @@ def fit( str model's summary. """ + + # Initialization if self.overwrite_model: self.drop() else: @@ -2938,10 +2944,9 @@ def fit( relation = gen_tmp_name( schema=schema_relation(self.model_name)[0], name="view" ) - drop(relation, method="view") _executeSQL( query=f""" - CREATE VIEW {relation} AS + CREATE OR REPLACE VIEW {relation} AS SELECT /*+LABEL('learn.VerticaModel.fit')*/ * {id_column} @@ -2962,6 +2967,7 @@ def fit( parameters = self._get_vertica_param_dict() if "num_components" in parameters and not parameters["num_components"]: del parameters["num_components"] + # Fitting fun = self._vertica_fit_sql if self._model_type != "MCA" else "PCA" query = f""" SELECT diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index 3700df290..1814001da 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -43,6 +43,10 @@ class LinearModel: + """ + Base Class for Vertica Linear Models. + """ + # Properties. @property @@ -188,6 +192,10 @@ def plot( class LinearModelClassifier(LinearModel): + """ + Base Class for Vertica Linear Models Classifiers. + """ + # Properties. 
@property diff --git a/verticapy/machine_learning/vertica/model_management.py b/verticapy/machine_learning/vertica/model_management.py index 3ef81a99a..8cfd14b37 100755 --- a/verticapy/machine_learning/vertica/model_management.py +++ b/verticapy/machine_learning/vertica/model_management.py @@ -38,11 +38,13 @@ Lasso, LinearRegression, LogisticRegression, + PoissonRegressor, Ridge, ) from verticapy.machine_learning.vertica.naive_bayes import NaiveBayes from verticapy.machine_learning.vertica.preprocessing import Scaler, OneHotEncoder from verticapy.machine_learning.vertica.svm import LinearSVC, LinearSVR +from verticapy.machine_learning.vertica.tsa import ARIMA, AR, MA @save_verticapy_logs @@ -139,6 +141,9 @@ def load_model( info = info[0] info = eval("[" + info + "]") lookup_table = { + "arima": ARIMA, + "autoregressor": AR, + "moving_average": MA, "rf_regressor": RandomForestRegressor, "rf_classifier": RandomForestClassifier, "iforest": IsolationForest, @@ -149,6 +154,7 @@ def load_model( "svm_regressor": LinearSVR, "svm_classifier": LinearSVC, "linear_reg": LinearRegression, + "poisson_reg": PoissonRegressor, "kmeans": KMeans, "kprototypes": KPrototypes, "bisecting_kmeans": BisectingKMeans, @@ -158,9 +164,20 @@ def load_model( } model = lookup_table[model_type](name) if model_type != "svd": + # Variables used in the CALL STRING true, false = True, False squarederror = "squarederror" crossentropy = "crossentropy" + ols = "ols" + hr = "hr" + linear_interpolation = "linear_interpolation" + zero = "zero" + error = "error" + drop = "drop" + if "method=yule-walker," in parameters: + parameters = parameters.replace( + "method=yule-walker,", "method='yule-walker'," + ) if " lambda=" in parameters: parameters = parameters.replace(" lambda=", " C=") try: @@ -186,6 +203,15 @@ def load_model( model.y = info[2] model.X = eval("[" + info[3] + "]") model.test_relation = test_relation if (test_relation) else model.input_relation + elif model._model_category == "TIMESERIES": + model.y = info[2] + model.ts = info[3] + model.test_relation = test_relation if (test_relation) else model.input_relation + if model._model_type == "ARIMA": + p = int(model.get_vertica_attributes("p")["p"][0]) + d = int(model.get_vertica_attributes("d")["d"][0]) + q = int(model.get_vertica_attributes("q")["q"][0]) + model.set_params({"order": (p, d, q)}) else: model.X = eval("[" + info[2] + "]") model._compute_attributes() diff --git a/verticapy/machine_learning/vertica/tsa.py b/verticapy/machine_learning/vertica/tsa.py new file mode 100755 index 000000000..38857af78 --- /dev/null +++ b/verticapy/machine_learning/vertica/tsa.py @@ -0,0 +1,2282 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. + +You may obtain a copy of the License at: +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in +writing, software distributed under the License is +distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing +permissions and limitations under the License. 
+""" +from abc import abstractmethod +import copy +from typing import Literal, Optional, Union + +import numpy as np + +from verticapy._typing import ( + PlottingObject, + PythonNumber, + NoneType, + SQLRelation, +) +from verticapy._utils._gen import gen_name, gen_tmp_name +from verticapy._utils._sql._collect import save_verticapy_logs +from verticapy._utils._sql._format import ( + clean_query, + quote_ident, + schema_relation, +) +from verticapy._utils._sql._sys import _executeSQL +from verticapy._utils._sql._vertica_version import check_minimum_version + +from verticapy.core.vdataframe.base import TableSample, vDataFrame + +import verticapy.machine_learning.metrics as mt +from verticapy.machine_learning.vertica.base import VerticaModel + +from verticapy.sql.drop import drop + +""" +General Classes. +""" + + +class TimeSeriesModelBase(VerticaModel): + """ + Base Class for Vertica Time Series Models. + """ + + # Properties. + + @property + def _model_category(self) -> Literal["TIMESERIES"]: + return "TIMESERIES" + + @property + def _attributes(self) -> list[str]: + common_params = [ + "mse_", + "n_", + ] + if self._model_type == "ARIMA": + return [ + "phi_", + "theta_", + "mean_", + ] + common_params + elif self._model_type == "AR": + return [ + "phi_", + "intercept_", + "features_importance_", + ] + common_params + else: + return [ + "theta_", + "mu_", + "mean_", + ] + common_params + + # Attributes Methods. + + def _compute_attributes(self) -> None: + """ + Computes the model's attributes. + """ + coefficients = self.get_vertica_attributes("coefficients") + i = 1 + if "p" in self.parameters: + p = self.parameters["p"] + self.intercept_ = coefficients["value"][0] + else: + self.mean_ = self.get_vertica_attributes("mean")["mean"][0] + if "order" in self.parameters: + p = self.parameters["order"][0] + i = 0 + else: + p = 0 + self.mu_ = coefficients["value"][0] + self.phi_ = np.array(coefficients["value"][i : p + i]) + self.theta_ = np.array(coefficients["value"][p + i :]) + try: + self.mse_ = self.get_vertica_attributes("mean_squared_error")[ + "mean_squared_error" + ][0] + except: + self.mse_ = None + self.n_ = self.get_vertica_attributes("accepted_row_count")[ + "accepted_row_count" + ][0] + + # System & Special Methods. + + @abstractmethod + def __init__(self, name: str, overwrite_model: bool = False) -> None: + """Must be overridden in the child class""" + super().__init__(name, overwrite_model) + + # Model Fitting Method. + + def fit( + self, + input_relation: SQLRelation, + ts: str, + y: str, + test_relation: SQLRelation = "", + return_report: bool = False, + ) -> Optional[str]: + """ + Trains the model. + + Parameters + ---------- + input_relation: SQLRelation + Training relation. + ts: str + TS (Time Series) vDataColumn used to order + the data. The vDataColumn type must be date + (date, datetime, timestamp...) or numerical. + y: str + Response column. + test_relation: SQLRelation, optional + Relation used to test the model. + return_report: bool, optional + [For native models] + When set to True, the model summary + will be returned. Otherwise, it will + be printed. + + Returns + ------- + str + model's summary. 
+ """ + + # Initialization + if self.overwrite_model: + self.drop() + else: + self._is_already_stored(raise_error=True) + self.ts = quote_ident(ts) + self.y = quote_ident(y) + tmp_view = False + if isinstance(input_relation, vDataFrame) and self._is_native: + tmp_view = True + if isinstance(input_relation, vDataFrame): + self.input_relation = input_relation.current_relation() + else: + self.input_relation = input_relation + relation = gen_tmp_name( + schema=schema_relation(self.model_name)[0], name="view" + ) + _executeSQL( + query=f""" + CREATE OR REPLACE VIEW {relation} AS + SELECT + /*+LABEL('learn.VerticaModel.fit')*/ + {self.ts}, {self.y} + FROM {self.input_relation}""", + title="Creating a temporary view to fit the model.", + ) + else: + self.input_relation = input_relation + relation = input_relation + if isinstance(test_relation, vDataFrame): + self.test_relation = test_relation.current_relation() + elif test_relation: + self.test_relation = test_relation + else: + self.test_relation = self.input_relation + # Fitting + if self._is_native: + parameters = self._get_vertica_param_dict() + if "order" in parameters: + parameters["p"] = parameters["order"][0] + parameters["q"] = parameters["order"][-1] + if len(parameters["order"]) == 3: + parameters["d"] = parameters["order"][1] + del parameters["order"] + query = f""" + SELECT + /*+LABEL('learn.VerticaModel.fit')*/ + {self._vertica_fit_sql} + ('{self.model_name}', + '{relation}', + '{self.y}', + '{self.ts}' + USING PARAMETERS + {', '.join([f"{p} = {parameters[p]}" for p in parameters])})""" + try: + _executeSQL(query, title="Fitting the model.") + finally: + if tmp_view: + drop(relation, method="view") + self._compute_attributes() + if self._is_native: + report = self.summarize() + if return_report: + return report + print(report) + return None + + # I/O Methods. + + def deploySQL( + self, + ts: Optional[str] = None, + y: Optional[str] = None, + start: Optional[int] = None, + npredictions: int = 10, + output_standard_errors: bool = False, + output_index: bool = False, + ) -> str: + """ + Returns the SQL code needed to deploy the model. + + Parameters + ---------- + ts: str, optional + TS (Time Series) vDataColumn used to order + the data. The vDataColumn type must be date + (date, datetime, timestamp...) or numerical. + y: str, optional + Response column. + start: int, optional + The behavior of the start parameter and its + range of accepted values depends on whether + you provide a timeseries-column (ts): + + - No provided timeseries-column: + start must be an integer greater or equal + to 0, where zero indicates to start prediction + at the end of the in-sample data. If start is a + positive value, the function predicts the values + between the end of the in-sample data and the + start index, and then uses the predicted values + as time series inputs for the subsequent + npredictions. + - timeseries-column provided: + start must be an integer greater or equal to 1 + and identifies the index (row) of the timeseries + -column at which to begin prediction. If the start + index is greater than the number of rows, N, in the + input data, the function predicts the values between + N and start and uses the predicted values as time + series inputs for the subsequent npredictions. + + Default: + + - No provided timeseries-column: + prediction begins from the end of the in-sample + data. + - timeseries-column provided: + prediction begins from the end of the provided + input data. 
+ npredictions: int, optional + Integer greater or equal to 1, the number of predicted + timesteps. + output_standard_errors: bool, optional + Boolean, whether to return estimates of the standard + error of each prediction. + output_index: bool, optional + Boolean, whether to return the index of each position. + + Returns + ------- + str + the SQL code needed to deploy the model. + """ + if self._vertica_predict_sql: + # Initialization + if isinstance(ts, NoneType): + ts = "" + else: + ts = "ORDER BY " + quote_ident(ts) + if isinstance(y, NoneType): + y = "" + else: + y = quote_ident(y) + if isinstance(start, NoneType): + start = "" + else: + start = f"start = {start}," + if output_standard_errors or output_index: + output_standard_errors = f", output_standard_errors = true" + else: + output_standard_errors = "" + # Deployment + sql = f""" + {self._vertica_predict_sql}({y} + USING PARAMETERS + model_name = '{self.model_name}', + add_mean = True, + {start} + npredictions = {npredictions} + {output_standard_errors}) + OVER ({ts})""" + return clean_query(sql) + else: + raise AttributeError( + f"Method 'deploySQL' does not exist for {self._model_type} models." + ) + + # Features Importance Methods. + + def _compute_features_importance(self) -> None: + """ + Computes the features importance. + """ + if self._model_type == "MA" or ( + self._model_type in ("ARMA", "ARIMA") and self.get_params()["order"][0] == 0 + ): + raise AttributeError( + "Features Importance can not be computed for Moving Averages." + ) + else: + self.features_importance_ = self.phi_ / sum(abs(self.phi_)) + + def _get_features_importance(self) -> np.ndarray: + """ + Returns the features' importance. + """ + if not hasattr(self, "features_importance_"): + self._compute_features_importance() + return copy.deepcopy(self.features_importance_) + + def features_importance( + self, show: bool = True, chart: Optional[PlottingObject] = None, **style_kwargs + ) -> PlottingObject: + """ + Computes the model's features importance. + + Parameters + ---------- + show: bool + If set to True, draw the feature's importance. + chart: PlottingObject, optional + The chart object to plot on. + **style_kwargs + Any optional parameter to pass to the Plotting + functions. + + Returns + ------- + obj + features importance. + """ + fi = self._get_features_importance() + columns = [copy.deepcopy(self.y) + f"[t-{i + 1}]" for i in range(len(fi))] + if show: + data = { + "importance": fi, + } + layout = {"columns": columns} + vpy_plt, kwargs = self.get_plotting_lib( + class_name="ImportanceBarChart", + chart=chart, + style_kwargs=style_kwargs, + ) + return vpy_plt.ImportanceBarChart(data=data, layout=layout).draw(**kwargs) + importances = { + "index": [quote_ident(x)[1:-1].lower() for x in columns], + "importance": list(abs(fi)), + "sign": list(np.sign(fi)), + } + return TableSample(values=importances).sort(column="importance", desc=True) + + # Prediction / Transformation Methods. + + def predict( + self, + vdf: Optional[SQLRelation] = None, + ts: Optional[str] = None, + y: Optional[str] = None, + start: Optional[int] = None, + npredictions: int = 10, + output_standard_errors: bool = False, + output_index: bool = False, + output_estimated_ts: bool = False, + ) -> vDataFrame: + """ + Predicts using the input relation. + + Parameters + ---------- + vdf: SQLRelation + Object used to run the prediction. You can + also specify a customized relation, but you + must enclose it with an alias. 
For example, + "(SELECT 1) x" is valid, whereas "(SELECT 1)" + and "SELECT 1" are invalid. + ts: str, optional + TS (Time Series) vDataColumn used to order + the data. The vDataColumn type must be date + (date, datetime, timestamp...) or numerical. + y: str, optional + Response column. + start: int, optional + The behavior of the start parameter and its + range of accepted values depends on whether + you provide a timeseries-column (ts): + + - No provided timeseries-column: + start must be an integer greater or equal + to 0, where zero indicates to start prediction + at the end of the in-sample data. If start is a + positive value, the function predicts the values + between the end of the in-sample data and the + start index, and then uses the predicted values + as time series inputs for the subsequent + npredictions. + - timeseries-column provided: + start must be an integer greater or equal to 1 + and identifies the index (row) of the timeseries + -column at which to begin prediction. If the start + index is greater than the number of rows, N, in the + input data, the function predicts the values between + N and start and uses the predicted values as time + series inputs for the subsequent npredictions. + + Default: + + - No provided timeseries-column: + prediction begins from the end of the in-sample + data. + - timeseries-column provided: + prediction begins from the end of the provided + input data. + npredictions: int, optional + Integer greater or equal to 1, the number of predicted + timesteps. + output_standard_errors: bool, optional + Boolean, whether to return estimates of the standard + error of each prediction. + output_index: bool, optional + Boolean, whether to return the index of each prediction. + output_estimated_ts: bool, optional + Boolean, whether to return the estimated abscissa of + each prediction. The real one is hard to obtain due to + interval computations. + + Returns + ------- + vDataFrame + a new object. 
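+
+        Examples
+        --------
+        Two typical calls, with illustrative relation and column
+        names:
+
+        .. code-block:: python
+
+            # Forecast 10 steps past the end of the in-sample data.
+            model.predict()
+
+            # Forecast 20 steps starting at index 40 of ``vdf``
+            # (``vdf`` and the column names are placeholders).
+            model.predict(vdf, ts = "date", y = "passengers",
+                          start = 40, npredictions = 20)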
+ """ + ar_ma = False + if self._model_type in ( + "AR", + "MA", + ): + if isinstance(vdf, NoneType): + vdf = self.input_relation + if isinstance(ts, NoneType): + ts = self.ts + if isinstance(y, NoneType): + y = self.y + ar_ma = True + sql = "SELECT " + self.deploySQL( + ts=ts, + y=y, + start=start, + npredictions=npredictions, + output_standard_errors=( + output_standard_errors or output_index or output_estimated_ts + ), + output_index=output_index, + ) + no_relation = True + if not (isinstance(vdf, NoneType)): + sql += f" FROM {vdf}" + no_relation = False + if output_index or output_estimated_ts: + j = self.n_ + if no_relation: + if not (isinstance(start, NoneType)): + j = j + start + elif not (isinstance(start, NoneType)): + j = start + if (output_standard_errors or output_estimated_ts) and not (ar_ma): + if not (output_standard_errors): + stde_out = "" + else: + stde_out = ", std_err" + output_standard_errors = ", std_err" + else: + output_standard_errors = "" + stde_out = "" + if ar_ma: + order_by = "" + else: + order_by = 'ORDER BY "std_err"' + sql = f""" + SELECT + ROW_NUMBER() OVER ({order_by}) + {j} - 1 AS idx, + prediction{output_standard_errors} + FROM ({sql}) VERTICAPY_SUBTABLE""" + if output_estimated_ts: + min_value = f"(SELECT MIN({self.ts}) FROM {self.input_relation})" + delta = f""" + (SELECT + AVG(delta) + FROM (SELECT + {self.ts} - LAG({self.ts}) OVER (ORDER BY {self.ts}) AS delta + FROM {self.input_relation}) VERTICAPY_SUBTABLE)""" + sql = f""" + SELECT + idx * {delta} + {min_value} AS {self.ts}, + prediction{stde_out} + FROM ({sql}) VERTICAPY_SUBTABLE""" + return vDataFrame(clean_query(sql)) + + # Model Evaluation Methods. + + def _evaluation_relation( + self, + start: Optional[int] = None, + npredictions: Optional[int] = None, + ): + """ + Returns the relation needed to evaluate the + model. + """ + if hasattr(self, "test_relation"): + test_relation = self.test_relation + elif hasattr(self, "input_relation"): + test_relation = self.input_relation + else: + raise AttributeError( + "No attributes found. The model is probably not yet fitted." + ) + parameters = self.get_params() + if isinstance(start, NoneType): + start = self.n_ / 4 + if isinstance(npredictions, NoneType): + npredictions = self.n_ - start + prediction = self.predict( + vdf=test_relation, + ts=self.ts, + y=self.y, + start=start, + npredictions=npredictions, + output_index=True, + ) + sql = f""" + (SELECT + true_values.y_true, + prediction_relation.prediction AS y_pred + FROM + ( + SELECT + ROW_NUMBER() OVER (ORDER BY {self.ts}) AS idx, + {self.y} AS y_true + FROM {test_relation} + ) AS true_values + NATURAL JOIN + (SELECT * FROM {prediction}) AS prediction_relation) VERTICAPY_SUBTABLE""" + return clean_query(sql) + + def regression_report( + self, + metrics: Union[ + str, + Literal[None, "anova", "details"], + list[Literal[tuple(mt.FUNCTIONS_REGRESSION_DICTIONNARY)]], + ] = None, + start: Optional[int] = None, + npredictions: Optional[int] = None, + ) -> Union[float, TableSample]: + """ + Computes a regression report using multiple metrics to + evaluate the model (r2, mse, max error...). + + Parameters + ---------- + metrics: str, optional + The metrics used to compute the regression report. + None : Computes the model different metrics. + anova : Computes the model ANOVA table. + details : Computes the model details. 
+ You can also provide a list of different metrics, + including the following: + aic : Akaike’s Information Criterion + bic : Bayesian Information Criterion + max : Max Error + mae : Mean Absolute Error + median : Median Absolute Error + mse : Mean Squared Error + msle : Mean Squared Log Error + qe : quantile error, the quantile must be + included in the name. Example: + qe50.1% will return the quantile error + using q=0.501. + r2 : R squared coefficient + r2a : R2 adjusted + rmse : Root Mean Squared Error + var : Explained Variance + start: int, optional + The behavior of the start parameter and its + range of accepted values depends on whether + you provide a timeseries-column (ts): + + - No provided timeseries-column: + start must be an integer greater or equal + to 0, where zero indicates to start prediction + at the end of the in-sample data. If start is a + positive value, the function predicts the values + between the end of the in-sample data and the + start index, and then uses the predicted values + as time series inputs for the subsequent + npredictions. + - timeseries-column provided: + start must be an integer greater or equal to 1 + and identifies the index (row) of the timeseries + -column at which to begin prediction. If the start + index is greater than the number of rows, N, in the + input data, the function predicts the values between + N and start and uses the predicted values as time + series inputs for the subsequent npredictions. + + Default: + + - No provided timeseries-column: + prediction begins from the end of the in-sample + data. + - timeseries-column provided: + prediction begins from the end of the provided + input data. + npredictions: int, optional + Integer greater or equal to 1, the number of predicted + timesteps. + + Returns + ------- + TableSample + report. + """ + return mt.regression_report( + "y_true", + "y_pred", + self._evaluation_relation(start=start, npredictions=npredictions), + metrics=metrics, + k=1, + ) + + report = regression_report + + def score( + self, + metric: Literal[ + tuple(mt.FUNCTIONS_REGRESSION_DICTIONNARY) + + ("r2a", "r2_adj", "rsquared_adj", "r2adj", "r2adjusted", "rmse") + ] = "r2", + start: Optional[int] = None, + npredictions: Optional[int] = None, + ) -> float: + """ + Computes the model score. + + Parameters + ---------- + metric: str, optional + The metric used to compute the score. + aic : Akaike’s Information Criterion + bic : Bayesian Information Criterion + max : Max Error + mae : Mean Absolute Error + median : Median Absolute Error + mse : Mean Squared Error + msle : Mean Squared Log Error + r2 : R squared coefficient + r2a : R2 adjusted + rmse : Root Mean Squared Error + var : Explained Variance + start: int, optional + The behavior of the start parameter and its + range of accepted values depends on whether + you provide a timeseries-column (ts): + + - No provided timeseries-column: + start must be an integer greater or equal + to 0, where zero indicates to start prediction + at the end of the in-sample data. If start is a + positive value, the function predicts the values + between the end of the in-sample data and the + start index, and then uses the predicted values + as time series inputs for the subsequent + npredictions. + - timeseries-column provided: + start must be an integer greater or equal to 1 + and identifies the index (row) of the timeseries + -column at which to begin prediction. 
If the start + index is greater than the number of rows, N, in the + input data, the function predicts the values between + N and start and uses the predicted values as time + series inputs for the subsequent npredictions. + + Default: + + - No provided timeseries-column: + prediction begins from the end of the in-sample + data. + - timeseries-column provided: + prediction begins from the end of the provided + input data. + npredictions: int, optional + Integer greater or equal to 1, the number of predicted + timesteps. + + Returns + ------- + float + score. + """ + # Initialization + metric = str(metric).lower() + if metric in ["r2adj", "r2adjusted"]: + metric = "r2a" + adj, root = False, False + if metric in ("r2a", "r2adj", "r2adjusted", "r2_adj", "rsquared_adj"): + metric, adj = "r2", True + elif metric == "rmse": + metric, root = "mse", True + fun = mt.FUNCTIONS_REGRESSION_DICTIONNARY[metric] + + # Scoring + arg = [ + "y_true", + "y_pred", + self._evaluation_relation(start=start, npredictions=npredictions), + ] + if metric in ("aic", "bic") or adj: + arg += [1] + if root or adj: + arg += [True] + return fun(*arg) + + # Plotting Methods. + + def plot( + self, + vdf: Optional[SQLRelation] = None, + ts: Optional[str] = None, + y: Optional[str] = None, + start: Optional[int] = None, + npredictions: int = 10, + chart: Optional[PlottingObject] = None, + **style_kwargs, + ) -> PlottingObject: + """ + Draws the model. + + Parameters + ---------- + vdf: SQLRelation + Object used to run the prediction. You can + also specify a customized relation, but you + must enclose it with an alias. For example, + "(SELECT 1) x" is valid, whereas "(SELECT 1)" + and "SELECT 1" are invalid. + ts: str, optional + TS (Time Series) vDataColumn used to order + the data. The vDataColumn type must be date + (date, datetime, timestamp...) or numerical. + y: str, optional + Response column. + start: int, optional + The behavior of the start parameter and its + range of accepted values depends on whether + you provide a timeseries-column (ts): + + - No provided timeseries-column: + start must be an integer greater or equal + to 0, where zero indicates to start prediction + at the end of the in-sample data. If start is a + positive value, the function predicts the values + between the end of the in-sample data and the + start index, and then uses the predicted values + as time series inputs for the subsequent + npredictions. + - timeseries-column provided: + start must be an integer greater or equal to 1 + and identifies the index (row) of the timeseries + -column at which to begin prediction. If the start + index is greater than the number of rows, N, in the + input data, the function predicts the values between + N and start and uses the predicted values as time + series inputs for the subsequent npredictions. + + Default: + + - No provided timeseries-column: + prediction begins from the end of the in-sample + data. + - timeseries-column provided: + prediction begins from the end of the provided + input data. + npredictions: int, optional + Integer greater or equal to 1, the number of predicted + timesteps. + chart: PlottingObject, optional + The chart object to plot on. + **style_kwargs + Any optional parameter to pass to the + Plotting functions. + + Returns + ------- + object + Plotting Object. 
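+
+        Examples
+        --------
+        A minimal sketch, with illustrative column names:
+
+        .. code-block:: python
+
+            # Plot the data with 80 forecast steps starting at index 120.
+            model.plot(vdf, ts = "date", y = "passengers",
+                       start = 120, npredictions = 80)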
+ """ + dataset_provided = True + if isinstance(vdf, NoneType): + dataset_provided = False + vpy_plt, kwargs = self.get_plotting_lib( + class_name="TSPlot", + chart=chart, + style_kwargs=style_kwargs, + ) + return vpy_plt.TSPlot( + vdf=vDataFrame(self.input_relation), + columns=self.y, + order_by=self.ts, + prediction=self.predict( + vdf=vdf, + ts=ts, + y=y, + start=start, + npredictions=npredictions, + output_standard_errors=True, + ), + start=start, + dataset_provided=dataset_provided, + ).draw(**kwargs) + + +class ARIMA(TimeSeriesModelBase): + """ + Creates a inDB ARIMA model. + + .. versionadded:: 23.3.0 + + Parameters + ---------- + name: str, optional + Name of the model. The model is stored in the + database. + overwrite_model: bool, optional + If set to True, training a model with the same + name as an existing model overwrites the + existing model. + order: tuple, optional + The (p,d,q) order of the model for the autoregressive, + differences, and moving average components. + tol: float, optional + Determines whether the algorithm has reached + the specified accuracy result. + max_iter: int, optional + Determines the maximum number of iterations + the algorithm performs before achieving the + specified accuracy result. + init: str, optional + Initialization method, one of the following: + + - 'zero': + Coefficients are initialized to zero. + - 'hr': + Coefficients are initialized using the + Hannan-Rissanen algorithm. + + missing: str, optional + Method for handling missing values, one of the + following strings: + + - 'drop': + Missing values are ignored. + - 'raise': + Missing values raise an error. + - 'zero': + Missing values are set to zero. + - 'linear_interpolation': + Missing values are replaced by a linearly + interpolated value based on the nearest + valid entries before and after the missing + value. In cases where the first or last + values in a dataset are missing, the function + errors. + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples `_ + section on the website. + + Initialization + ^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. ipython:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk + of code collisions with other libraries. This precaution is + necessary because verticapy uses commonly known function names + like "average" and "median", which can potentially lead to naming + conflicts. The use of an alias ensures that the functions from + verticapy are used as intended without interfering with functions + from other libraries. + + For this example, we will use the airline passengers dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_airline_passengers() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_airline_passengers.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + .. 
ipython:: python + :suppress: + + import verticapy.datasets as vpd + data = vpd.load_airline_passengers() + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``ARIMA`` model: + + .. ipython:: python + + from verticapy.machine_learning.vertica.tsa import ARIMA + + Then we can create the model: + + .. ipython:: python + :okwarning: + + model = ARIMA(order = (12, 2, 2)) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + :okwarning: + + model.fit(data, "date", "passengers") + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + .. ipython:: python + :okwarning: + + model.features_importance() + + .. ipython:: python + :suppress: + :okwarning: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_features.html") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_features.html + + .. important:: + + Feature importance is determined by using the coefficients of the + auto-regressive (AR) process and normalizing them. This method + tends to be precise when your time series primarily consists of an + auto-regressive component. However, its accuracy may be a topic of + discussion if the time series contains other components as well. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. code-block:: python + + model.report() + + .. ipython:: python + :suppress: + :okwarning: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_report.html + + You can also choose the number of predictions and where to start the forecast. + For example, the following code will allow you to generate a report with 30 + predictions, starting the forecasting process at index 40. + + .. code-block:: python + + model.report(start = 40, npredictions = 30) + + .. ipython:: python + :suppress: + :okwarning: + + result = model.report(start = 40, npredictions = 30) + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_report_pred_2.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_report_pred_2.html + + .. important:: + + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["mse", "r2"])``. 
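+
+    For instance, the following computes only the mean squared
+    error and the R-squared coefficient:
+
+    .. code-block:: python
+
+        model.report(metrics = ["mse", "r2"])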
+
+    You can utilize the
+    :py:mod:`verticapy.machine_learning.vertica.tsa.ARIMA.score`
+    function to calculate various regression metrics, with the
+    R-squared coefficient (``r2``) being the default.
+
+    .. ipython:: python
+        :okwarning:
+
+        model.score()
+
+    As with the report, you can choose where to start the forecast
+    and the number of predictions to use.
+
+    .. ipython:: python
+        :okwarning:
+
+        model.score(start = 40, npredictions = 30)
+
+    .. important::
+
+        If you do not specify a starting point and the number of
+        predictions, the forecast will begin at one-fourth of the
+        dataset, which can result in an inaccurate score, especially
+        for large datasets. It's important to choose these parameters
+        carefully.
+
+    Prediction
+    ^^^^^^^^^^^
+
+    Prediction is straightforward:
+
+    .. code-block:: python
+
+        model.predict()
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.predict()
+        html_file = open("figures/machine_learning_vertica_tsa_arima_prediction.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_prediction.html
+
+    .. hint::
+
+        You can control the number of prediction steps by changing
+        the ``npredictions`` parameter:
+        ``model.predict(npredictions = 30)``.
+
+    .. note::
+
+        Predictions can be made automatically by using the training set,
+        in which case you don't need to specify the predictors. Alternatively, you
+        can pass only the ``vDataFrame`` to the
+        :py:mod:`verticapy.machine_learning.vertica.tsa.ARIMA.predict`
+        function, but in this case, it's essential that the column names of
+        the ``vDataFrame`` match the predictors and response name in the
+        model.
+
+    If you would like to have the time stamps (``ts``) in the output,
+    you can switch on the ``output_estimated_ts`` parameter. If you
+    would also like to see the standard error, switch on the
+    ``output_standard_errors`` parameter as well:
+
+    .. code-block:: python
+
+        model.predict(output_estimated_ts = True, output_standard_errors = True)
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.predict(output_estimated_ts = True, output_standard_errors = True)
+        html_file = open("figures/machine_learning_vertica_tsa_arima_prediction_2.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_prediction_2.html
+
+    .. important::
+
+        The ``output_estimated_ts`` parameter provides an estimation of
+        'ts' assuming that 'ts' is regularly spaced.
+
+    If you don't provide any input, the function will begin forecasting
+    after the last known value. If you want to forecast starting from a
+    specific value within the input dataset or another dataset, you can
+    use the following syntax.
+
+    .. code-block:: python
+
+        model.predict(
+            data,
+            "date",
+            "passengers",
+            start = 40,
+            npredictions = 20,
+            output_estimated_ts = True,
+            output_standard_errors = True,
+        )
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.predict(data, "date", "passengers", start = 40, npredictions = 20, output_estimated_ts = True, output_standard_errors = True)
+        html_file = open("figures/machine_learning_vertica_tsa_arima_prediction_3.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_prediction_3.html
+
+    Plots
+    ^^^^^^
+
+    We can conveniently plot the predictions on a line plot
+    to observe the efficacy of our model:
+
+    .. code-block:: python
+
+        model.plot(data, "date", "passengers", npredictions = 80, start = 120)
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.plot(data, "date", "passengers", npredictions = 80, start = 120, width = 650)
+        fig.write_html("figures/machine_learning_vertica_tsa_arima_plot_1.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arima_plot_1.html
+
+    .. note::
+
+        You can control the number of prediction steps by changing
+        the ``npredictions`` parameter:
+        ``model.plot(npredictions = 30)``.
+
+    Please refer to :ref:`chart_gallery.tsa` for more examples.
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    In order to register the model for tracking and versioning:
+
+    .. code-block:: python
+
+        model.register("model_v1")
+
+    Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html`
+    for more details on model tracking and versioning.
+    """
+
+    # Properties.
+
+    @property
+    def _vertica_fit_sql(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    @property
+    def _vertica_predict_sql(self) -> Literal["PREDICT_ARIMA"]:
+        return "PREDICT_ARIMA"
+
+    @property
+    def _model_subcategory(self) -> Literal["TIMESERIES"]:
+        return "TIMESERIES"
+
+    @property
+    def _model_type(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    # System & Special Methods.
+
+    @check_minimum_version
+    @save_verticapy_logs
+    def __init__(
+        self,
+        name: str = None,
+        overwrite_model: bool = False,
+        order: Union[tuple[int], list[int]] = (0, 0, 0),
+        tol: float = 1e-6,
+        max_iter: int = 100,
+        init: Literal["zero", "hr"] = "zero",
+        missing: Literal[
+            "drop", "raise", "zero", "linear_interpolation"
+        ] = "linear_interpolation",
+    ) -> None:
+        super().__init__(name, overwrite_model)
+        if not isinstance(order, (tuple, list)) or len(order) != 3:
+            raise ValueError(
+                "Parameter 'order' must be a tuple or a list of 3 elements."
+            )
+        for x in order:
+            if not isinstance(x, int):
+                raise ValueError(
+                    "Parameter 'order' must be a tuple or a list of integers."
+                )
+        self.parameters = {
+            "order": order,
+            "tol": tol,
+            "max_iter": max_iter,
+            "init": str(init).lower(),
+            "missing": str(missing).lower(),
+        }
+
+
+class ARMA(TimeSeriesModelBase):
+    """
+    Creates an in-DB ARMA model.
+
+    .. versionadded:: 12.0.3
+
+    Parameters
+    ----------
+    name: str, optional
+        Name of the model. The model is stored in the
+        database.
+    overwrite_model: bool, optional
+        If set to True, training a model with the same
+        name as an existing model overwrites the
+        existing model.
+    order: tuple, optional
+        The (p,q) order of the model for the autoregressive
+        and moving average components.
+    tol: float, optional
+        Determines whether the algorithm has reached
+        the specified accuracy result.
+    max_iter: int, optional
+        Determines the maximum number of iterations
+        the algorithm performs before achieving the
+        specified accuracy result.
+    init: str, optional
+        Initialization method, one of the following:
+
+        - 'zero':
+            Coefficients are initialized to zero.
+        - 'hr':
+            Coefficients are initialized using the
+            Hannan-Rissanen algorithm.
+
+    missing: str, optional
+        Method for handling missing values, one of the
+        following strings:
+
+        - 'drop':
+            Missing values are ignored.
+        - 'raise':
+            Missing values raise an error.
+ - 'zero': + Missing values are set to zero. + - 'linear_interpolation': + Missing values are replaced by a linearly + interpolated value based on the nearest + valid entries before and after the missing + value. In cases where the first or last + values in a dataset are missing, the function + errors. + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples `_ + section on the website. + + Initialization + ^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. ipython:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk + of code collisions with other libraries. This precaution is + necessary because verticapy uses commonly known function names + like "average" and "median", which can potentially lead to naming + conflicts. The use of an alias ensures that the functions from + verticapy are used as intended without interfering with functions + from other libraries. + + For this example, we will use the airline passengers dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_airline_passengers() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_airline_passengers.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + .. ipython:: python + :suppress: + + import verticapy.datasets as vpd + data = vpd.load_airline_passengers() + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``ARMA`` model: + + .. ipython:: python + + from verticapy.machine_learning.vertica.tsa import ARMA + + Then we can create the model: + + .. ipython:: python + :okwarning: + + model = ARMA(order = (12, 2)) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + :okwarning: + + model.fit(data, "date", "passengers") + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + .. ipython:: python + :okwarning: + + model.features_importance() + + .. ipython:: python + :suppress: + :okwarning: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_features.html") + + .. 
raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_features.html
+
+    .. important::
+
+        Feature importance is determined by using the coefficients of the
+        auto-regressive (AR) process and normalizing them. This method
+        tends to be precise when your time series primarily consists of an
+        auto-regressive component. However, its accuracy may be a topic of
+        discussion if the time series contains other components as well.
+
+    Metrics
+    ^^^^^^^^
+
+    We can get the entire report using:
+
+    .. code-block:: python
+
+        model.report()
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.report()
+        html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_report.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_report.html
+
+    You can also choose the number of predictions and where to start the forecast.
+    For example, the following code will allow you to generate a report with 30
+    predictions, starting the forecasting process at index 40.
+
+    .. code-block:: python
+
+        model.report(start = 40, npredictions = 30)
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.report(start = 40, npredictions = 30)
+        html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_report_pred_2.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_report_pred_2.html
+
+    .. important::
+
+        Most metrics are computed using a single SQL query, but some of them might
+        require multiple SQL queries. Selecting only the necessary metrics in the
+        report can help optimize performance.
+        E.g. ``model.report(metrics = ["mse", "r2"])``.
+
+    You can utilize the
+    :py:mod:`verticapy.machine_learning.vertica.tsa.ARMA.score`
+    function to calculate various regression metrics, with the
+    R-squared coefficient (``r2``) being the default.
+
+    .. ipython:: python
+        :okwarning:
+
+        model.score()
+
+    As with the report, you can choose where to start the forecast
+    and the number of predictions to use.
+
+    .. ipython:: python
+        :okwarning:
+
+        model.score(start = 40, npredictions = 30)
+
+    .. important::
+
+        If you do not specify a starting point and the number of
+        predictions, the forecast will begin at one-fourth of the
+        dataset, which can result in an inaccurate score, especially
+        for large datasets. It's important to choose these parameters
+        carefully.
+
+    Prediction
+    ^^^^^^^^^^^
+
+    Prediction is straightforward:
+
+    .. code-block:: python
+
+        model.predict()
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.predict()
+        html_file = open("figures/machine_learning_vertica_tsa_arma_prediction.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_prediction.html
+
+    .. hint::
+
+        You can control the number of prediction steps by changing
+        the ``npredictions`` parameter:
+        ``model.predict(npredictions = 30)``.
+
+    .. note::
+
+        Predictions can be made automatically by using the training set,
+        in which case you don't need to specify the predictors. Alternatively, you
+        can pass only the ``vDataFrame`` to the
+        :py:mod:`verticapy.machine_learning.vertica.tsa.ARMA.predict`
+        function, but in this case, it's essential that the column names of
+        the ``vDataFrame`` match the predictors and response name in the
+        model.
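+
+    For example, passing the training ``vDataFrame`` explicitly,
+    where the column names below match the ones used to fit the
+    model:
+
+    .. code-block:: python
+
+        model.predict(data, "date", "passengers", npredictions = 10)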
+
+    If you would like to have the time stamps (``ts``) in the output,
+    you can switch on the ``output_estimated_ts`` parameter. If you
+    would also like to see the standard error, switch on the
+    ``output_standard_errors`` parameter as well:
+
+    .. code-block:: python
+
+        model.predict(output_estimated_ts = True, output_standard_errors = True)
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.predict(output_estimated_ts = True, output_standard_errors = True)
+        html_file = open("figures/machine_learning_vertica_tsa_arma_prediction_2.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_prediction_2.html
+
+    .. important::
+
+        The ``output_estimated_ts`` parameter provides an estimation of
+        'ts' assuming that 'ts' is regularly spaced.
+
+    If you don't provide any input, the function will begin forecasting
+    after the last known value. If you want to forecast starting from a
+    specific value within the input dataset or another dataset, you can
+    use the following syntax.
+
+    .. code-block:: python
+
+        model.predict(
+            data,
+            "date",
+            "passengers",
+            start = 40,
+            npredictions = 20,
+            output_estimated_ts = True,
+            output_standard_errors = True,
+        )
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        result = model.predict(data, "date", "passengers", start = 40, npredictions = 20, output_estimated_ts = True, output_standard_errors = True)
+        html_file = open("figures/machine_learning_vertica_tsa_arma_prediction_3.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_prediction_3.html
+
+    Plots
+    ^^^^^^
+
+    We can conveniently plot the predictions on a line plot
+    to observe the efficacy of our model:
+
+    .. code-block:: python
+
+        model.plot(data, "date", "passengers", npredictions = 80, start = 120)
+
+    .. ipython:: python
+        :suppress:
+        :okwarning:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.plot(data, "date", "passengers", npredictions = 80, start = 120, width = 650)
+        fig.write_html("figures/machine_learning_vertica_tsa_arma_plot_1.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_arma_plot_1.html
+
+    .. note::
+
+        You can control the number of prediction steps by changing
+        the ``npredictions`` parameter:
+        ``model.plot(npredictions = 30)``.
+
+    Please refer to :ref:`chart_gallery.tsa` for more examples.
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    In order to register the model for tracking and versioning:
+
+    .. code-block:: python
+
+        model.register("model_v1")
+
+    Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html`
+    for more details on model tracking and versioning.
+    """
+
+    # Properties.
+
+    @property
+    def _vertica_fit_sql(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    @property
+    def _vertica_predict_sql(self) -> Literal["PREDICT_ARIMA"]:
+        return "PREDICT_ARIMA"
+
+    @property
+    def _model_subcategory(self) -> Literal["TIMESERIES"]:
+        return "TIMESERIES"
+
+    @property
+    def _model_type(self) -> Literal["ARMA"]:
+        return "ARMA"
+
+    # System & Special Methods.
+
+    @check_minimum_version
+    @save_verticapy_logs
+    def __init__(
+        self,
+        name: str = None,
+        overwrite_model: bool = False,
+        order: Union[tuple[int], list[int]] = (0, 0),
+        tol: float = 1e-6,
+        max_iter: int = 100,
+        init: Literal["zero", "hr"] = "zero",
+        missing: Literal[
+            "drop", "raise", "zero", "linear_interpolation"
+        ] = "linear_interpolation",
+    ) -> None:
+        super().__init__(name, overwrite_model)
+        if not isinstance(order, (tuple, list)) or len(order) != 2:
+            raise ValueError(
+                "Parameter 'order' must be a tuple or a list of 2 elements."
+            )
+        for x in order:
+            if not isinstance(x, int):
+                raise ValueError(
+                    "Parameter 'order' must be a tuple or a list of integers."
+                )
+        self.parameters = {
+            "order": order,
+            "tol": tol,
+            "max_iter": max_iter,
+            "init": str(init).lower(),
+            "missing": str(missing).lower(),
+        }
+
+
+class AR(TimeSeriesModelBase):
+    """
+    Creates an in-DB Autoregressor model.
+
+    .. versionadded:: 11.0.0
+
+    Parameters
+    ----------
+    name: str, optional
+        Name of the model. The model is stored in the
+        database.
+    overwrite_model: bool, optional
+        If set to True, training a model with the same
+        name as an existing model overwrites the
+        existing model.
+    p: int, optional
+        Integer in the range [1, 1999], the number of
+        lags to consider in the computation. Larger
+        values for p weaken the correlation.
+    method: str, optional
+        One of the following algorithms for training the
+        model:
+
+        - ols:
+            Ordinary Least Squares
+        - yule-walker:
+            Yule-Walker
+
+    penalty: str, optional
+        Method of regularization.
+
+        - none:
+            No regularization.
+        - l2:
+            L2 regularization.
+
+    C: PythonNumber, optional
+        The regularization parameter value. The value
+        must be non-negative.
+
+    missing: str, optional
+        Method for handling missing values, one of the
+        following strings:
+
+        - 'drop':
+            Missing values are ignored.
+        - 'raise':
+            Missing values raise an error.
+        - 'zero':
+            Missing values are set to zero.
+        - 'linear_interpolation':
+            Missing values are replaced by a linearly
+            interpolated value based on the nearest
+            valid entries before and after the missing
+            value. In cases where the first or last
+            values in a dataset are missing, the function
+            errors.
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+
+    Initialization
+    ^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. ipython:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk
+        of code collisions with other libraries. This precaution is
+        necessary because verticapy uses commonly known function names
+        like "average" and "median", which can potentially lead to naming
+        conflicts. The use of an alias ensures that the functions from
+        verticapy are used as intended without interfering with functions
+        from other libraries.
+
+    For this example, we will generate a dummy time-series
+    dataset.
+
+    .. ipython:: python
+
+        data = vp.vDataFrame(
+            {
+                "month": [i for i in range(1, 11)],
+                "GB": [5, 10, 20, 35, 55, 80, 110, 145, 185, 230],
+            }
+        )
+
+    .. ipython:: python
+        :suppress:
+
+        html_file = open("figures/machine_learning_vertica_tsa_ar_data.html", "w")
+        html_file.write(data._repr_html_())
+        html_file.close()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_data.html
+
+    ..
note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``AR`` model: + + .. ipython:: python + + from verticapy.machine_learning.vertica.tsa import AR + + Then we can create the model: + + .. ipython:: python + :okwarning: + + model = AR(p = 2) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + :okwarning: + + model.fit(data, "month", "GB") + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + .. ipython:: python + :okwarning: + + model.features_importance() + + .. ipython:: python + :suppress: + :okwarning: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_features.html") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_features.html + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. code-block:: python + + model.report() + + .. ipython:: python + :suppress: + :okwarning: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_report.html + + You can also choose the number of predictions and where to start the forecast. + For example, the following code will allow you to generate a report with 30 + predictions, starting the forecasting process at index 40. + + .. code-block:: python + + model.report(start = 40, npredictions = 30) + + .. ipython:: python + :suppress: + :okwarning: + + result = model.report(start = 40, npredictions = 30) + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_report_pred_2.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_report_pred_2.html + + .. important:: + + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["mse", "r2"])``. 
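+
+ Beyond the report, you can also inspect the fitted model directly.
+ As a minimal sketch, ``get_attributes`` is the generic
+ ``VerticaModel`` accessor, so the exact attribute names it lists
+ depend on the model type:
+
+ .. code-block:: python
+
+ # List the names of the attributes stored on the fitted model.
+ model.get_attributes()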
+
+ You can utilize the
+ :py:mod:`verticapy.machine_learning.vertica.tsa.AR.score`
+ function to calculate various regression metrics, with the explained
+ variance being the default.
+
+ .. ipython:: python
+ :okwarning:
+
+ model.score()
+
+ The same applies to the score: you can choose where the forecast
+ starts and the number of predictions to use.
+
+ .. ipython:: python
+ :okwarning:
+
+ model.score(start = 40, npredictions = 30)
+
+ .. important::
+
+ If you do not specify a starting point and the number of
+ predictions, the forecast will begin at one-fourth of the
+ dataset, which can result in an inaccurate score, especially
+ for large datasets. It's important to choose these parameters
+ carefully.
+
+ Prediction
+ ^^^^^^^^^^^
+
+ Prediction is straightforward:
+
+ .. code-block:: python
+
+ model.predict()
+
+ .. ipython:: python
+ :suppress:
+ :okwarning:
+
+ result = model.predict()
+ html_file = open("figures/machine_learning_vertica_tsa_ar_prediction.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_prediction.html
+
+ .. hint::
+
+ You can control the number of prediction steps by changing
+ the ``npredictions`` parameter:
+ ``model.predict(npredictions = 30)``.
+
+ .. note::
+
+ Predictions can be made automatically by using the training set,
+ in which case you don't need to specify the predictors. Alternatively, you
+ can pass only the ``vDataFrame`` to the
+ :py:mod:`verticapy.machine_learning.vertica.tsa.AR.predict`
+ function, but in this case, it's essential that the column names of
+ the ``vDataFrame`` match the predictors and response name in the
+ model.
+
+ If you would like to have the 'time-stamps' (ts) in the output, you
+ can switch on the ``output_estimated_ts`` parameter.
+
+ .. code-block:: python
+
+ model.predict(output_estimated_ts = True)
+
+ .. ipython:: python
+ :suppress:
+ :okwarning:
+
+ result = model.predict(output_estimated_ts = True)
+ html_file = open("figures/machine_learning_vertica_tsa_ar_prediction_2.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_prediction_2.html
+
+ .. important::
+
+ The ``output_estimated_ts`` parameter provides an estimation of
+ 'ts' assuming that 'ts' is regularly spaced.
+
+ If you don't provide any input, the function will begin forecasting
+ after the last known value. If you want to forecast starting from a
+ specific value within the input dataset or another dataset, you can
+ use the following syntax.
+
+ .. code-block:: python
+
+ model.predict(
+ data,
+ "month",
+ "GB",
+ start = 40,
+ npredictions = 20,
+ output_estimated_ts = True,
+ )
+
+ .. ipython:: python
+ :suppress:
+ :okwarning:
+
+ result = model.predict(data, "month", "GB", start = 40, npredictions = 20, output_estimated_ts = True)
+ html_file = open("figures/machine_learning_vertica_tsa_ar_prediction_3.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_prediction_3.html
+
+ Plots
+ ^^^^^^
+
+ We can conveniently plot the predictions on a line plot
+ to observe the efficacy of our model:
+
+ .. code-block:: python
+
+ model.plot(data, "month", "GB", npredictions = 80, start = 120)
+
+ .. 
ipython:: python
+ :suppress:
+ :okwarning:
+
+ vp.set_option("plotting_lib", "plotly")
+ fig = model.plot(data, "month", "GB", npredictions = 80, start = 120, width = 650)
+ fig.write_html("figures/machine_learning_vertica_tsa_ar_plot_1.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_tsa_ar_plot_1.html
+
+ .. note::
+
+ You can control the number of prediction steps by changing
+ the ``npredictions`` parameter:
+ ``model.plot(npredictions = 30)``.
+
+ Please refer to :ref:`chart_gallery.tsa` for more examples.
+
+ Model Register
+ ^^^^^^^^^^^^^^
+
+ In order to register the model for tracking and versioning:
+
+ .. code-block:: python
+
+ model.register("model_v1")
+
+ Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html`
+ for more details on model tracking and versioning.
+ """
+
+ # Properties.
+
+ @property
+ def _vertica_fit_sql(self) -> Literal["AUTOREGRESSOR"]:
+ return "AUTOREGRESSOR"
+
+ @property
+ def _vertica_predict_sql(self) -> Literal["PREDICT_AUTOREGRESSOR"]:
+ return "PREDICT_AUTOREGRESSOR"
+
+ @property
+ def _model_subcategory(self) -> Literal["TIMESERIES"]:
+ return "TIMESERIES"
+
+ @property
+ def _model_type(self) -> Literal["AR"]:
+ return "AR"
+
+ # System & Special Methods.
+
+ @check_minimum_version
+ @save_verticapy_logs
+ def __init__(
+ self,
+ name: str = None,
+ overwrite_model: bool = False,
+ p: int = 3,
+ method: Literal["ols", "yule-walker"] = "ols",
+ penalty: Literal[None, "none", "l2"] = "none",
+ C: PythonNumber = 1.0,
+ missing: Literal[
+ "drop", "raise", "zero", "linear_interpolation"
+ ] = "linear_interpolation",
+ ) -> None:
+ super().__init__(name, overwrite_model)
+ self.parameters = {
+ "p": int(p),
+ "method": str(method).lower(),
+ "penalty": str(penalty).lower(),
+ "C": C,
+ "missing": str(missing).lower(),
+ "compute_mse": True,
+ }
+
+
+class MA(TimeSeriesModelBase):
+ """
+ Creates an inDB Moving Average model.
+
+ Parameters
+ ----------
+ name: str, optional
+ Name of the model. The model is stored in the
+ database.
+ overwrite_model: bool, optional
+ If set to True, training a model with the same
+ name as an existing model overwrites the
+ existing model.
+ q: int, optional
+ Integer in the range [1, 67), the number of lags
+ to consider in the computation.
+ penalty: str, optional
+ Method of regularization.
+
+ - none:
+ No regularization.
+ - l2:
+ L2 regularization.
+ C: PythonNumber, optional
+ The regularization parameter value. The value
+ must be zero or non-negative.
+
+ missing: str, optional
+ Method for handling missing values, one of the
+ following strings:
+
+ - 'drop':
+ Missing values are ignored.
+ - 'raise':
+ Missing values raise an error.
+ - 'zero':
+ Missing values are set to zero.
+ - 'linear_interpolation':
+ Missing values are replaced by a linearly
+ interpolated value based on the nearest
+ valid entries before and after the missing
+ value. In cases where the first or last
+ values in a dataset are missing, the function
+ errors.
+
+ Examples
+ ---------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ ...
+ """
+
+ # Properties.
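+
+ # Note: an MA(q) model is the ARMA special case with p = 0; unlike
+ # AR and ARMA, Vertica exposes it through the dedicated
+ # MOVING_AVERAGE functions listed in the properties below.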
+ + @property + def _vertica_fit_sql(self) -> Literal["MOVING_AVERAGE"]: + return "MOVING_AVERAGE" + + @property + def _vertica_predict_sql(self) -> Literal["PREDICT_MOVING_AVERAGE"]: + return "PREDICT_MOVING_AVERAGE" + + @property + def _model_subcategory(self) -> Literal["TIMESERIES"]: + return "TIMESERIES" + + @property + def _model_type(self) -> Literal["MA"]: + return "MA" + + # System & Special Methods. + + @check_minimum_version + @save_verticapy_logs + def __init__( + self, + name: str = None, + overwrite_model: bool = False, + q: int = 1, + penalty: Literal[None, "none", "l2"] = "none", + C: PythonNumber = 1.0, + missing: Literal[ + "drop", "raise", "zero", "linear_interpolation" + ] = "linear_interpolation", + ) -> None: + super().__init__(name, overwrite_model) + self.parameters = { + "q": int(q), + "penalty": str(penalty).lower(), + "C": C, + "missing": str(missing).lower(), + "compute_mse": True, + } diff --git a/verticapy/plotting/_highcharts/__init__.py b/verticapy/plotting/_highcharts/__init__.py index fd326772d..cb562aea4 100755 --- a/verticapy/plotting/_highcharts/__init__.py +++ b/verticapy/plotting/_highcharts/__init__.py @@ -55,6 +55,7 @@ ) from verticapy.plotting._highcharts.machine_learning.stepwise import StepwisePlot from verticapy.plotting._highcharts.machine_learning.svm import SVMClassifierPlot +from verticapy.plotting._highcharts.machine_learning.tsa import TSPlot from verticapy.plotting._highcharts.line import LinePlot, MultiLinePlot from verticapy.plotting._highcharts.outliers import OutliersPlot from verticapy.plotting._highcharts.pie import NestedPieChart, PieChart diff --git a/verticapy/plotting/_highcharts/machine_learning/tsa.py b/verticapy/plotting/_highcharts/machine_learning/tsa.py new file mode 100755 index 000000000..4e594ccc8 --- /dev/null +++ b/verticapy/plotting/_highcharts/machine_learning/tsa.py @@ -0,0 +1,120 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. + +You may obtain a copy of the License at: +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in +writing, software distributed under the License is +distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing +permissions and limitations under the License. +""" +from typing import Literal, Optional + +import numpy as np + +from verticapy._typing import HChart +from verticapy.plotting._highcharts.line import LinePlot + + +class TSPlot(LinePlot): + # Properties. + + @property + def _category(self) -> Literal["graph"]: + return "graph" + + @property + def _kind(self) -> Literal["tsa"]: + return "tsa" + + @property + def _compute_method(self) -> Literal["tsa"]: + return "tsa" + + # Styling Methods. + + def _init_style(self) -> None: + self.init_style = { + "title": {"text": ""}, + "xAxis": { + "reversed": False, + "title": {"enabled": True, "text": self.layout["order_by"]}, + "startOnTick": True, + "endOnTick": True, + "showLastLabel": True, + }, + "yAxis": {"title": {"text": self.layout["columns"]}}, + "legend": {"enabled": True}, + "plotOptions": { + "scatter": { + "marker": { + "radius": 5, + "states": { + "hover": { + "enabled": True, + "lineColor": "rgb(100,100,100)", + } + }, + }, + "states": {"hover": {"marker": {"enabled": False}}}, + "tooltip": { + "headerFormat": '\u25CF {series.name}
 <br/>',
+ "pointFormat": "<b>"
+ + self.layout["order_by"]
+ + "</b>: {point.x} <br/> <b>
" + + self.layout["columns"], + }, + } + }, + "colors": self.get_colors(), + } + self.init_style_area_range = { + "zIndex": 0, + "lineWidth": 0, + "fillOpacity": 0.3, + } + + # Draw. + + def draw( + self, + chart: Optional[HChart] = None, + **style_kwargs, + ) -> HChart: + """ + Draws a time series plot using the HC API. + """ + chart, style_kwargs = self._get_chart(chart, style_kwargs=style_kwargs) + chart.set_dict_options(self.init_style) + chart.set_dict_options(style_kwargs) + x = self._to_datetime(self.data["x"]) + data = np.column_stack((x, self.data["y"])).tolist() + chart.add_data_set( + data, + "line", + self.layout["columns"], + ) + x = self._to_datetime(self.data["x_pred"]) + data = np.column_stack((x, self.data["y_pred"])).tolist() + chart.add_data_set( + data, + "line", + "prediction", + ) + if self.layout["has_se"]: + data_range = np.column_stack( + (x, self.data["se_low"], self.data["se_high"]) + ).tolist() + chart.add_data_set( + data_range, + "arearange", + "95% confidence interval", + **self.init_style_area_range, + ) + return chart diff --git a/verticapy/plotting/_highcharts/range.py b/verticapy/plotting/_highcharts/range.py index 63692aed0..48da56d52 100755 --- a/verticapy/plotting/_highcharts/range.py +++ b/verticapy/plotting/_highcharts/range.py @@ -55,7 +55,6 @@ def _init_style(self) -> None: "lineWidth": 0, "linkedTo": ":previous", "fillOpacity": 0.3, - "zIndex": 0, } # Draw. diff --git a/verticapy/plotting/_matplotlib/__init__.py b/verticapy/plotting/_matplotlib/__init__.py index 7c82c2ce3..7c2f25c5c 100755 --- a/verticapy/plotting/_matplotlib/__init__.py +++ b/verticapy/plotting/_matplotlib/__init__.py @@ -48,6 +48,7 @@ ) from verticapy.plotting._matplotlib.machine_learning.stepwise import StepwisePlot from verticapy.plotting._matplotlib.machine_learning.svm import SVMClassifierPlot +from verticapy.plotting._matplotlib.machine_learning.tsa import TSPlot from verticapy.plotting._matplotlib.acf import ACFPlot, ACFPACFPlot from verticapy.plotting._matplotlib.bar import BarChart, BarChart2D diff --git a/verticapy/plotting/_matplotlib/machine_learning/tsa.py b/verticapy/plotting/_matplotlib/machine_learning/tsa.py new file mode 100755 index 000000000..33fc7eccc --- /dev/null +++ b/verticapy/plotting/_matplotlib/machine_learning/tsa.py @@ -0,0 +1,112 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. + +You may obtain a copy of the License at: +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in +writing, software distributed under the License is +distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing +permissions and limitations under the License. +""" +from typing import Any, Literal, Optional + +import numpy as np + +from matplotlib.axes import Axes + +from verticapy.plotting._matplotlib.base import MatplotlibBase + + +class TSPlot(MatplotlibBase): + # Properties. + + @property + def _category(self) -> Literal["graph"]: + return "graph" + + @property + def _kind(self) -> Literal["tsa"]: + return "tsa" + + @property + def _compute_method(self) -> Literal["tsa"]: + return "tsa" + + # Styling Methods. + + def _init_style(self) -> None: + self.init_style = { + "color": self.get_colors(), + "linewidth": 2, + } + self.init_style_fill = { + "alpha": 0.2, + } + + # Draw. 
+ + def draw( + self, + ax: Optional[Axes] = None, + **style_kwargs, + ) -> Axes: + """ + Draws a time series plot using the Matplotlib API. + """ + + # Initialization + style_kwargs = self._fix_color_style_kwargs(style_kwargs) + colors = self.get_colors() + color_kwargs = {"color": self.get_colors()} + ax, fig, style_kwargs = self._get_ax_fig( + ax, size=(8, 6), set_axis_below=True, grid="y", style_kwargs=style_kwargs + ) + # Standard Error + if self.layout["has_se"]: + args = [self.data["x_pred"], self.data["se_low"], self.data["se_high"]] + kwargs = self._update_dict( + self.init_style, {**color_kwargs, **style_kwargs}, color_idx=2 + ) + ax.fill_between( + *args, + facecolor=kwargs["color"], + **self.init_style_fill, + label="95% confidence interval", + ) + args = [self.data["x_pred"], self.data["se_low"]] + ax.plot(*args, color=kwargs["color"]) + args = [self.data["x_pred"], self.data["se_high"]] + ax.plot(*args, color=kwargs["color"]) + # Main Plot + args = [self.data["x"], self.data["y"]] + kwargs = self._update_dict( + self.init_style, {**color_kwargs, **style_kwargs}, color_idx=0 + ) + ax.plot(*args, **kwargs, label=self.layout["columns"]) + args = [self.data["x_pred"], self.data["y_pred"]] + kwargs = self._update_dict( + self.init_style, {**color_kwargs, **style_kwargs}, color_idx=1 + ) + kwargs = {**kwargs, **{"linestyle": "dashed"}} + ax.plot(*args, **kwargs, label="prediction") + # Labels + min_x = min(min(self.data["x"]), min(self.data["x_pred"])) + max_x = max(max(self.data["x"]), max(self.data["x_pred"])) + ax.set_xlim(min_x, max_x) + ax.set_xlabel(self.layout["order_by"]) + ax.set_ylabel(self.layout["columns"]) + for tick in ax.get_xticklabels(): + tick.set_rotation(90) + ax.legend( + loc="center left", + bbox_to_anchor=[1, 0.5], + ) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + return ax diff --git a/verticapy/plotting/_plotly/__init__.py b/verticapy/plotting/_plotly/__init__.py index 87feb2c21..21b433dd2 100755 --- a/verticapy/plotting/_plotly/__init__.py +++ b/verticapy/plotting/_plotly/__init__.py @@ -54,6 +54,7 @@ ChampionChallengerPlot, ) from verticapy.plotting._plotly.machine_learning.stepwise import StepwisePlot +from verticapy.plotting._plotly.machine_learning.tsa import TSPlot import plotly.io as pio diff --git a/verticapy/plotting/_plotly/line.py b/verticapy/plotting/_plotly/line.py index d955ae2a0..7d36bb037 100644 --- a/verticapy/plotting/_plotly/line.py +++ b/verticapy/plotting/_plotly/line.py @@ -54,7 +54,7 @@ def _get_kind(self) -> [str]: self.init_stack = None if self.layout["kind"] == "area": return "tozeroy" - elif self.layout["kind"] in {"area_stacked", "area_percent"}: + elif self.layout["kind"] in ("area_stacked", "area_percent"): self.init_stack = "group" return "tonexty" diff --git a/verticapy/plotting/_plotly/machine_learning/tsa.py b/verticapy/plotting/_plotly/machine_learning/tsa.py new file mode 100644 index 000000000..6168f1d5a --- /dev/null +++ b/verticapy/plotting/_plotly/machine_learning/tsa.py @@ -0,0 +1,124 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. 
+ +You may obtain a copy of the License at: +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in +writing, software distributed under the License is +distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing +permissions and limitations under the License. +""" +from typing import Literal, Optional + +import pandas as pd +import numpy as np +import plotly.graph_objects as go +from plotly.graph_objs._figure import Figure + +from verticapy.plotting._plotly.base import PlotlyBase + + +class TSPlot(PlotlyBase): + # Properties. + + @property + def _category(self) -> Literal["graph"]: + return "graph" + + @property + def _kind(self) -> Literal["tsa"]: + return "tsa" + + @property + def _compute_method(self) -> Literal["tsa"]: + return "tsa" + + # Styling Methods. + + def _init_style(self) -> None: + self.init_style = { + "xaxis_title": self.layout["order_by"], + "yaxis_title": self.layout["columns"], + "width": 800, + "height": 450, + } + + # Draw. + + def draw( + self, + fig: Optional[Figure] = None, + step: bool = False, + markers: bool = True, + line_shape: Optional[str] = None, + **style_kwargs, + ) -> Figure: + """ + Draws a time series plot using the plotly API. + """ + fig_base = self._get_fig(fig) + marker_colors = self.get_colors() + if "colors" in style_kwargs: + marker_colors = ( + style_kwargs["colors"] + marker_colors + if isinstance(style_kwargs["colors"], list) + else [style_kwargs["colors"]] + marker_colors + ) + del style_kwargs["colors"] + # True Values + data_args = dict( + data=(np.column_stack((self.data["x"], self.data["y"]))), + columns=["time", self.layout["columns"]], + ) + df = pd.DataFrame(**data_args) + fig_base.add_trace( + go.Scatter( + x=df["time"], + y=df[self.layout["columns"]], + line_shape="spline", + line_color=marker_colors[0], + mode="lines+markers" if markers else "lines", + name=self.layout["columns"], + ) + ) + # Predictions + data_args = dict( + data=(np.column_stack((self.data["x_pred"], self.data["y_pred"]))), + columns=["time", self.layout["columns"]], + ) + df = pd.DataFrame(**data_args) + fig_base.add_trace( + go.Scatter( + x=df["time"], + y=df[self.layout["columns"]], + line_shape="spline", + line_color=marker_colors[1], + mode="lines+markers" if markers else "lines", + name="prediction", + ) + ) + # STD Error + if self.layout["has_se"]: + fig_base.add_trace( + go.Scatter( + x=np.hstack((self.data["x_pred"], self.data["x_pred"][::-1])), + y=np.hstack((self.data["se_low"], self.data["se_high"][::-1])), + fill="toself", + name="95% confidence interval", + mode="lines+markers" if not markers else "markers", + marker=dict(color=marker_colors[2]), + opacity=0.5, + ) + ) + # Final + for i in range(len(fig.data) if fig else 0): + fig_base.add_trace(fig.data[i]) + fig_base.update_layout(**self._update_dict(self.init_style, style_kwargs)) + fig_base.update_layout(fig.layout if fig else []) + return fig_base diff --git a/verticapy/plotting/base.py b/verticapy/plotting/base.py index 97b866877..8a9a75f3d 100755 --- a/verticapy/plotting/base.py +++ b/verticapy/plotting/base.py @@ -170,6 +170,7 @@ def __init__(self, *args, **kwargs) -> None: "range": self._compute_range, "rollup": self._compute_rollup, "sample": self._sample, + "tsa": self._compute_tsa, } if self._compute_method in functions: functions[self._compute_method](*args, **kwds) @@ -1489,6 +1490,57 @@ def _filter_line( "limit_over": limit_over, } + def 
_compute_tsa(
+ self,
+ vdf: "vDataFrame",
+ order_by: str,
+ columns: str,
+ prediction: "vDataFrame",
+ start: Optional[int],
+ dataset_provided: bool,
+ ) -> None:
+ columns, order_by = vdf.format_colnames(columns, order_by)
+ X = vdf[[order_by, columns]].sort(columns=[order_by]).to_numpy()
+ X_pred = prediction.to_numpy()
+ self.data = {
+ "x": X[:, 0],
+ "y": X[:, 1],
+ "y_pred": X_pred[:, 0],
+ }
+ # Compute the anchor index 'j' and the offset 'start' used to build
+ # the prediction axis. When no dataset is provided (predictions
+ # always resume after the last known value) or no explicit start is
+ # given, anchor at the last observation.
+ if not dataset_provided or isinstance(start, NoneType):
+ start = 1
+ j = -1
+ else:
+ j = start - 1
+ start = 0
+ has_se = False
+ if X_pred.shape[1] > 1:
+ self.data["se"] = np.array([0.0] + list(X_pred[:, 1]))
+ has_se = True
+ delta = self.data["x"][1] - self.data["x"][0]
+ n = len(self.data["y_pred"])
+ self.data["x_pred"] = np.array(
+ [self.data["x"][j]]
+ + [self.data["x"][j] + delta * i for i in range(start, n + start)]
+ )
+ self.data["y_pred"] = np.array([self.data["y"][j]] + list(self.data["y_pred"]))
+ if has_se:
+ self.data["se_low"] = self.data["y_pred"] - 1.96 * self.data["se"]
+ self.data["se_high"] = self.data["y_pred"] + 1.96 * self.data["se"]
+ else:
+ self.data["se_low"] = None
+ self.data["se_high"] = None
+ self.layout = {
+ "columns": self._clean_quotes(columns),
+ "order_by": self._clean_quotes(order_by),
+ "has_se": has_se,
+ }
+
 def _compute_range(
 self,
 vdf: "vDataFrame",
diff --git a/verticapy/tests_new/machine_learning/vertica/test_linear_model.py b/verticapy/tests_new/machine_learning/vertica/test_linear_model.py
index 56b9dbb71..22b0703fe 100644
--- a/verticapy/tests_new/machine_learning/vertica/test_linear_model.py
+++ b/verticapy/tests_new/machine_learning/vertica/test_linear_model.py
@@ -55,8 +55,9 @@ def test_fit(
 """
 test function - fit
 """
- vpy_model_obj, py_model_obj = get_vpy_model(model_class), get_py_model(
- model_class
+ vpy_model_obj, py_model_obj = (
+ get_vpy_model(model_class),
+ get_py_model(model_class),
 )
 if fit_attr == "score":
diff --git a/verticapy/tests_new/machine_learning/vertica/test_tree_model.py b/verticapy/tests_new/machine_learning/vertica/test_tree_model.py
index a44c2dbc1..ecedd80d2 100644
--- a/verticapy/tests_new/machine_learning/vertica/test_tree_model.py
+++ b/verticapy/tests_new/machine_learning/vertica/test_tree_model.py
@@ -61,8 +61,9 @@ def test_fit(
 """
 test function - fit
 """
- vpy_model_obj, py_model_obj = get_vpy_model(model_class), get_py_model(
- model_class
+ vpy_model_obj, py_model_obj = (
+ get_vpy_model(model_class),
+ get_py_model(model_class),
 )
 vpy_res = getattr(vpy_model_obj.model, fit_attr)()