From 48589f344de9c94e15491e6e074d083515aad161 Mon Sep 17 00:00:00 2001 From: Badr Date: Fri, 3 Nov 2023 22:58:44 -0400 Subject: [PATCH] Adding ARIMA model --- verticapy/_utils/_sql/_vertica_version.py | 3 + verticapy/machine_learning/vertica/base.py | 38 +- .../machine_learning/vertica/linear_model.py | 8 + verticapy/machine_learning/vertica/tsa.py | 432 ++++++++++++++++++ 4 files changed, 465 insertions(+), 16 deletions(-) create mode 100755 verticapy/machine_learning/vertica/tsa.py diff --git a/verticapy/_utils/_sql/_vertica_version.py b/verticapy/_utils/_sql/_vertica_version.py index 87c07c422..154d4cfc1 100755 --- a/verticapy/_utils/_sql/_vertica_version.py +++ b/verticapy/_utils/_sql/_vertica_version.py @@ -22,6 +22,8 @@ from verticapy.errors import VersionError MINIMUM_VERTICA_VERSION = { + "ARIMA": [12, 0, 0], + "AR": [11, 0, 0], "Balance": [8, 1, 1], "BernoulliNB": [8, 0, 0], "BisectingKMeans": [9, 3, 1], @@ -47,6 +49,7 @@ "LogisticRegression": [8, 0, 0], "KMeans": [8, 0, 0], "KPrototypes": [12, 0, 3], + "MA": [11, 0, 0], "MCA": [9, 1, 0], "MinMaxScaler": [8, 1, 0], "MultinomialNB": [8, 0, 0], diff --git a/verticapy/machine_learning/vertica/base.py b/verticapy/machine_learning/vertica/base.py index 6a7525e44..e490b2261 100755 --- a/verticapy/machine_learning/vertica/base.py +++ b/verticapy/machine_learning/vertica/base.py @@ -783,6 +783,8 @@ def fit( str model's summary. 
""" + + # Initialization if self.overwrite_model: self.drop() else: @@ -819,7 +821,7 @@ def fit( ROW_NUMBER() OVER (ORDER BY {', '.join(X)}) AS {id_column_name}""" - tmp_view = False + tmp_view = False if isinstance(input_relation, vDataFrame) or (id_column): tmp_view = True if isinstance(input_relation, vDataFrame): @@ -830,10 +832,9 @@ def fit( relation = gen_tmp_name( schema=schema_relation(self.model_name)[0], name="view" ) - drop(relation, method="view") _executeSQL( query=f""" - CREATE VIEW {relation} AS + CREATE OR REPLACE VIEW {relation} AS SELECT /*+LABEL('learn.VerticaModel.fit')*/ *{id_column} @@ -849,6 +850,7 @@ def fit( self.test_relation = test_relation else: self.test_relation = self.input_relation + # Fitting if self._is_native: parameters = self._get_vertica_param_dict() if ( @@ -917,15 +919,6 @@ def _attributes(self) -> list: def __init__(self) -> None: """Must be overridden in the child class""" self.features_importance_trees_ = {} - return None - # self.input_relation = None - # self.test_relation = None - # self.X = None - # self.y = None - # self.parameters = {} - # self.classes_ = None - # for att in self._attributes: - # setattr(self, att, None) def _compute_trees_arrays( self, tree: TableSample, X: list, return_probability: bool = False @@ -1265,6 +1258,10 @@ def plot_tree( class BinaryClassifier(Supervised): + """ + Base Class for Vertica Binary Classifier. + """ + # Properties. @property @@ -1805,13 +1802,16 @@ def roc_curve( class MulticlassClassifier(Supervised): + """ + Base Class for Vertica Multiclass Classifiers. + """ + # System & Special Methods. @abstractmethod def __init__(self, name: str, overwrite_model: bool = False) -> None: """Must be overridden in the child class""" super().__init__(name, overwrite_model) - # self.classes_ = None def _check_pos_label(self, pos_label: PythonScalar) -> PythonScalar: """ @@ -2645,6 +2645,10 @@ def roc_curve( class Regressor(Supervised): + """ + Base Class for Vertica Regressors. 
+ """ + # System & Special Methods. @abstractmethod @@ -2845,7 +2849,7 @@ def predict( Returns ------- vDataFrame - the input object. + the input object. """ if hasattr(self, "_predict"): return self._predict(vdf=vdf, X=X, name=name, inplace=inplace) @@ -2904,6 +2908,8 @@ def fit( str model's summary. """ + + # Initialization if self.overwrite_model: self.drop() else: @@ -2938,10 +2944,9 @@ def fit( relation = gen_tmp_name( schema=schema_relation(self.model_name)[0], name="view" ) - drop(relation, method="view") _executeSQL( query=f""" - CREATE VIEW {relation} AS + CREATE OR REPLACE VIEW {relation} AS SELECT /*+LABEL('learn.VerticaModel.fit')*/ * {id_column} @@ -2962,6 +2967,7 @@ def fit( parameters = self._get_vertica_param_dict() if "num_components" in parameters and not parameters["num_components"]: del parameters["num_components"] + # Fitting fun = self._vertica_fit_sql if self._model_type != "MCA" else "PCA" query = f""" SELECT diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index 3700df290..1814001da 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -43,6 +43,10 @@ class LinearModel: + """ + Base Class for Vertica Linear Models. + """ + # Properties. @property @@ -188,6 +192,10 @@ def plot( class LinearModelClassifier(LinearModel): + """ + Base Class for Vertica Linear Models Classifiers. + """ + # Properties. @property diff --git a/verticapy/machine_learning/vertica/tsa.py b/verticapy/machine_learning/vertica/tsa.py new file mode 100755 index 000000000..2fddb6d33 --- /dev/null +++ b/verticapy/machine_learning/vertica/tsa.py @@ -0,0 +1,432 @@ +""" +Copyright (c) 2018-2023 Open Text or one of its +affiliates. Licensed under the Apache License, +Version 2.0 (the "License"); You may not use this +file except in compliance with the License. 
+
+You may obtain a copy of the License at:
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in
+writing, software distributed under the License is
+distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing
+permissions and limitations under the License.
+"""
+from abc import abstractmethod
+from typing import Literal, Optional, Union
+
+import numpy as np
+
+from verticapy._typing import (
+    NoneType,
+    SQLRelation,
+)
+from verticapy._utils._gen import gen_name, gen_tmp_name
+from verticapy._utils._sql._collect import save_verticapy_logs
+from verticapy._utils._sql._format import (
+    clean_query,
+    quote_ident,
+    schema_relation,
+)
+from verticapy._utils._sql._sys import _executeSQL
+from verticapy._utils._sql._vertica_version import (
+    check_minimum_version,
+)
+
+from verticapy.core.vdataframe.base import vDataFrame
+
+from verticapy.machine_learning.vertica.base import VerticaModel
+
+from verticapy.sql.drop import drop
+
+"""
+General Classes.
+"""
+
+
+class TimeSeriesModelBase(VerticaModel):
+    """
+    Base Class for Vertica Time Series Models.
+    """
+
+    # Properties.
+
+    @property
+    def _attributes(self) -> list[str]:
+        return [
+            "phi_",
+            "theta_",
+            "mse_",
+            "mean_",
+        ]
+
+    # Attributes Methods.
+
+    def _compute_attributes(self) -> None:
+        """
+        Computes the model's attributes.
+        """
+        coefficients = self.get_vertica_attributes("coefficients")
+        if "p" in self.parameters:
+            p = self.parameters["p"]
+        elif "order" in self.parameters:
+            p = self.parameters["order"][0]  # order is (p, d, q)
+        else:
+            p = 0
+        self.phi_ = np.array(coefficients["value"][:p])  # first p values
+        self.theta_ = np.array(coefficients["value"][p:])  # remaining values
+        self.mse_ = self.get_vertica_attributes("mean_squared_error")[
+            "mean_squared_error"
+        ][0]
+        self.mean_ = self.get_vertica_attributes("mean")["mean"][0]
+
+    # System & Special Methods.
+
+    @abstractmethod
+    def __init__(self, name: str, overwrite_model: bool = False) -> None:
+        """Must be overridden in the child class"""
+        super().__init__(name, overwrite_model)
+
+    # Model Fitting Method.
+
+    def fit(
+        self,
+        input_relation: SQLRelation,
+        ts: str,
+        y: str,
+        test_relation: SQLRelation = "",
+        return_report: bool = False,
+    ) -> Optional[str]:
+        """
+        Trains the model.
+
+        Parameters
+        ----------
+        input_relation: SQLRelation
+            Training relation.
+        ts: str
+            TS (Time Series) vDataColumn used to order
+            the data. The vDataColumn type must be date
+            (date, datetime, timestamp...) or numerical.
+        y: str
+            Response column.
+        test_relation: SQLRelation, optional
+            Relation used to test the model.
+        return_report: bool, optional
+            [For native models]
+            When set to True, the model summary
+            will be returned. Otherwise, it will
+            be printed.
+
+        Returns
+        -------
+        str
+            model's summary.
+        """
+
+        # Initialization
+        if self.overwrite_model:
+            self.drop()
+        else:
+            self._is_already_stored(raise_error=True)
+        self.ts = quote_ident(ts)
+        self.y = quote_ident(y)
+        tmp_view = False
+        if isinstance(input_relation, vDataFrame) and self._is_native:
+            tmp_view = True
+            if isinstance(input_relation, vDataFrame):
+                self.input_relation = input_relation.current_relation()
+            else:
+                self.input_relation = input_relation
+            relation = gen_tmp_name(
+                schema=schema_relation(self.model_name)[0], name="view"
+            )
+            _executeSQL(
+                query=f"""
+                    CREATE OR REPLACE VIEW {relation} AS
+                        SELECT
+                            /*+LABEL('learn.VerticaModel.fit')*/
+                            {self.ts}, {self.y}
+                        FROM {self.input_relation}""",
+                title="Creating a temporary view to fit the model.",
+            )
+        else:
+            self.input_relation = input_relation
+            relation = input_relation
+        if isinstance(test_relation, vDataFrame):
+            self.test_relation = test_relation.current_relation()
+        elif test_relation:
+            self.test_relation = test_relation
+        else:
+            self.test_relation = self.input_relation
+        # Fitting
+        if self._is_native:
+            parameters = self._get_vertica_param_dict()
+            if "order" in parameters:
+                parameters["p"] = parameters["order"][0]
+                parameters["q"] = parameters["order"][-1]
+                if len(parameters["order"]) == 3:
+                    parameters["d"] = parameters["order"][1]
+                del parameters["order"]
+            query = f"""
+                SELECT
+                    /*+LABEL('learn.VerticaModel.fit')*/
+                    {self._vertica_fit_sql}
+                    ('{self.model_name}',
+                     '{relation}',
+                     '{self.y}',
+                     '{self.ts}'
+                     USING PARAMETERS
+                     {', '.join([f"{p} = {parameters[p]}" for p in parameters])})"""
+            try:
+                _executeSQL(query, title="Fitting the model.")
+            finally:
+                if tmp_view:
+                    drop(relation, method="view")
+        self._compute_attributes()
+        if self._is_native:
+            report = self.summarize()
+            if return_report:
+                return report
+            print(report)
+        return None
+
+    # I/O Methods.
+
+    def deploySQL(
+        self,
+        ts: Optional[str] = None,
+        y: Optional[str] = None,
+        start: Optional[int] = None,
+        npredictions: int = 10,
+    ) -> str:
+        """
+        Returns the SQL code needed to deploy the model.
+
+        Parameters
+        ----------
+        ts: str, optional
+            TS (Time Series) vDataColumn used to order
+            the data. The vDataColumn type must be date
+            (date, datetime, timestamp...) or numerical.
+        y: str, optional
+            Response column.
+        start: int, optional
+            The behavior of the start parameter and its
+            range of accepted values depends on whether
+            you provide a timeseries-column (y):
+
+            - No provided timeseries-column: start must
+              be an integer greater or equal to 0, where
+              zero indicates to start prediction at the
+              end of the in-sample data. If start is a
+              positive value, the function predicts the
+              values between the end of the in-sample
+              data and the start index, and then uses the
+              predicted values as time series inputs for
+              the subsequent npredictions.
+            - timeseries-column provided: start must be an
+              integer greater or equal to 1 and identifies
+              the index (row) of the timeseries-column at
+              which to begin prediction. If the start index
+              is greater than the number of rows, N, in the
+              input data, the function predicts the values
+              between N and start and uses the predicted
+              values as time series inputs for the subsequent
+              npredictions.
+
+            Default:
+
+            - No provided timeseries-column: prediction begins
+              from the end of the in-sample data.
+            - timeseries-column provided: prediction begins from
+              the end of the provided input data.
+        npredictions: int, optional
+            Integer greater or equal to 1, the number of predicted
+            timesteps.
+
+        Returns
+        -------
+        str
+            the SQL code needed to deploy the model.
+        """
+        if self._vertica_predict_sql:
+            # Initialization
+            if isinstance(ts, NoneType):
+                ts = ""
+            else:
+                ts = f"ORDER BY {quote_ident(ts)}"  # OVER (ORDER BY <ts>)
+            if isinstance(y, NoneType):
+                y = ""
+            else:
+                y = quote_ident(y)
+            if isinstance(start, NoneType):
+                start = ""
+            else:
+                start = f"start = {start},"
+            # Deployment
+            sql = f"""
+                {self._vertica_predict_sql}({y}
+                            USING PARAMETERS
+                            model_name = '{self.model_name}',
+                            {start}
+                            npredictions = {npredictions})
+                            OVER ({ts})"""
+            return clean_query(sql)
+        else:
+            raise AttributeError(
+                f"Method 'deploySQL' does not exist for {self._model_type} models."
+            )
+
+    # Prediction / Transformation Methods.
+
+    def predict(
+        self,
+        vdf: Optional[SQLRelation] = None,
+        ts: Optional[str] = None,
+        y: Optional[str] = None,
+        start: Optional[int] = None,
+        npredictions: int = 10,
+    ) -> vDataFrame:
+        """
+        Predicts using the input relation.
+
+        Parameters
+        ----------
+        vdf: SQLRelation, optional
+            Object used to run the prediction. You can
+            also specify a customized relation, but you
+            must enclose it with an alias. For example,
+            "(SELECT 1) x" is valid, whereas "(SELECT 1)"
+            and "SELECT 1" are invalid.
+        ts: str, optional
+            TS (Time Series) vDataColumn used to order
+            the data. The vDataColumn type must be date
+            (date, datetime, timestamp...) or numerical.
+        y: str, optional
+            Response column.
+        start: int, optional
+            The behavior of the start parameter and its
+            range of accepted values depends on whether
+            you provide a timeseries-column (y):
+
+            - No provided timeseries-column: start must
+              be an integer greater or equal to 0, where
+              zero indicates to start prediction at the
+              end of the in-sample data. If start is a
+              positive value, the function predicts the
+              values between the end of the in-sample
+              data and the start index, and then uses the
+              predicted values as time series inputs for
+              the subsequent npredictions.
+            - timeseries-column provided: start must be an
+              integer greater or equal to 1 and identifies
+              the index (row) of the timeseries-column at
+              which to begin prediction. If the start index
+              is greater than the number of rows, N, in the
+              input data, the function predicts the values
+              between N and start and uses the predicted
+              values as time series inputs for the subsequent
+              npredictions.
+
+            Default:
+
+            - No provided timeseries-column: prediction begins
+              from the end of the in-sample data.
+            - timeseries-column provided: prediction begins from
+              the end of the provided input data.
+        npredictions: int, optional
+            Integer greater or equal to 1, the number of predicted
+            timesteps.
+
+        Returns
+        -------
+        vDataFrame
+            a new object.
+        """
+        sql = "SELECT " + self.deploySQL(
+            ts=ts, y=y, start=start, npredictions=npredictions
+        )
+        if not (isinstance(vdf, NoneType)):
+            sql += f" FROM {vdf}"
+        return vDataFrame(sql)
+
+
+class ARIMA(TimeSeriesModelBase):
+    """
+    Creates an ARIMA (AutoRegressive Integrated Moving Average) model.
+
+    Parameters
+    ----------
+    name: str, optional
+        Name of the model. The model is stored in the
+        database.
+    overwrite_model: bool, optional
+        If set to True, training a model with the same
+        name as an existing model overwrites the
+        existing model.
+    ...
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+    """
+
+    # Properties.
+
+    @property
+    def _vertica_fit_sql(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    @property
+    def _vertica_predict_sql(self) -> Literal["PREDICT_ARIMA"]:
+        return "PREDICT_ARIMA"
+
+    @property
+    def _model_subcategory(self) -> Literal["TIMESERIES"]:
+        return "TIMESERIES"
+
+    @property
+    def _model_type(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    # System & Special Methods.
+
+    @check_minimum_version
+    @save_verticapy_logs
+    def __init__(
+        self,
+        name: Optional[str] = None,
+        overwrite_model: bool = False,
+        order: Union[tuple[int, int, int], list[int]] = (0, 0, 0),  # (p, d, q)
+        tol: float = 1e-6,
+        max_iter: int = 100,
+        init: Literal["zero", "hr"] = "zero",
+        missing: Literal[
+            "drop", "raise", "zero", "linear_interpolation"
+        ] = "linear_interpolation",
+    ) -> None:
+        super().__init__(name, overwrite_model)
+        if not isinstance(order, (tuple, list)) or len(order) != 3:
+            raise ValueError(
+                "Parameter 'order' must be a tuple or a list of 3 elements."
+            )
+        for x in order:
+            if not (isinstance(x, int)):
+                raise ValueError(
+                    "Parameter 'order' must be a tuple or a list of integers."
+                )
+        self.parameters = {
+            "order": order,
+            "tol": tol,
+            "max_iter": max_iter,
+            "init": str(init).lower(),
+            "missing": str(missing).lower(),
+        }