From 48589f344de9c94e15491e6e074d083515aad161 Mon Sep 17 00:00:00 2001
From: Badr <badr.ouali@outlook.fr>
Date: Fri, 3 Nov 2023 22:58:44 -0400
Subject: [PATCH] Adding ARIMA model

---
 verticapy/_utils/_sql/_vertica_version.py     |   3 +
 verticapy/machine_learning/vertica/base.py    |  38 +-
 .../machine_learning/vertica/linear_model.py  |   8 +
 verticapy/machine_learning/vertica/tsa.py     | 432 ++++++++++++++++++
 4 files changed, 465 insertions(+), 16 deletions(-)
 create mode 100755 verticapy/machine_learning/vertica/tsa.py

diff --git a/verticapy/_utils/_sql/_vertica_version.py b/verticapy/_utils/_sql/_vertica_version.py
index 87c07c422..154d4cfc1 100755
--- a/verticapy/_utils/_sql/_vertica_version.py
+++ b/verticapy/_utils/_sql/_vertica_version.py
@@ -22,6 +22,8 @@
 from verticapy.errors import VersionError
 
 MINIMUM_VERTICA_VERSION = {
+    "ARIMA": [12, 0, 0],
+    "AR": [11, 0, 0],
     "Balance": [8, 1, 1],
     "BernoulliNB": [8, 0, 0],
     "BisectingKMeans": [9, 3, 1],
@@ -47,6 +49,7 @@
     "LogisticRegression": [8, 0, 0],
     "KMeans": [8, 0, 0],
     "KPrototypes": [12, 0, 3],
+    "MA": [11, 0, 0],
     "MCA": [9, 1, 0],
     "MinMaxScaler": [8, 1, 0],
     "MultinomialNB": [8, 0, 0],
diff --git a/verticapy/machine_learning/vertica/base.py b/verticapy/machine_learning/vertica/base.py
index 6a7525e44..e490b2261 100755
--- a/verticapy/machine_learning/vertica/base.py
+++ b/verticapy/machine_learning/vertica/base.py
@@ -783,6 +783,8 @@ def fit(
         str
             model's summary.
         """
+
+        # Initialization
         if self.overwrite_model:
             self.drop()
         else:
@@ -819,7 +821,7 @@ def fit(
                     ROW_NUMBER() OVER 
                     (ORDER BY {', '.join(X)}) 
                     AS {id_column_name}"""
-            tmp_view = False
+        tmp_view = False
         if isinstance(input_relation, vDataFrame) or (id_column):
             tmp_view = True
             if isinstance(input_relation, vDataFrame):
@@ -830,10 +832,9 @@ def fit(
                 relation = gen_tmp_name(
                     schema=schema_relation(self.model_name)[0], name="view"
                 )
-                drop(relation, method="view")
                 _executeSQL(
                     query=f"""
-                        CREATE VIEW {relation} AS 
+                        CREATE OR REPLACE VIEW {relation} AS 
                             SELECT 
                                 /*+LABEL('learn.VerticaModel.fit')*/ 
                                 *{id_column} 
@@ -849,6 +850,7 @@ def fit(
             self.test_relation = test_relation
         else:
             self.test_relation = self.input_relation
+        # Fitting
         if self._is_native:
             parameters = self._get_vertica_param_dict()
             if (
@@ -917,15 +919,6 @@ def _attributes(self) -> list:
     def __init__(self) -> None:
         """Must be overridden in the child class"""
         self.features_importance_trees_ = {}
-        return None
-        # self.input_relation = None
-        # self.test_relation = None
-        # self.X = None
-        # self.y = None
-        # self.parameters = {}
-        # self.classes_ = None
-        # for att in self._attributes:
-        #    setattr(self, att, None)
 
     def _compute_trees_arrays(
         self, tree: TableSample, X: list, return_probability: bool = False
@@ -1265,6 +1258,10 @@ def plot_tree(
 
 
 class BinaryClassifier(Supervised):
+    """
+    Base Class for Vertica Binary Classifier.
+    """
+
     # Properties.
 
     @property
@@ -1805,13 +1802,16 @@ def roc_curve(
 
 
 class MulticlassClassifier(Supervised):
+    """
+    Base Class for Vertica Multiclass Classifiers.
+    """
+
     # System & Special Methods.
 
     @abstractmethod
     def __init__(self, name: str, overwrite_model: bool = False) -> None:
         """Must be overridden in the child class"""
         super().__init__(name, overwrite_model)
-        # self.classes_ = None
 
     def _check_pos_label(self, pos_label: PythonScalar) -> PythonScalar:
         """
@@ -2645,6 +2645,10 @@ def roc_curve(
 
 
 class Regressor(Supervised):
+    """
+    Base Class for Vertica Regressors.
+    """
+
     # System & Special Methods.
 
     @abstractmethod
@@ -2845,7 +2849,7 @@ def predict(
         Returns
         -------
         vDataFrame
-                the input object.
+            the input object.
         """
         if hasattr(self, "_predict"):
             return self._predict(vdf=vdf, X=X, name=name, inplace=inplace)
@@ -2904,6 +2908,8 @@ def fit(
         str
             model's summary.
         """
+
+        # Initialization
         if self.overwrite_model:
             self.drop()
         else:
@@ -2938,10 +2944,9 @@ def fit(
             relation = gen_tmp_name(
                 schema=schema_relation(self.model_name)[0], name="view"
             )
-            drop(relation, method="view")
             _executeSQL(
                 query=f"""
-                    CREATE VIEW {relation} AS 
+                    CREATE OR REPLACE VIEW {relation} AS 
                         SELECT 
                             /*+LABEL('learn.VerticaModel.fit')*/ *
                             {id_column} 
@@ -2962,6 +2967,7 @@ def fit(
         parameters = self._get_vertica_param_dict()
         if "num_components" in parameters and not parameters["num_components"]:
             del parameters["num_components"]
+        # Fitting
         fun = self._vertica_fit_sql if self._model_type != "MCA" else "PCA"
         query = f"""
             SELECT 
diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py
index 3700df290..1814001da 100755
--- a/verticapy/machine_learning/vertica/linear_model.py
+++ b/verticapy/machine_learning/vertica/linear_model.py
@@ -43,6 +43,10 @@
 
 
 class LinearModel:
+    """
+    Base Class for Vertica Linear Models.
+    """
+
     # Properties.
 
     @property
@@ -188,6 +192,10 @@ def plot(
 
 
 class LinearModelClassifier(LinearModel):
+    """
+    Base Class for Vertica Linear Models Classifiers.
+    """
+
     # Properties.
 
     @property
diff --git a/verticapy/machine_learning/vertica/tsa.py b/verticapy/machine_learning/vertica/tsa.py
new file mode 100755
index 000000000..2fddb6d33
--- /dev/null
+++ b/verticapy/machine_learning/vertica/tsa.py
@@ -0,0 +1,432 @@
+"""
+Copyright  (c)  2018-2023 Open Text  or  one  of its
+affiliates.  Licensed  under  the   Apache  License,
+Version 2.0 (the  "License"); You  may  not use this
+file except in compliance with the License.
+
+You may obtain a copy of the License at:
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless  required  by applicable  law or  agreed to in
+writing, software  distributed  under the  License is
+distributed on an  "AS IS" BASIS,  WITHOUT WARRANTIES
+OR CONDITIONS OF ANY KIND, either express or implied.
+See the  License for the specific  language governing
+permissions and limitations under the License.
+"""
+from abc import abstractmethod
+from typing import Literal, Optional, Union
+
+import numpy as np
+
+from verticapy._typing import (
+    NoneType,
+    SQLRelation,
+)
+from verticapy._utils._gen import gen_name, gen_tmp_name
+from verticapy._utils._sql._collect import save_verticapy_logs
+from verticapy._utils._sql._format import (
+    clean_query,
+    quote_ident,
+    schema_relation,
+)
+from verticapy._utils._sql._sys import _executeSQL
+from verticapy._utils._sql._vertica_version import (
+    check_minimum_version,
+)
+
+from verticapy.core.vdataframe.base import vDataFrame
+
+from verticapy.machine_learning.vertica.base import VerticaModel
+
+from verticapy.sql.drop import drop
+
+"""
+General Classes.
+"""
+
+
+class TimeSeriesModelBase(VerticaModel):
+    """
+    Base Class for Vertica Time Series Models.
+    """
+
+    # Properties.
+
+    @property
+    def _attributes(self) -> list[str]:
+        """
+        Names of the attributes that are set
+        by the '_compute_attributes' method.
+        """
+        fitted = ("phi_", "theta_", "mse_", "mean_")
+        return list(fitted)
+
+    # Attributes Methods.
+
+    def _compute_attributes(self) -> None:
+        """
+        Sets 'phi_', 'theta_', 'mse_' and 'mean_' from the values
+        returned by 'get_vertica_attributes'.
+        """
+        values = self.get_vertica_attributes("coefficients")["value"]
+        params = self.parameters
+        # Leading 'n_phi' coefficients go to 'phi_', the rest to 'theta_'.
+        if "p" in params:
+            n_phi = params["p"]
+        else:
+            n_phi = params["order"][0] if "order" in params else 0
+        self.phi_ = np.array(values[:n_phi])
+        self.theta_ = np.array(values[n_phi:])
+        mse = self.get_vertica_attributes("mean_squared_error")
+        self.mse_ = mse["mean_squared_error"][0]
+        self.mean_ = self.get_vertica_attributes("mean")["mean"][0]
+
+    # System & Special Methods.
+
+    @abstractmethod
+    def __init__(self, name: str, overwrite_model: bool = False) -> None:
+        """Abstract initializer: child classes must override it."""
+        super().__init__(name, overwrite_model)
+
+    # Model Fitting Method.
+
+    def fit(
+        self,
+        input_relation: SQLRelation,
+        ts: str,
+        y: str,
+        test_relation: SQLRelation = "",
+        return_report: bool = False,
+    ) -> Optional[str]:
+        """
+        Trains the model.
+
+        Parameters
+        ----------
+        input_relation: SQLRelation
+            Training relation.
+        ts: str
+            TS (Time Series)  vDataColumn used to order
+            the data.  The vDataColumn type must be  date
+            (date, datetime, timestamp...) or numerical.
+        y: str
+            Response column.
+        test_relation: SQLRelation, optional
+            Relation used to test the model.
+        return_report: bool, optional
+            [For native models]
+            When set to True, the model summary
+            will be returned. Otherwise, it will
+            be printed.
+
+        Returns
+        -------
+        str
+            model's summary if 'return_report' is True, else None.
+        """
+
+        # Initialization
+        if self.overwrite_model:
+            self.drop()
+        else:
+            self._is_already_stored(raise_error=True)
+        self.ts = quote_ident(ts)
+        self.y = quote_ident(y)
+        tmp_view = False
+        if isinstance(input_relation, vDataFrame) and self._is_native:
+            tmp_view = True
+            # The test above guarantees a vDataFrame, so no string
+            # fallback is needed here: the temporary view is built
+            # on top of its current relation.
+            self.input_relation = input_relation.current_relation()
+            relation = gen_tmp_name(
+                schema=schema_relation(self.model_name)[0], name="view"
+            )
+            _executeSQL(
+                query=f"""
+                    CREATE OR REPLACE VIEW {relation} AS 
+                        SELECT 
+                            /*+LABEL('learn.VerticaModel.fit')*/ 
+                            {self.ts}, {self.y}
+                        FROM {self.input_relation}""",
+                title="Creating a temporary view to fit the model.",
+            )
+        else:
+            self.input_relation = input_relation
+            relation = input_relation
+        if isinstance(test_relation, vDataFrame):
+            self.test_relation = test_relation.current_relation()
+        elif test_relation:
+            self.test_relation = test_relation
+        else:
+            self.test_relation = self.input_relation
+        # Fitting
+        if self._is_native:
+            parameters = self._get_vertica_param_dict()
+            if "order" in parameters:
+                parameters["p"] = parameters["order"][0]
+                parameters["q"] = parameters["order"][-1]
+                if len(parameters["order"]) == 3:
+                    parameters["d"] = parameters["order"][1]
+                del parameters["order"]
+            query = f"""
+                SELECT 
+                    /*+LABEL('learn.VerticaModel.fit')*/ 
+                    {self._vertica_fit_sql}
+                    ('{self.model_name}', 
+                     '{relation}',
+                     '{self.y}',
+                     '{self.ts}' 
+                     USING PARAMETERS 
+                     {', '.join([f"{p} = {parameters[p]}" for p in parameters])})"""
+            try:
+                _executeSQL(query, title="Fitting the model.")
+            finally:
+                if tmp_view:
+                    drop(relation, method="view")
+        self._compute_attributes()
+        if self._is_native:
+            report = self.summarize()
+            if return_report:
+                return report
+            print(report)
+        return None
+
+    # I/O Methods.
+
+    def deploySQL(
+        self,
+        ts: Optional[str] = None,
+        y: Optional[str] = None,
+        start: Optional[int] = None,
+        npredictions: int = 10,
+    ) -> str:
+        """
+        Returns the SQL code needed to deploy the model.
+
+        Parameters
+        ----------
+        ts: str, optional
+            TS (Time Series)  vDataColumn used to order
+            the data.  The vDataColumn type must be  date
+            (date, datetime, timestamp...) or numerical.
+        y: str, optional
+            Response column.
+        start: int, optional
+            The behavior of the start parameter and its
+            range of accepted values depends on whether
+            you provide a timeseries-column (ts):
+
+              - No provided timeseries-column: start must
+                be an integer greater or equal to 0, where
+                zero indicates to start prediction at the
+                end of the in-sample data. If start is a
+                positive value, the function predicts the
+                values between the end of the in-sample
+                data and the start index, and then uses the
+                predicted values as time series inputs for
+                the subsequent npredictions.
+              - timeseries-column provided: start must be an
+                integer greater or equal to 1 and identifies
+                the index (row) of the timeseries-column at
+                which to begin prediction. If the start index
+                is greater than the number of rows, N, in the
+                input data, the function predicts the values
+                between N and start and uses the predicted
+                values as time series inputs for the subsequent
+                npredictions.
+
+            Default:
+
+              - No provided timeseries-column: prediction begins
+                from the end of the in-sample data.
+              - timeseries-column provided: prediction begins from
+                the end of the provided input data.
+        npredictions: int, optional
+            Integer greater or equal to 1, the number of predicted
+            timesteps.
+
+        Returns
+        -------
+        str
+            the SQL code needed to deploy the model.
+        """
+        if self._vertica_predict_sql:
+            # Initialization
+            if isinstance(ts, NoneType):
+                ts = ""
+            else:
+                # PREDICT_ARIMA orders its input with an ORDER BY
+                # inside the OVER clause.
+                ts = "ORDER BY " + quote_ident(ts)
+            y = "" if isinstance(y, NoneType) else quote_ident(y)
+            # The trailing comma separates 'start' from 'npredictions'.
+            if isinstance(start, NoneType):
+                start = ""
+            else:
+                start = f"start = {start},"
+            # Deployment
+            sql = f"""
+                {self._vertica_predict_sql}({y}
+                                            USING PARAMETERS 
+                                            model_name = '{self.model_name}',
+                                            {start}
+                                            npredictions = {npredictions}) 
+                                            OVER ({ts})"""
+            return clean_query(sql)
+        else:
+            raise AttributeError(
+                f"Method 'deploySQL' does not exist for {self._model_type} models."
+            )
+
+    # Prediction / Transformation Methods.
+
+    def predict(
+        self,
+        vdf: Optional[SQLRelation] = None,
+        ts: Optional[str] = None,
+        y: Optional[str] = None,
+        start: Optional[int] = None,
+        npredictions: int = 10,
+    ) -> vDataFrame:
+        """
+        Predicts with the deployed model and returns a vDataFrame.
+
+        Parameters
+        ----------
+        vdf: SQLRelation, optional
+            Relation the prediction is read from; when omitted,
+            no FROM clause is added to the query. A customized
+            relation must be enclosed with an alias: for example,
+            "(SELECT 1) x" is valid, whereas "(SELECT 1)" and
+            "SELECT 1" are invalid.
+        ts: str, optional
+            TS (Time Series)  vDataColumn used to order
+            the data.  The vDataColumn type must be  date
+            (date, datetime, timestamp...) or numerical.
+        y: str, optional
+            Response column.
+        start: int, optional
+            The accepted values of the start parameter and
+            its behavior depend on whether a
+            timeseries-column (ts) is provided:
+
+              - No provided timeseries-column: start must
+                be an integer greater or equal to 0, where
+                zero indicates to start prediction at the
+                end of the in-sample data. If start is a
+                positive value, the function predicts the
+                values between the end of the in-sample
+                data and the start index, and then uses the
+                predicted values as time series inputs for
+                the subsequent npredictions.
+              - timeseries-column provided: start must be an
+                integer greater or equal to 1 and identifies
+                the index (row) of the timeseries-column at
+                which to begin prediction. If the start index
+                is greater than the number of rows, N, in the
+                input data, the function predicts the values
+                between N and start and uses the predicted
+                values as time series inputs for the subsequent
+                npredictions.
+
+            Default:
+
+              - No provided timeseries-column: prediction begins
+                from the end of the in-sample data.
+              - timeseries-column provided: prediction begins from
+                the end of the provided input data.
+        npredictions: int, optional
+            Number of predicted timesteps: an integer greater
+            or equal to 1.
+
+        Returns
+        -------
+        vDataFrame
+            a new object.
+        """
+        prediction = self.deploySQL(
+            ts=ts, y=y, start=start, npredictions=npredictions
+        )
+        # The FROM clause is only needed when a relation is supplied.
+        source = "" if isinstance(vdf, NoneType) else f" FROM {vdf}"
+        return vDataFrame(f"SELECT {prediction}{source}")
+
+
+class ARIMA(TimeSeriesModelBase):
+    """
+    Creates an ARIMA model, fitted with the Vertica 'ARIMA' function.
+
+    Parameters
+    ----------
+    name: str, optional
+        Name of the model. The model is stored in the database.
+    overwrite_model: bool, optional
+        If set to True, training a model with the same name as
+        an existing model overwrites the existing model.
+    order: tuple | list, optional
+        The (p, d, q) order of the model: exactly 3 integers.
+    tol, max_iter, init, missing: optional
+        Fitting options stored in the model parameters; 'init'
+        and 'missing' are converted to lowercase strings.
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage. For more
+    detailed examples, please refer to the :ref:`user_guide.machine_learning`
+    or the `Examples <https://www.vertica.com/python/examples/>`_
+    section on the website.
+    """
+
+    # Properties.
+
+    @property
+    def _vertica_fit_sql(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    @property
+    def _vertica_predict_sql(self) -> Literal["PREDICT_ARIMA"]:
+        return "PREDICT_ARIMA"
+
+    @property
+    def _model_subcategory(self) -> Literal["TIMESERIES"]:
+        return "TIMESERIES"
+
+    @property
+    def _model_type(self) -> Literal["ARIMA"]:
+        return "ARIMA"
+
+    # System & Special Methods.
+
+    @check_minimum_version
+    @save_verticapy_logs
+    def __init__(
+        self,
+        name: str = None,
+        overwrite_model: bool = False,
+        order: Union[tuple[int], list[int]] = (0, 0, 0),
+        tol: float = 1e-6,
+        max_iter: int = 100,
+        init: Literal["zero", "hr"] = "zero",
+        missing: Literal[
+            "drop", "raise", "zero", "linear_interpolation"
+        ] = "linear_interpolation",
+    ) -> None:
+        super().__init__(name, overwrite_model)
+        if not isinstance(order, (tuple, list)) or len(order) != 3:
+            raise ValueError(
+                "Parameter 'order' must be a tuple or a list of 3 elements."
+            )
+        if not all(isinstance(x, int) for x in order):
+            raise ValueError(
+                "Parameter 'order' must be a tuple or a list of integers."
+            )
+        self.parameters = {
+            "order": order,
+            "tol": tol,
+            "max_iter": max_iter,
+            "init": str(init).lower(),
+            "missing": str(missing).lower(),
+        }