diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index 1332b9ca5..554a7a98f 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -324,177 +324,6 @@ class ElasticNet(Regressor, LinearModel): in training the model. Note that setting fit_intercept to false does not work well with the BFGS optimizer. - """ - - # Properties. - - @property - def _vertica_fit_sql(self) -> Literal["LINEAR_REG"]: - return "LINEAR_REG" - - @property - def _vertica_predict_sql(self) -> Literal["PREDICT_LINEAR_REG"]: - return "PREDICT_LINEAR_REG" - - @property - def _model_subcategory(self) -> Literal["REGRESSOR"]: - return "REGRESSOR" - - @property - def _model_type(self) -> Literal["LinearRegression"]: - return "LinearRegression" - - # System & Special Methods. - - @check_minimum_version - @save_verticapy_logs - def __init__( - self, - name: str = None, - overwrite_model: bool = False, - tol: float = 1e-6, - C: PythonNumber = 1.0, - max_iter: int = 100, - solver: Literal["newton", "bfgs", "cgd"] = "cgd", - l1_ratio: float = 0.5, - fit_intercept: bool = True, - ) -> None: - super().__init__(name, overwrite_model) - if vertica_version()[0] < 12 and not fit_intercept: - raise VersionError( - "The parameter 'fit_intercept' can be activated for " - "Vertica versions greater or equal to 12." - ) - self.parameters = { - "penalty": "enet", - "tol": tol, - "C": C, - "max_iter": max_iter, - "solver": str(solver).lower(), - "l1_ratio": l1_ratio, - "fit_intercept": fit_intercept, - } - - -class Lasso(Regressor, LinearModel): - """ - Creates a Lasso object using the Vertica - Linear Regression algorithm. - Lasso is a regularized regression method - that uses an L1 penalty. - - Parameters - ---------- - name: str, optional - Name of the model. The model is stored in the - database. - overwrite_model: bool, optional - If set to True, training a model with the same - name as an existing model overwrites the - existing model. - tol: float, optional - Determines whether the algorithm has reached - the specified accuracy result. - C: PythonNumber, optional - The regularization parameter value. The value - must be zero or non-negative. - max_iter: int, optional - Determines the maximum number of iterations - the algorithm performs before achieving the - specified accuracy result. - solver: str, optional - The optimizer method used to train the model. - newton : Newton Method. - bfgs : Broyden Fletcher Goldfarb Shanno. - cgd : Coordinate Gradient Descent. - fit_intercept: bool, optional - Boolean, specifies whether the model includes an - intercept. If set to false, no intercept is - used in training the model. Note that setting - fit_intercept to false does not work well with the - BFGS optimizer. - """ - - # Properties. - - @property - def _vertica_fit_sql(self) -> Literal["LINEAR_REG"]: - return "LINEAR_REG" - - @property - def _vertica_predict_sql(self) -> Literal["PREDICT_LINEAR_REG"]: - return "PREDICT_LINEAR_REG" - - @property - def _model_subcategory(self) -> Literal["REGRESSOR"]: - return "REGRESSOR" - - @property - def _model_type(self) -> Literal["LinearRegression"]: - return "LinearRegression" - - # System & Special Methods. 
- - @check_minimum_version - @save_verticapy_logs - def __init__( - self, - name: str = None, - overwrite_model: bool = False, - tol: float = 1e-6, - C: PythonNumber = 1.0, - max_iter: int = 100, - solver: Literal["newton", "bfgs", "cgd"] = "cgd", - fit_intercept: bool = True, - ) -> None: - super().__init__(name, overwrite_model) - if vertica_version()[0] < 12 and not fit_intercept: - raise VersionError( - "The parameter 'fit_intercept' can be activated for " - "Vertica versions greater or equal to 12." - ) - self.parameters = { - "penalty": "l1", - "tol": tol, - "C": C, - "max_iter": max_iter, - "solver": str(solver).lower(), - "fit_intercept": fit_intercept, - } - - -class LinearRegression(Regressor, LinearModel): - """ - Creates a LinearRegression object using the Vertica - Linear Regression algorithm. - - Parameters - ---------- - name: str, optional - Name of the model. The model is stored in the - database. - overwrite_model: bool, optional - If set to True, training a model with the same - name as an existing model overwrites the - existing model. - tol: float, optional - Determines whether the algorithm has reached the - specified accuracy result. - max_iter: int, optional - Determines the maximum number of iterations the - algorithm performs before achieving the specified - accuracy result. - solver: str, optional - The optimizer method used to train the model. - newton : Newton Method. - bfgs : Broyden Fletcher Goldfarb Shanno. - fit_intercept: bool, optional - Boolean, specifies whether the model includes an - intercept. If set to false, no intercept is - used in training the model. Note that setting - fit_intercept to false does not work well with the - BFGS optimizer. - Examples --------- @@ -515,13 +344,13 @@ class LinearRegression(Regressor, LinearModel): .. hint:: - By assigning an alias to ``verticapy``, we mitigate the risk of code - collisions with other libraries. This precaution is necessary - because verticapy uses commonly known function names like "average" - and "median", which can potentially lead to naming conflicts. - The use of an alias ensures that the functions from verticapy are - used as intended without interfering with functions from other - libraries. + By assigning an alias to ``verticapy``, we mitigate the risk + of code collisions with other libraries. This precaution is + necessary because verticapy uses commonly known function names + like "average" and "median", which can potentially lead to naming + conflicts. The use of an alias ensures that the functions from + verticapy are used as intended without interfering with functions + from other libraries. For this example, we will use the winequality dataset. @@ -544,10 +373,10 @@ class LinearRegression(Regressor, LinearModel): resources for honing your data analysis and machine learning skills within the VerticaPy environment. - You can easily divide your dataset into training and testing subsets using the - :py:mod:`vDataFrame.train_test_split` method. This is a crucial step when preparing - your data for machine learning, as it allows you to evaluate the performance of - your models accurately. + You can easily divide your dataset into training and testing subsets + using the :py:mod:`vDataFrame.train_test_split` method. This is a + crucial step when preparing your data for machine learning, as it + allows you to evaluate the performance of your models accurately. .. 
code-block:: python @@ -574,20 +403,22 @@ class LinearRegression(Regressor, LinearModel): Model Initialization ^^^^^^^^^^^^^^^^^^^^^ - First we import the ``LinearRegression`` model: + First we import the ``ElasticNet`` model: .. code-block:: - from verticapy.machine_learning.vertica import LinearRegression + from verticapy.machine_learning.vertica import ElasticNet Then we can create the model: .. code-block:: - model = LinearRegression( + model = ElasticNet( tol = 1e-6, + C = 1, max_iter = 100, - solver = 'Newton', + solver = 'CGD', + l1_ratio = 0.5, fit_intercept = True, ) @@ -607,11 +438,13 @@ class LinearRegression(Regressor, LinearModel): .. ipython:: python :suppress: - from verticapy.machine_learning.vertica import LinearRegression - model = LinearRegression( + from verticapy.machine_learning.vertica import ElasticNet + model = ElasticNet( tol = 1e-6, + C = 1, max_iter = 100, - solver = 'Newton', + solver = 'CGD', + l1_ratio = 0.5, fit_intercept = True, ) @@ -621,6 +454,7 @@ class LinearRegression(Regressor, LinearModel): We can now fit the model: .. ipython:: python + :okwarning: model.fit( train, @@ -644,31 +478,6 @@ class LinearRegression(Regressor, LinearModel): don't work using ``X`` matrices and ``y`` vectors. Instead, we work directly with lists of predictors and the response name. - Features Importance - ^^^^^^^^^^^^^^^^^^^^ - - We can conveniently get the features importance: - - .. ipython:: python - :suppress: - - vp.set_option("plotting_lib", "plotly") - fig = model.features_importance() - fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html") - - .. code-block:: python - - result = model.features_importance() - - .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html - - .. note:: - - For ``LinearModel``, feature importance is computed using the coefficients. - These coefficients are then normalized using the feature distribution. An - activation function is applied to get the final score. - Metrics ^^^^^^^^ @@ -678,16 +487,16 @@ class LinearRegression(Regressor, LinearModel): :suppress: result = model.report() - html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_elasticnet_report.html", "w") html_file.write(result._repr_html_()) html_file.close() .. code-block:: python - result = model.report() + model.report() .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_elasticnet_report.html .. important:: @@ -702,16 +511,16 @@ class LinearRegression(Regressor, LinearModel): :suppress: result = model.report(metrics = "anova") - html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_elasticnet_report_anova.html", "w") html_file.write(result._repr_html_()) html_file.close() .. code-block:: python - result = model.report(metrics = "anova") + model.report(metrics = "anova") .. 
raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_elasticnet_report_anova.html You can also use the ``LinearModel.score`` function to compute the R-squared value: @@ -740,7 +549,7 @@ class LinearRegression(Regressor, LinearModel): ], "prediction", ) - html_file = open("figures/machine_learning_vertica_linear_model_lr_prediction.html", "w") + html_file = open("figures/machine_learning_vertica_linear_model_elasticnet_prediction.html", "w") html_file.write(result._repr_html_()) html_file.close() @@ -760,7 +569,7 @@ class LinearRegression(Regressor, LinearModel): ) .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_elasticnet_prediction.html .. note:: @@ -887,8 +696,10 @@ def __init__( name: str = None, overwrite_model: bool = False, tol: float = 1e-6, + C: PythonNumber = 1.0, max_iter: int = 100, - solver: Literal["newton", "bfgs"] = "newton", + solver: Literal["newton", "bfgs", "cgd"] = "cgd", + l1_ratio: float = 0.5, fit_intercept: bool = True, ) -> None: super().__init__(name, overwrite_model) @@ -898,149 +709,1864 @@ def __init__( "Vertica versions greater or equal to 12." ) self.parameters = { - "penalty": "none", + "penalty": "enet", "tol": tol, + "C": C, "max_iter": max_iter, "solver": str(solver).lower(), + "l1_ratio": l1_ratio, "fit_intercept": fit_intercept, } -class Ridge(Regressor, LinearModel): +class Lasso(Regressor, LinearModel): """ - Creates a Ridge object using the Vertica + Creates a Lasso object using the Vertica Linear Regression algorithm. - Ridge is a regularized regression method - which uses an L2 penalty. + Lasso is a regularized regression method + that uses an L1 penalty. Parameters ---------- name: str, optional - Name of the model. The model is stored in the + Name of the model. The model is stored in the database. overwrite_model: bool, optional If set to True, training a model with the same name as an existing model overwrites the existing model. tol: float, optional - Determines whether the algorithm has reached + Determines whether the algorithm has reached the specified accuracy result. C: PythonNumber, optional - The regularization parameter value. The value + The regularization parameter value. The value must be zero or non-negative. max_iter: int, optional - Determines the maximum number of iterations - the algorithm performs before achieving the + Determines the maximum number of iterations + the algorithm performs before achieving the specified accuracy result. solver: str, optional The optimizer method used to train the model. newton : Newton Method. bfgs : Broyden Fletcher Goldfarb Shanno. + cgd : Coordinate Gradient Descent. fit_intercept: bool, optional - Boolean, specifies whether the model includes - an intercept. If set to false, no intercept - is used in training the model. - Note that setting fit_intercept to false does - not work well with the BFGS optimizer. - """ + Boolean, specifies whether the model includes an + intercept. If set to false, no intercept is + used in training the model. Note that setting + fit_intercept to false does not work well with the + BFGS optimizer. + + Examples + --------- + + The following examples provide a basic understanding of usage. 
+ For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples <https://www.vertica.com/python/examples/>`_ + section on the website. + + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. code-block:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk + of code collisions with other libraries. This precaution is + necessary because verticapy uses commonly known function names + like "average" and "median", which can potentially lead to naming + conflicts. The use of an alias ensures that the functions from + verticapy are used as intended without interfering with functions + from other libraries. + + For this example, we will use the winequality dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_winequality() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + You can easily divide your dataset into training and testing subsets + using the :py:mod:`vDataFrame.train_test_split` method. This is a + crucial step when preparing your data for machine learning, as it + allows you to evaluate the performance of your models accurately. + + .. code-block:: python + + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + .. warning:: + + In this case, VerticaPy utilizes seeded randomization to guarantee + the reproducibility of your data split. However, please be aware + that this approach may lead to reduced performance. For a more + efficient data split, you can use the :py:mod:`vDataFrame.to_db` + method to save your results into ``tables`` or ``temporary tables``. + This will help enhance the overall performance of the process. + + .. ipython:: python + :suppress: + + import verticapy as vp + import verticapy.datasets as vpd + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``Lasso`` model: + + .. code-block:: + + from verticapy.machine_learning.vertica import Lasso + + Then we can create the model: + + .. code-block:: + + model = Lasso( + tol = 1e-6, + C = 0.5, + max_iter = 100, + solver = 'CGD', + ) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + .. ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import Lasso + model = Lasso( + tol = 1e-6, + C = 0.5, + max_iter = 100, + solver = 'CGD', + ) + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. 
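note::
+
+        As an alternative to the call below, ``fit`` also accepts the
+        name of a relation stored in the database in place of a
+        ``vDataFrame`` (a sketch; the table name is illustrative):
+
+        .. code-block:: python
+
+            model.fit(
+                "public.winequality_train",
+                ["fixed_acidity", "volatile_acidity", "citric_acid"],
+                "quality",
+            )
+
+    ..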
ipython:: python + :okwarning: + + model.fit( + train, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "quality", + test, + ) + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. ipython:: python + :suppress: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lasso_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lasso_report.html + + .. important:: + + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["mse", "r2"])``. + + For ``LinearModel``, we can easily get the ANOVA table using: + + .. ipython:: python + :suppress: + + result = model.report(metrics = "anova") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lasso_report_anova.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report(metrics = "anova") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lasso_report_anova.html + + You can also use the ``LinearModel.score`` function to compute the R-squared + value: + + .. ipython:: python + + model.score() + + Prediction + ^^^^^^^^^^^ + + Prediction is straight-forward: + + .. ipython:: python + :suppress: + + result = model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + html_file = open("figures/machine_learning_vertica_linear_model_lasso_prediction.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lasso_prediction.html + + .. note:: + + Predictions can be made automatically using the test set, in which + case you don't need to specify the predictors. Alternatively, you + can pass only the ``vDataFrame`` to the + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict` + function, but in this case, it's essential that the column names of + the ``vDataFrame`` match the predictors and response name in the + model. + + Plots + ^^^^^^ + + If the model allows, you can also generate relevant plots. For example, + regression plots can be found in the :ref:`chart_gallery.regression_plot`. + + .. code-block:: python + + model.plot() + + .. important:: + + The plotting feature is typically suitable for models with fewer than + three predictors. + + Parameter Modification + ^^^^^^^^^^^^^^^^^^^^^^^ + + In order to see the parameters: + + .. 
ipython:: python + + model.get_params() + + And to manually change some of the parameters: + + .. ipython:: python + + model.set_params({'tol': 0.001}) + + Model Register + ^^^^^^^^^^^^^^ + + In order to register the model for tracking and versioning: + + .. code-block:: python + + model.register("model_v1") + + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for + more details on model tracking and versioning. + + Model Exporting + ^^^^^^^^^^^^^^^^ + + **To Memmodel** + + .. code-block:: python + + model.to_memmodel() + + .. note:: + + ``MemModel`` objects serve as in-memory representations of machine + learning models. They can be used for both in-database and in-memory + prediction tasks. These objects can be pickled in the same way that + you would pickle a ``scikit-learn`` model. + + The following methods for exporting the model use ``MemModel``, and it + is recommended to use ``MemModel`` directly. + + **To SQL** + + You can get the SQL code by: + + .. ipython:: python + + model.to_sql() + + **To Python** + + To obtain the prediction function in Python syntax, use the following code: + + .. ipython:: python + + X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]] + model.to_python()(X) + + .. hint:: + + The + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.to_python` + method is used to retrieve predictions, + probabilities, or cluster distances. For specific details on how to + use this method for different model types, refer to the relevant + documentation for each model. + """ + + # Properties. + + @property + def _vertica_fit_sql(self) -> Literal["LINEAR_REG"]: + return "LINEAR_REG" + + @property + def _vertica_predict_sql(self) -> Literal["PREDICT_LINEAR_REG"]: + return "PREDICT_LINEAR_REG" + + @property + def _model_subcategory(self) -> Literal["REGRESSOR"]: + return "REGRESSOR" + + @property + def _model_type(self) -> Literal["LinearRegression"]: + return "LinearRegression" + + # System & Special Methods. + + @check_minimum_version + @save_verticapy_logs + def __init__( + self, + name: str = None, + overwrite_model: bool = False, + tol: float = 1e-6, + C: PythonNumber = 1.0, + max_iter: int = 100, + solver: Literal["newton", "bfgs", "cgd"] = "cgd", + fit_intercept: bool = True, + ) -> None: + super().__init__(name, overwrite_model) + if vertica_version()[0] < 12 and not fit_intercept: + raise VersionError( + "The parameter 'fit_intercept' can be activated for " + "Vertica versions greater or equal to 12." + ) + self.parameters = { + "penalty": "l1", + "tol": tol, + "C": C, + "max_iter": max_iter, + "solver": str(solver).lower(), + "fit_intercept": fit_intercept, + } + + +class LinearRegression(Regressor, LinearModel): + """ + Creates a LinearRegression object using the Vertica + Linear Regression algorithm. + + Parameters + ---------- + name: str, optional + Name of the model. The model is stored in the + database. + overwrite_model: bool, optional + If set to True, training a model with the same + name as an existing model overwrites the + existing model. + tol: float, optional + Determines whether the algorithm has reached the + specified accuracy result. + max_iter: int, optional + Determines the maximum number of iterations the + algorithm performs before achieving the specified + accuracy result. + solver: str, optional + The optimizer method used to train the model. + newton : Newton Method. + bfgs : Broyden Fletcher Goldfarb Shanno. + fit_intercept: bool, optional + Boolean, specifies whether the model includes an + intercept. 
If set to false, no intercept is + used in training the model. Note that setting + fit_intercept to false does not work well with the + BFGS optimizer. + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples <https://www.vertica.com/python/examples/>`_ + section on the website. + + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. code-block:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk + of code collisions with other libraries. This precaution is + necessary because verticapy uses commonly known function names + like "average" and "median", which can potentially lead to naming + conflicts. The use of an alias ensures that the functions from + verticapy are used as intended without interfering with functions + from other libraries. + + For this example, we will use the winequality dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_winequality() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + You can easily divide your dataset into training and testing subsets + using the :py:mod:`vDataFrame.train_test_split` method. This is a + crucial step when preparing your data for machine learning, as it + allows you to evaluate the performance of your models accurately. + + .. code-block:: python + + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + .. warning:: + + In this case, VerticaPy utilizes seeded randomization to guarantee + the reproducibility of your data split. However, please be aware + that this approach may lead to reduced performance. For a more + efficient data split, you can use the :py:mod:`vDataFrame.to_db` + method to save your results into ``tables`` or ``temporary tables``. + This will help enhance the overall performance of the process. + + .. ipython:: python + :suppress: + + import verticapy as vp + import verticapy.datasets as vpd + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``LinearRegression`` model: + + .. code-block:: + + from verticapy.machine_learning.vertica import LinearRegression + + Then we can create the model: + + .. code-block:: + + model = LinearRegression( + tol = 1e-6, + max_iter = 100, + solver = 'Newton', + fit_intercept = True, + ) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + .. 
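hint::
+
+        For example, an explicit name makes the model easy to manage
+        and fetch again later (a sketch; the schema and model name are
+        illustrative):
+
+        .. code-block:: python
+
+            model = LinearRegression(
+                "my_schema.wine_lr_v1",
+                overwrite_model = True,
+            )
+
+    ..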
ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import LinearRegression + model = LinearRegression( + tol = 1e-6, + max_iter = 100, + solver = 'Newton', + fit_intercept = True, + ) + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + + model.fit( + train, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "quality", + test, + ) + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + .. ipython:: python + :suppress: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html") + + .. code-block:: python + + result = model.features_importance() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html + + .. note:: + + For ``LinearModel``, feature importance is computed using + the coefficients. These coefficients are then normalized using the + feature distribution. An activation function is applied to + get the final score. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. ipython:: python + :suppress: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + result = model.report() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html + + .. important:: + + Most metrics are computed using a single SQL query, but some + of them might require multiple SQL queries. Selecting only the + necessary metrics in the report can help optimize performance. + E.g. ``model.report(metrics = ["mse", "r2"])``. + + For ``LinearModel``, we can easily get the ANOVA table using: + + .. ipython:: python + :suppress: + + result = model.report(metrics = "anova") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + result = model.report(metrics = "anova") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html + + You can also use the ``LinearModel.score`` function to compute the R-squared + value: + + .. ipython:: python + + model.score() + + Prediction + ^^^^^^^^^^^ + + Prediction is straight-forward: + + .. ipython:: python + :suppress: + + result = model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + html_file = open("figures/machine_learning_vertica_linear_model_lr_prediction.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + + .. 
raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html + + .. note:: + + Predictions can be made automatically using the test set, in which + case you don't need to specify the predictors. Alternatively, you + can pass only the ``vDataFrame`` to the + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict` + function, but in this case, it's essential that the column names of + the ``vDataFrame`` match the predictors and response name in the + model. + + Plots + ^^^^^^ + + If the model allows, you can also generate relevant plots. For example, + regression plots can be found in the :ref:`chart_gallery.regression_plot`. + + .. code-block:: python + + model.plot() + + .. important:: + + The plotting feature is typically suitable for models with fewer + than three predictors. + + Parameter Modification + ^^^^^^^^^^^^^^^^^^^^^^^ + + In order to see the parameters: + + .. ipython:: python + + model.get_params() + + And to manually change some of the parameters: + + .. ipython:: python + + model.set_params({'tol': 0.001}) + + Model Register + ^^^^^^^^^^^^^^ + + In order to register the model for tracking and versioning: + + .. code-block:: python + + model.register("model_v1") + + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for + more details on model tracking and versioning. + + Model Exporting + ^^^^^^^^^^^^^^^^ + + **To Memmodel** + + .. code-block:: python + + model.to_memmodel() + + .. note:: + + ``MemModel`` objects serve as in-memory representations of machine + learning models. They can be used for both in-database and in-memory + prediction tasks. These objects can be pickled in the same way that + you would pickle a ``scikit-learn`` model. + + The following methods for exporting the model use ``MemModel``, and it + is recommended to use ``MemModel`` directly. + + **To SQL** + + You can get the SQL code by: + + .. ipython:: python + + model.to_sql() + + **To Python** + + To obtain the prediction function in Python syntax, use the following code: + + .. ipython:: python + + X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]] + model.to_python()(X) + + .. hint:: + + The + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.to_python` + method is used to retrieve predictions, + probabilities, or cluster distances. For specific details on how + to use this method for different model types, refer to the + relevant documentation for each model. + """ + + # Properties. + + @property + def _vertica_fit_sql(self) -> Literal["LINEAR_REG"]: + return "LINEAR_REG" + + @property + def _vertica_predict_sql(self) -> Literal["PREDICT_LINEAR_REG"]: + return "PREDICT_LINEAR_REG" + + @property + def _model_subcategory(self) -> Literal["REGRESSOR"]: + return "REGRESSOR" + + @property + def _model_type(self) -> Literal["LinearRegression"]: + return "LinearRegression" + + # System & Special Methods. + + @check_minimum_version + @save_verticapy_logs + def __init__( + self, + name: str = None, + overwrite_model: bool = False, + tol: float = 1e-6, + max_iter: int = 100, + solver: Literal["newton", "bfgs"] = "newton", + fit_intercept: bool = True, + ) -> None: + super().__init__(name, overwrite_model) + if vertica_version()[0] < 12 and not fit_intercept: + raise VersionError( + "The parameter 'fit_intercept' can be activated for " + "Vertica versions greater or equal to 12." 
+ ) + self.parameters = { + "penalty": "none", + "tol": tol, + "max_iter": max_iter, + "solver": str(solver).lower(), + "fit_intercept": fit_intercept, + } + + +class Ridge(Regressor, LinearModel): + """ + Creates a Ridge object using the Vertica + Linear Regression algorithm. + Ridge is a regularized regression method + which uses an L2 penalty. + + Parameters + ---------- + name: str, optional + Name of the model. The model is stored in the + database. + overwrite_model: bool, optional + If set to True, training a model with the same + name as an existing model overwrites the + existing model. + tol: float, optional + Determines whether the algorithm has reached + the specified accuracy result. + C: PythonNumber, optional + The regularization parameter value. The value + must be zero or non-negative. + max_iter: int, optional + Determines the maximum number of iterations + the algorithm performs before achieving the + specified accuracy result. + solver: str, optional + The optimizer method used to train the model. + newton : Newton Method. + bfgs : Broyden Fletcher Goldfarb Shanno. + fit_intercept: bool, optional + Boolean, specifies whether the model includes + an intercept. If set to false, no intercept + is used in training the model. + Note that setting fit_intercept to false does + not work well with the BFGS optimizer. + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples <https://www.vertica.com/python/examples/>`_ + section on the website. + + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. code-block:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk + of code collisions with other libraries. This precaution is + necessary because verticapy uses commonly known function names + like "average" and "median", which can potentially lead to naming + conflicts. The use of an alias ensures that the functions from + verticapy are used as intended without interfering with functions + from other libraries. + + For this example, we will use the winequality dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_winequality() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + You can easily divide your dataset into training and testing subsets + using the :py:mod:`vDataFrame.train_test_split` method. This is a + crucial step when preparing your data for machine learning, as it + allows you to evaluate the performance of your models accurately. + + .. code-block:: python + + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + .. warning:: + + In this case, VerticaPy utilizes seeded randomization to guarantee + the reproducibility of your data split. However, please be aware + that this approach may lead to reduced performance. 
For a more + efficient data split, you can use the :py:mod:`vDataFrame.to_db` + method to save your results into ``tables`` or ``temporary tables``. + This will help enhance the overall performance of the process. + + .. ipython:: python + :suppress: + + import verticapy as vp + import verticapy.datasets as vpd + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``Ridge`` model: + + .. code-block:: + + from verticapy.machine_learning.vertica import Ridge + + Then we can create the model: + + .. code-block:: + + model = Ridge( + tol = 1e-6, + C = 0.5, + max_iter = 100, + solver = 'Newton', + ) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + .. ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import Ridge + model = Ridge( + tol = 1e-6, + C = 0.5, + max_iter = 100, + solver = 'Newton', + ) + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + + model.fit( + train, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "quality", + test, + ) + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + .. ipython:: python + :suppress: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_feature.html") + + .. code-block:: python + + result = model.features_importance() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_feature.html + + .. note:: + + For ``LinearModel``, feature importance is computed using the coefficients. + These coefficients are then normalized using the feature distribution. An + activation function is applied to get the final score. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. ipython:: python + :suppress: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_report.html + + .. important:: + + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["mse", "r2"])``. + + For ``LinearModel``, we can easily get the ANOVA table using: + + .. 
ipython:: python + :suppress: + + result = model.report(metrics = "anova") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_report_anova.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report(metrics = "anova") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_report_anova.html + + You can also use the ``LinearModel.score`` function to compute the R-squared + value: + + .. ipython:: python + + model.score() + + Prediction + ^^^^^^^^^^^ + + Prediction is straight-forward: + + .. ipython:: python + :suppress: + + result = model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + html_file = open("figures/machine_learning_vertica_linear_model_ridge_prediction.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_ridge_prediction.html + + .. note:: + + Predictions can be made automatically using the test set, in which + case you don't need to specify the predictors. Alternatively, you + can pass only the ``vDataFrame`` to the + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict` + function, but in this case, it's essential that the column names of + the ``vDataFrame`` match the predictors and response name in the + model. + + Plots + ^^^^^^ + + If the model allows, you can also generate relevant plots. For example, + regression plots can be found in the :ref:`chart_gallery.regression_plot`. + + .. code-block:: python + + model.plot() + + .. important:: + + The plotting feature is typically suitable for models with fewer than + three predictors. + + Parameter Modification + ^^^^^^^^^^^^^^^^^^^^^^^ + + In order to see the parameters: + + .. ipython:: python + + model.get_params() + + And to manually change some of the parameters: + + .. ipython:: python + + model.set_params({'tol': 0.001}) + + Model Register + ^^^^^^^^^^^^^^ + + In order to register the model for tracking and versioning: + + .. code-block:: python + + model.register("model_v1") + + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` + for more details on model tracking and versioning. + + Model Exporting + ^^^^^^^^^^^^^^^^ + + **To Memmodel** + + .. code-block:: python + + model.to_memmodel() + + .. note:: + + ``MemModel`` objects serve as in-memory representations of machine + learning models. They can be used for both in-database and in-memory + prediction tasks. These objects can be pickled in the same way that + you would pickle a ``scikit-learn`` model. + + The following methods for exporting the model use ``MemModel``, and it + is recommended to use ``MemModel`` directly. + + **To SQL** + + You can get the SQL code by: + + .. ipython:: python + + model.to_sql() + + **To Python** + + To obtain the prediction function in Python syntax, use the following code: + + .. ipython:: python + + X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]] + model.to_python()(X) + + .. hint:: + + The + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.to_python` + method is used to retrieve predictions, + probabilities, or cluster distances. 
For specific details on how to + use this method for different model types, refer to the relevant + documentation for each model. + """ # Properties. - @property - def _vertica_fit_sql(self) -> Literal["LINEAR_REG"]: - return "LINEAR_REG" + @property + def _vertica_fit_sql(self) -> Literal["LINEAR_REG"]: + return "LINEAR_REG" + + @property + def _vertica_predict_sql(self) -> Literal["PREDICT_LINEAR_REG"]: + return "PREDICT_LINEAR_REG" + + @property + def _model_subcategory(self) -> Literal["REGRESSOR"]: + return "REGRESSOR" + + @property + def _model_type(self) -> Literal["LinearRegression"]: + return "LinearRegression" + + # System & Special Methods. + + @check_minimum_version + @save_verticapy_logs + def __init__( + self, + name: str = None, + overwrite_model: bool = False, + tol: float = 1e-6, + C: PythonNumber = 1.0, + max_iter: int = 100, + solver: Literal["newton", "bfgs"] = "newton", + fit_intercept: bool = True, + ) -> None: + super().__init__(name, overwrite_model) + if vertica_version()[0] < 12 and not fit_intercept: + raise VersionError( + "The parameter 'fit_intercept' can be activated for " + "Vertica versions greater or equal to 12." + ) + self.parameters = { + "penalty": "l2", + "tol": tol, + "C": C, + "max_iter": max_iter, + "solver": str(solver).lower(), + "fit_intercept": fit_intercept, + } + + +""" +Algorithms used for classification. +""" + + +class LogisticRegression(BinaryClassifier, LinearModelClassifier): + """ + Creates a LogisticRegression object using the Vertica + Logistic Regression algorithm. + + Parameters + ---------- + name: str, optional + Name of the model. The model is stored in the + database. + overwrite_model: bool, optional + If set to True, training a model with the same + name as an existing model overwrites the + existing model. + penalty: str, optional + Determines the method of regularization. + None : No Regularization. + l1 : L1 Regularization. + l2 : L2 Regularization. + enet : Combination between L1 and L2. + tol: float, optional + Determines whether the algorithm has reached the + specified accuracy result. + C: PythonNumber, optional + The regularization parameter value. The value must + be zero or non-negative. + max_iter: int, optional + Determines the maximum number of iterations the + algorithm performs before achieving the specified + accuracy result. + solver: str, optional + The optimizer method used to train the model. + newton : Newton Method. + bfgs : Broyden Fletcher Goldfarb Shanno. + cgd : Coordinate Gradient Descent. + l1_ratio: float, optional + ENet mixture parameter that defines the provided + ratio of L1 versus L2 regularization. + fit_intercept: bool, optional + Boolean, specifies whether the model includes an + intercept. + If set to false, no intercept is used in + training the model. Note that setting fit_intercept + to false does not work well with the BFGS optimizer. + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples <https://www.vertica.com/python/examples/>`_ + section on the website. + + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. code-block:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk of code + collisions with other libraries. 
This precaution is necessary + because verticapy uses commonly known function names like "average" + and "median", which can potentially lead to naming conflicts. + The use of an alias ensures that the functions from verticapy are + used as intended without interfering with functions from other + libraries. + + For this example, we will use the winequality dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_winequality() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + You can easily divide your dataset into training and testing subsets + using the :py:mod:`vDataFrame.train_test_split` method. This is a + crucial step when preparing your data for machine learning, as it + allows you to evaluate the performance of your models accurately. + + .. code-block:: python + + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + .. warning:: + + In this case, VerticaPy utilizes seeded randomization to guarantee + the reproducibility of your data split. However, please be aware + that this approach may lead to reduced performance. For a more + efficient data split, you can use the :py:mod:`vDataFrame.to_db` + method to save your results into ``tables`` or ``temporary tables``. + This will help enhance the overall performance of the process. + + .. ipython:: python + :suppress: + + import verticapy as vp + import verticapy.datasets as vpd + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``LogisticRegression`` model: + + .. code-block:: + + from verticapy.machine_learning.vertica import LogisticRegression + + Then we can create the model: + + .. code-block:: + + model = LogisticRegression( + tol = 1e-6, + max_iter = 100, + solver = 'Newton', + fit_intercept = True, + ) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + .. ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import LogisticRegression + model = LogisticRegression( + tol = 1e-6, + max_iter = 100, + solver = 'Newton', + fit_intercept = True, + ) + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + + model.fit( + train, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "good", + test, + ) + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. 
Instead, we work + directly with lists of predictors and the response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + .. ipython:: python + :suppress: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_feature.html") + + .. code-block:: python + + result = model.features_importance() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_feature.html + + .. note:: + + For ``LinearModel``, feature importance is computed using the coefficients. + These coefficients are then normalized using the feature distribution. An + activation function is applied to get the final score. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. ipython:: python + :suppress: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_report.html + + .. important:: + + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["auc", "accuracy"])``. + + For classification models, we can easily modify the ``cutoff`` to observe + the effect on different metrics: + + .. ipython:: python + :suppress: + + result = model.report(cutoff = 0.2) + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_report_cutoff.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report(cutoff = 0.2) + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_report_cutoff.html + + + You can also use the ``LinearModel.score`` function to compute any + classification metric. The default metric is the accuracy: + + .. ipython:: python + + model.score() + + Prediction + ^^^^^^^^^^^ + + Prediction is straight-forward: + + .. ipython:: python + :suppress: + + result = model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + html_file = open("figures/machine_learning_vertica_linear_model_logr_prediction.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python - @property - def _vertica_predict_sql(self) -> Literal["PREDICT_LINEAR_REG"]: - return "PREDICT_LINEAR_REG" + model.predict( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) - @property - def _model_subcategory(self) -> Literal["REGRESSOR"]: - return "REGRESSOR" + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_prediction.html - @property - def _model_type(self) -> Literal["LinearRegression"]: - return "LinearRegression" + .. note:: - # System & Special Methods. + Predictions can be made automatically using the test set, in which + case you don't need to specify the predictors. 
Alternatively, you + can pass only the ``vDataFrame`` to the + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict` + function, but in this case, it's essential that the column names of + the ``vDataFrame`` match the predictors and response name in the + model. - @check_minimum_version - @save_verticapy_logs - def __init__( - self, - name: str = None, - overwrite_model: bool = False, - tol: float = 1e-6, - C: PythonNumber = 1.0, - max_iter: int = 100, - solver: Literal["newton", "bfgs"] = "newton", - fit_intercept: bool = True, - ) -> None: - super().__init__(name, overwrite_model) - if vertica_version()[0] < 12 and not fit_intercept: - raise VersionError( - "The parameter 'fit_intercept' can be activated for " - "Vertica versions greater or equal to 12." - ) - self.parameters = { - "penalty": "l2", - "tol": tol, - "C": C, - "max_iter": max_iter, - "solver": str(solver).lower(), - "fit_intercept": fit_intercept, - } + Probabilities + ^^^^^^^^^^^^^^ + It is also easy to get the model's probabilities: -""" -Algorithms used for classification. -""" + .. ipython:: python + :suppress: + result = model.predict_proba( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + html_file = open("figures/machine_learning_vertica_linear_model_logr_proba.html", "w") + html_file.write(result._repr_html_()) + html_file.close() -class LogisticRegression(BinaryClassifier, LinearModelClassifier): - """ - Creates a LogisticRegression object using the Vertica - Logistic Regression algorithm. + .. code-block:: python - Parameters - ---------- - name: str, optional - Name of the model. The model is stored in the - database. - overwrite_model: bool, optional - If set to True, training a model with the same - name as an existing model overwrites the - existing model. - penalty: str, optional - Determines the method of regularization. - None : No Regularization. - l1 : L1 Regularization. - l2 : L2 Regularization. - enet : Combination between L1 and L2. - tol: float, optional - Determines whether the algorithm has reached the - specified accuracy result. - C: PythonNumber, optional - The regularization parameter value. The value must - be zero or non-negative. - max_iter: int, optional - Determines the maximum number of iterations the - algorithm performs before achieving the specified - accuracy result. - solver: str, optional - The optimizer method used to train the model. - newton : Newton Method. - bfgs : Broyden Fletcher Goldfarb Shanno. - cgd : Coordinate Gradient Descent. - l1_ratio: float, optional - ENet mixture parameter that defines the provided - ratio of L1 versus L2 regularization. - fit_intercept: bool, optional - Boolean, specifies whether the model includes an - intercept. - If set to false, no intercept is used in - training the model. Note that setting fit_intercept - to false does not work well with the BFGS optimizer. + model.predict_proba( + test, + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], + "prediction", + ) + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_proba.html + + .. note:: + + Probabilities are added to the vDataFrame, and VerticaPy uses the + corresponding probability function in SQL behind the scenes. You + can use the ``pos_label`` parameter to add only the probability + of the selected category. 
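+
+        For example, to keep only the probability of the positive class,
+        you can pass ``pos_label`` (a sketch; the label value ``1``
+        assumes the response ``good`` is encoded as 0/1):
+
+        .. code-block:: python
+
+            model.predict_proba(
+                test,
+                name = "prediction",
+                pos_label = 1,
+            )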
+
+    Confusion Matrix
+    ^^^^^^^^^^^^^^^^^
+
+    You can obtain the confusion matrix of your choice by specifying
+    the desired cutoff.
+
+    .. ipython:: python
+
+        model.confusion_matrix(cutoff = 0.5)
+
+    .. note::
+
+        In classification, the ``cutoff`` is a threshold value used to
+        determine class assignment based on predicted probabilities or
+        scores from a classification model. In binary classification,
+        if the predicted probability for a specific class is greater
+        than or equal to the cutoff, the instance is assigned to the
+        positive class; otherwise, it is assigned to the negative class.
+        Adjusting the cutoff allows for trade-offs between true positives
+        and false positives, enabling the model to be optimized for
+        specific objectives or to consider the relative costs of different
+        classification errors. The choice of cutoff is critical for
+        tailoring the model's performance to meet specific needs.
+
+    Main Plots (Classification Curves)
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    Classification models allow for the creation of various plots that
+    are very helpful in understanding the model, such as the ROC Curve,
+    PRC Curve, Cutoff Curve, Gain Curve, and more.
+
+    Most of the classification curves can be found in the
+    :ref:`chart_gallery.classification_curve`.
+
+    For example, let's draw the model's ROC curve.
+
+    .. code-block:: python
+
+        model.roc_curve()
+
+    .. ipython:: python
+        :suppress:
+
+        fig = model.roc_curve()
+        fig.write_html("figures/machine_learning_vertica_linear_model_logr_roc.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_logr_roc.html
+
+    .. important::
+
+        Most of the curves have a parameter called ``nbins``, which is essential
+        for estimating metrics. The larger the ``nbins``, the more precise the
+        estimation, but it can significantly impact performance. Exercise caution
+        when increasing this parameter excessively.
+
+    .. hint::
+
+        In binary classification, various curves can be easily plotted. However,
+        in multi-class classification, it's important to select the ``pos_label``,
+        representing the class to be treated as positive when drawing the curve.
+
+    Other Plots
+    ^^^^^^^^^^^^
+
+    If the model allows, you can also generate relevant plots.
+    For example, classification plots can be found in the
+    :ref:`chart_gallery.classification_plot`.
+
+    .. code-block:: python
+
+        model.plot()
+
+    .. important::
+
+        The plotting feature is typically suitable for models with
+        fewer than three predictors.
+
+    Parameter Modification
+    ^^^^^^^^^^^^^^^^^^^^^^^
+
+    In order to see the parameters:
+
+    .. ipython:: python
+
+        model.get_params()
+
+    And to manually change some of the parameters:
+
+    .. ipython:: python
+
+        model.set_params({'tol': 0.001})
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    In order to register the model for tracking and versioning:
+
+    .. code-block:: python
+
+        model.register("model_v1")
+
+    Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html`
+    for more details on model tracking and versioning.
+
+    Model Exporting
+    ^^^^^^^^^^^^^^^^
+
+    **To Memmodel**
+
+    .. code-block:: python
+
+        model.to_memmodel()
+
+    .. note::
+
+        ``MemModel`` objects serve as in-memory representations of machine
+        learning models. They can be used for both in-database and in-memory
+        prediction tasks. These objects can be pickled in the same way that
+        you would pickle a ``scikit-learn`` model.
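+
+        For instance, a minimal sketch (the file name is illustrative):
+
+        .. code-block:: python
+
+            import pickle
+
+            # export the in-database model to an in-memory representation
+            mmodel = model.to_memmodel()
+
+            # serialize it like any other Python object
+            with open("logr_memmodel.pkl", "wb") as f:
+                pickle.dump(mmodel, f)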
+ + The following methods for exporting the model use ``MemModel``, and it + is recommended to use ``MemModel`` directly. + + **To SQL** + + You can get the SQL code by: + + .. ipython:: python + + model.to_sql() + + **To Python** + + To obtain the prediction function in Python syntax, use the following code: + + .. ipython:: python + + X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]] + model.to_python()(X) + + .. hint:: + + The + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.to_python` + method is used to retrieve predictions, + probabilities, or cluster distances. For specific details on how to + use this method for different model types, refer to the relevant + documentation for each model. """ # Properties.