From 4de0e40fd7ce400d3471cb62f0da6989805b3f1a Mon Sep 17 00:00:00 2001
From: Umar Farooq Ghumman <46414488+mail4umar@users.noreply.github.com>
Date: Sat, 21 Oct 2023 12:39:39 -0500
Subject: [PATCH] Sphinx Docstring Update for
 MachineLearning/Vertica/LinearModel (#753)

* Sphinx Docstring Update for MachineLearning/Vertica/LinearModel

Draft for LinearRegression. The rest of the ML algorithm classes will follow the same syntax.

* Update linear_model.py

Fixing the example.

@mail4umar: TODO - Verify each reference; for some of them the link needs to be added.
Please double check the code.

* updated references

* Addressed comments

---------

Co-authored-by: Badr <badr.ouali@outlook.fr>
---
 .../machine_learning/vertica/linear_model.py  | 364 ++++++++++++++++++
 1 file changed, 364 insertions(+)

diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py
index e6ab43971..1332b9ca5 100755
--- a/verticapy/machine_learning/vertica/linear_model.py
+++ b/verticapy/machine_learning/vertica/linear_model.py
@@ -494,6 +494,370 @@ class LinearRegression(Regressor, LinearModel):
         used in  training the model.  Note that setting
         fit_intercept to false does not work well with the
         BFGS optimizer.
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples <https://www.vertica.com/python/examples/>`_
+    section on the website.
+
+    Load data for machine learning
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. code-block:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries. This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median", which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended without interfering with functions from other
+        libraries.
+
+    For this example, we will use the winequality dataset.
+
+    .. code-block:: python
+
+        import verticapy.datasets as vpd
+
+        data = vpd.load_winequality()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
+
+    You can easily divide your dataset into training and testing subsets using the
+    :py:meth:`vDataFrame.train_test_split` method. This is a crucial step when preparing
+    your data for machine learning, as it allows you to evaluate the performance of
+    your models accurately.
+
+    .. code-block:: python
+
+        data = vpd.load_winequality()
+        train, test = data.train_test_split(test_size = 0.2)
+
+    .. warning::
+
+        In this case, VerticaPy utilizes seeded randomization to guarantee
+        the reproducibility of your data split. However, please be aware
+        that this approach may lead to reduced performance. For a more
+        efficient data split, you can use the :py:meth:`vDataFrame.to_db`
+        method to save your results into ``tables`` or ``temporary tables``.
+        This will help enhance the overall performance of the process.
+
+    .. ipython:: python
+        :suppress:
+
+        import verticapy as vp
+        import verticapy.datasets as vpd
+        data = vpd.load_winequality()
+        train, test = data.train_test_split(test_size = 0.2)
+
+    Model Initialization
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    First we import the ``LinearRegression`` model:
+
+    .. code-block:: python
+
+        from verticapy.machine_learning.vertica import LinearRegression
+
+    Then we can create the model:
+
+    .. code-block:: python
+
+        model = LinearRegression(
+            tol = 1e-6,
+            max_iter = 100,
+            solver = 'Newton',
+            fit_intercept = True,
+        )
+
+    .. hint::
+
+        In ``verticapy`` 1.0.x and higher, you do not need to specify the
+        model name, as the name is automatically assigned. If you need to
+        re-use the model, you can fetch the model name from the model's
+        attributes.
+
+    .. important::
+
+        The model name is crucial for the model management system and
+        versioning. It's highly recommended to provide a name if you
+        plan to reuse the model later.
+
+    .. ipython:: python
+        :suppress:
+
+        from verticapy.machine_learning.vertica import LinearRegression
+        model = LinearRegression(
+            tol = 1e-6,
+            max_iter = 100,
+            solver = 'Newton',
+            fit_intercept = True,
+        )
+
+    Model Training
+    ^^^^^^^^^^^^^^^
+
+    We can now fit the model:
+
+    .. ipython:: python
+
+        model.fit(
+            train,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "citric_acid",
+                "residual_sugar",
+                "chlorides",
+                "density"
+            ],
+            "quality",
+            test,
+        )
+
+    .. important::
+
+        To train a model, you can directly use the ``vDataFrame`` or the
+        name of the relation stored in the database. The test set is optional
+        and is only used to compute the test metrics. In ``verticapy``, we
+        don't work using ``X`` matrices and ``y`` vectors. Instead, we work
+        directly with lists of predictors and the response name.
+
+    Features Importance
+    ^^^^^^^^^^^^^^^^^^^^
+
+    We can conveniently get the feature importance:
+
+    .. ipython:: python
+        :suppress:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.features_importance()
+        fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html")
+
+    .. code-block:: python
+
+        result = model.features_importance()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html
+
+    .. note::
+
+        For ``LinearModel``, feature importance is computed using the coefficients.
+        These coefficients are then normalized using the feature distribution. An
+        activation function is applied to get the final score.
+
+    Metrics
+    ^^^^^^^^
+
+    We can get the entire report using:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.report()
+        html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        result = model.report()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html
+
+    .. important::
+
+        Most metrics are computed using a single SQL query, but some of them might
+        require multiple SQL queries. Selecting only the necessary metrics in the
+        report can help optimize performance.
+        E.g. ``model.report(metrics = ["mse", "r2"])``.
+
+    For ``LinearModel``, we can easily get the ANOVA table using:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.report(metrics = "anova")
+        html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        result = model.report(metrics = "anova")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html
+
+    You can also use the ``LinearModel.score`` method to compute the R-squared
+    value:
+
+    .. ipython:: python
+
+        model.score()
+
+    Prediction
+    ^^^^^^^^^^^
+
+    Prediction is straightforward:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.predict(
+            test,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "citric_acid",
+                "residual_sugar",
+                "chlorides",
+                "density"
+            ],
+            "prediction",
+        )
+        html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        model.predict(
+            test,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "citric_acid",
+                "residual_sugar",
+                "chlorides",
+                "density"
+            ],
+            "prediction",
+        )
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html
+
+    .. note::
+
+        Predictions can be made automatically using the test set, in which
+        case you don't need to specify the predictors. Alternatively, you
+        can pass only the ``vDataFrame`` to the
+        :py:meth:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict`
+        function, but in this case, it's essential that the column names of
+        the ``vDataFrame`` match the predictors and response name in the
+        model.
+
+    Plots
+    ^^^^^^
+
+    If the model allows, you can also generate relevant plots. For example,
+    regression plots can be found in the :ref:`chart_gallery.regression_plot`.
+
+    .. code-block:: python
+
+        model.plot()
+
+    .. important::
+
+        The plotting feature is typically suitable for models with fewer than
+        three predictors.
+
+    Parameter Modification
+    ^^^^^^^^^^^^^^^^^^^^^^^
+
+    In order to see the parameters:
+
+    .. ipython:: python
+
+        model.get_params()
+
+    And to manually change some of the parameters:
+
+    .. ipython:: python
+
+        model.set_params({'tol': 0.001})
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    In order to register the model for tracking and versioning:
+
+    .. code-block:: python
+
+        model.register("model_v1")
+
+    Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for
+    more details on model tracking and versioning.
+
+    Model Exporting
+    ^^^^^^^^^^^^^^^^
+
+    **To MemModel**
+
+    .. code-block:: python
+
+        model.to_memmodel()
+
+    .. note::
+
+        ``MemModel`` objects serve as in-memory representations of machine
+        learning models. They can be used for both in-database and in-memory
+        prediction tasks. These objects can be pickled in the same way that
+        you would pickle a ``scikit-learn`` model.
+
+    The following methods for exporting the model use ``MemModel``, and it
+    is recommended to use ``MemModel`` directly.
+
+    **To SQL**
+
+    You can get the SQL code by:
+
+    .. ipython:: python
+
+        model.to_sql()
+
+    **To Python**
+
+    To obtain the prediction function in Python syntax, use the following code:
+
+    .. ipython:: python
+
+        X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]]
+        model.to_python()(X)
+
+    .. hint::
+
+        The
+        :py:meth:`verticapy.machine_learning.vertica.linear_model.LinearModel.to_python`
+        method is used to retrieve predictions,
+        probabilities, or cluster distances. For specific details on how to
+        use this method for different model types, refer to the relevant
+        documentation for each model.
     """
 
     # Properties.