From 4548e799708d73f853125e5dec4ef20d47e9ce9e Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman Date: Sat, 21 Oct 2023 09:47:55 -0500 Subject: [PATCH 1/4] Sphinx Docstring Update for MachineLearning/Vertica/LinearModel Draft for LinearRegression. Rest of the ml algortihm classes will follow the same syntax. --- .../machine_learning/vertica/linear_model.py | 311 ++++++++++++++++++ 1 file changed, 311 insertions(+) diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index e6ab43971..bec34749b 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -494,6 +494,317 @@ class LinearRegression(Regressor, LinearModel): used in training the model. Note that setting fit_intercept to false does not work well with the BFGS optimizer. + + Examples + --------- + + + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. code-block:: python + + import verticapy as vp + + .. hint:: + + By giving an alias to ``verticapy``, we avoid code colission with other libraries. + This is verticapy uses some well-known functions names like average, median etc. + + For this example, we will use the winequality dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_winequality() + + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + + + .. note:: + + VerticaPy provides many sample datasets that can be used for training/testing purposes. Check out + :ref:`` for all the available datasets. + + We can use the ``vDataFrame.train_test_split`` method to split-up the dataset into training and testing + portions. + + .. code-block:: python + + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + .. warning:: + + In this case, ``verticapy`` will use seeded random to ensure reproducibility. However, + this process can drastically reduce the performance. In order to performa more efficient split + you can use ``vDataFrame.to_db`` to save your result in to ``tables`` or ``temporary tables``. + + .. ipython:: python + :suppress: + + import verticapy as vp + import verticapy.datasets as vpd + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + + + Model Initialziation + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``LinearRegression`` model: + + + + .. code-block:: + + from verticapy.machine_learning.vertica import LinearRegression + + Then we can create the model: + + .. code-block:: + + model = LinearRegression( + tol = 1e-6, + max_iter = 100, + solver = 'Newton', + fit_intercept = True, + ) + + .. hint:: + + In ``verticapy`` 1.0 and higher, you do not need to specify the model name as the name is automatically + assigned. If you need to re-use the model, you can fetch the model name from model attributes. + + .. important:: + + The model name is really important for the model management system and versioning. It is highly recommended + to give a name if you plan to re-use the model later. + + .. ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import LinearRegression + model = LinearRegression( + tol = 1e-6, + max_iter = 100, + solver = 'Newton', + fit_intercept = True, + ) + + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. 
ipython:: python + + model.fit( + train, + ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"], + "quality", + test, + ) + + .. important:: + + To train a model you can use directly use the ``vDataFrame`` or the name of the relation stored in + the database. The test set is optional and is only used to compute the test metrics. In ``verticapy`` + we do not work using ``X`` matrix and ``y`` vectors. We work directly with list of predictors and response name. + + Features Importance + ^^^^^^^^^^^^^^^^^^^^ + + We can conveniently get the features importance: + + + .. ipython:: python + :suppress: + + vp.set_option("plotting_lib", "plotly") + fig = model.features_importance() + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html") + + .. code-block:: python + + result = model.features_importance() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html + + .. note:: + + For ``LinearModel`` the features importance is computed using the coefficients. They are normalized + using the feature distribution. Then an activation function is applied to get the final score. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. ipython:: python + :suppress: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + result = model.report() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html + + .. important:: + + Most metrics are computed using a single SQL query but some of them might need multiple SQL queries. + By selecting only the necessary metric in the report + you may be able to optimize performance. E.g. ``model.report(metrics = ["mse", "r2"])``. + + For ``LinearModel``, we can easily get the ANOVA table using: + + .. ipython:: python + :suppress: + + result = model.report(metrics = "anova") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + result = model.report(metrics = "anova") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html + + You can also use the ``score`` function to compute the R-squared value: + + .. ipython:: python + + model.score() + + Prediction + ^^^^^^^^^^^ + + Prediction is straight-forward: + + .. ipython:: python + :suppress: + + result = model.predict( + test, + ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"], + "prediction", + ) + html_file = open("figures/machine_learning_vertica_linear_model_lr_prediction.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + + .. code-block:: python + + model.predict( + test, + ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"], + "prediction", + ) + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html + + .. note:: + + Prediction can be done automatically using the test set in which case you do not nee to + specify the predictors. It is also possible to pass only the the ``vDataFrame`` to the ``predict`` + function. 
But in this case, it is important that the column names of the ``vDataFrame`` are + matching the predictors and response name in the model. + + Plots + ^^^^^^ + + If the models allows, you can also draw the relevant plots. Example of regression plots can + be found in :ref:`chart_gallery.regression_plot`. + + .. code-block:: python + + model.plot() + + .. important:: The plot generally works for models which have less than 3 predictors. + + Parameter Modification + ^^^^^^^^^^^^^^^^^^^^^^^ + + In order to see the parameters: + + .. ipython:: python + + model.get_params() + + And to manually change some of the parameters: + + .. ipython:: python + + model.set_params({'tol': 0.001}) + + + Model Register + ^^^^^^^^^^^^^^ + + In order to register the model for tracking and versioning: + + .. code-block:: python + + model.register("model_v1") + + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for more details on model tracking + and versioning. + + Model Exporting + ^^^^^^^^^^^^^^^^ + + To Memmodel + """ """""" """""" """" + + .. code-block:: python + + model.to_memmodel() + + .. note:: + + ``Memmodel`` are in-memory representation of ML models. They can be used to do in-database and + in-memory prediction. They can pickled the same way that you pickle your ``scikit-learn`` model. + + + The following ways of exporting the model uses memmodel and it is perfered to use the ``memmodels`` directly. + + To SQL + """ """""" """""" """" + + You can get the SQL code by: + + .. ipython:: python + + model.to_sql() + + To Python + """ """""" """""" """" + + To get the prediction function in a python sytanx: + + .. ipython:: python + + X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]] + model.to_python()(X) """ # Properties. From 599bb8802b989331d14b2249db2ebe77d690c648 Mon Sep 17 00:00:00 2001 From: Badr Date: Sat, 21 Oct 2023 11:27:24 -0400 Subject: [PATCH 2/4] Update linear_model.py Fixing the example. @mail4umar: TODO - Verify each reference, for some them the link needs to be added. Please double check the code. --- .../machine_learning/vertica/linear_model.py | 162 +++++++++++------- 1 file changed, 104 insertions(+), 58 deletions(-) diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index bec34749b..8bef17fb6 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -498,6 +498,9 @@ class LinearRegression(Regressor, LinearModel): Examples --------- + The following examples provide a basic understanding of usage. For more + detailed examples, please refer to the User Guide or the "Examples" + section on the website. Load data for machine learning ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -510,8 +513,13 @@ class LinearRegression(Regressor, LinearModel): .. hint:: - By giving an alias to ``verticapy``, we avoid code colission with other libraries. - This is verticapy uses some well-known functions names like average, median etc. + By assigning an alias to ``verticapy``, we mitigate the risk of code + collisions with other libraries. This precaution is necessary + because verticapy uses commonly known function names like "average" + and "median," which can potentially lead to naming conflicts. + The use of an alias ensures that the functions from verticapy are + used as intended without interfering with functions from other + libraries. For this example, we will use the winequality dataset. 
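
Taken together, the steps documented in these examples (loading the data, splitting it, fitting the model, computing metrics, predicting, and exporting) amount to a short script. The sketch below is a minimal consolidation that reuses only the calls shown in the examples above, and it assumes a VerticaPy connection to a Vertica database is already configured:

.. code-block:: python

    # Minimal consolidated sketch of the documented workflow.
    # Assumes an already-configured VerticaPy connection; only calls
    # that appear in the examples above are used.
    import verticapy.datasets as vpd
    from verticapy.machine_learning.vertica import LinearRegression

    # Load the sample dataset and create a reproducible split.
    data = vpd.load_winequality()
    train, test = data.train_test_split(test_size=0.2)

    predictors = [
        "fixed_acidity",
        "volatile_acidity",
        "citric_acid",
        "residual_sugar",
        "chlorides",
        "density",
    ]

    # Create and train the model.
    model = LinearRegression(
        tol=1e-6,
        max_iter=100,
        solver="Newton",
        fit_intercept=True,
    )
    model.fit(train, predictors, "quality", test)

    # Metrics, prediction, and the exported in-memory prediction function.
    print(model.report(metrics=["mse", "r2"]))
    print(model.score())
    predictions = model.predict(test, predictors, "prediction")
    print(model.to_python()([[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]]))

The docstring documents each of these steps separately so that every step can render its own output; the consolidated form is mainly convenient for smoke-testing the example end to end.
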
@@ -521,29 +529,35 @@ class LinearRegression(Regressor, LinearModel): data = vpd.load_winequality() - .. raw:: html :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + .. note:: - .. note:: - - VerticaPy provides many sample datasets that can be used for training/testing purposes. Check out - :ref:`` for all the available datasets. + VerticaPy offers a wide range of sample datasets that are ideal for training + and testing purposes. You can explore the full list of available datasets in + the :ref: ``, which provides detailed information on each dataset and how to + use them effectively. These datasets are invaluable resources for honing your + data analysis and machine learning skills within the VerticaPy environment. - We can use the ``vDataFrame.train_test_split`` method to split-up the dataset into training and testing - portions. + You can easily divide your dataset into training and testing subsets using the + ``vDataFrame.train_test_split`` method. This is a crucial step when preparing + your data for machine learning, as it allows you to evaluate the performance of + your models accurately. .. code-block:: python data = vpd.load_winequality() train, test = data.train_test_split(test_size = 0.2) - .. warning:: + .. warning:: - In this case, ``verticapy`` will use seeded random to ensure reproducibility. However, - this process can drastically reduce the performance. In order to performa more efficient split - you can use ``vDataFrame.to_db`` to save your result in to ``tables`` or ``temporary tables``. + In this case, VerticaPy utilizes seeded randomization to guarantee the + reproducibility of your data split. However, please be aware that this + approach may lead to reduced performance. For a more efficient data + split, you can use the ``vDataFrame.to_db`` method to save your results + into ``tables`` or ``temporary tables``. This will help enhance the + overall performance of the process. .. ipython:: python :suppress: @@ -553,15 +567,11 @@ class LinearRegression(Regressor, LinearModel): data = vpd.load_winequality() train, test = data.train_test_split(test_size = 0.2) - - - Model Initialziation + Model Initialization ^^^^^^^^^^^^^^^^^^^^^ First we import the ``LinearRegression`` model: - - .. code-block:: from verticapy.machine_learning.vertica import LinearRegression @@ -579,13 +589,16 @@ class LinearRegression(Regressor, LinearModel): .. hint:: - In ``verticapy`` 1.0 and higher, you do not need to specify the model name as the name is automatically - assigned. If you need to re-use the model, you can fetch the model name from model attributes. + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. .. important:: - The model name is really important for the model management system and versioning. It is highly recommended - to give a name if you plan to re-use the model later. + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. .. 
ipython:: python :suppress: @@ -598,7 +611,6 @@ class LinearRegression(Regressor, LinearModel): fit_intercept = True, ) - Model Training ^^^^^^^^^^^^^^^ @@ -608,23 +620,31 @@ class LinearRegression(Regressor, LinearModel): model.fit( train, - ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"], + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], "quality", test, ) .. important:: - To train a model you can use directly use the ``vDataFrame`` or the name of the relation stored in - the database. The test set is optional and is only used to compute the test metrics. In ``verticapy`` - we do not work using ``X`` matrix and ``y`` vectors. We work directly with list of predictors and response name. + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. Features Importance ^^^^^^^^^^^^^^^^^^^^ We can conveniently get the features importance: - .. ipython:: python :suppress: @@ -641,8 +661,9 @@ class LinearRegression(Regressor, LinearModel): .. note:: - For ``LinearModel`` the features importance is computed using the coefficients. They are normalized - using the feature distribution. Then an activation function is applied to get the final score. + For ``LinearModel``, feature importance is computed using the coefficients. + These coefficients are then normalized using the feature distribution. An + activation function is applied to get the final score. Metrics ^^^^^^^^ @@ -666,9 +687,10 @@ class LinearRegression(Regressor, LinearModel): .. important:: - Most metrics are computed using a single SQL query but some of them might need multiple SQL queries. - By selecting only the necessary metric in the report - you may be able to optimize performance. E.g. ``model.report(metrics = ["mse", "r2"])``. + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["mse", "r2"])``. For ``LinearModel``, we can easily get the ANOVA table using: @@ -687,7 +709,8 @@ class LinearRegression(Regressor, LinearModel): .. raw:: html :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html - You can also use the ``score`` function to compute the R-squared value: + You can also use the ``LinearModel.score`` function to compute the R-squared + value: .. ipython:: python @@ -703,19 +726,32 @@ class LinearRegression(Regressor, LinearModel): result = model.predict( test, - ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"], + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], "prediction", ) html_file = open("figures/machine_learning_vertica_linear_model_lr_prediction.html", "w") html_file.write(result._repr_html_()) html_file.close() - .. 
code-block:: python model.predict( test, - ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"], + [ + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", + "density" + ], "prediction", ) @@ -724,22 +760,27 @@ class LinearRegression(Regressor, LinearModel): .. note:: - Prediction can be done automatically using the test set in which case you do not nee to - specify the predictors. It is also possible to pass only the the ``vDataFrame`` to the ``predict`` - function. But in this case, it is important that the column names of the ``vDataFrame`` are - matching the predictors and response name in the model. + Predictions can be made automatically using the test set, in which + case you don't need to specify the predictors. Alternatively, you + can pass only the ``vDataFrame`` to the ``LinearModel.predict`` + function, but in this case, it's essential that the column names of + the ``vDataFrame`` match the predictors and response name in the + model. Plots ^^^^^^ - If the models allows, you can also draw the relevant plots. Example of regression plots can - be found in :ref:`chart_gallery.regression_plot`. + If the model allows, you can also generate relevant plots. For example, + regression plots can be found in the :ref:`chart_gallery.regression_plot`. .. code-block:: python model.plot() - .. important:: The plot generally works for models which have less than 3 predictors. + .. important:: + + The plotting feature is typically suitable for models with fewer than + three predictors. Parameter Modification ^^^^^^^^^^^^^^^^^^^^^^^ @@ -756,7 +797,6 @@ class LinearRegression(Regressor, LinearModel): model.set_params({'tol': 0.001}) - Model Register ^^^^^^^^^^^^^^ @@ -766,14 +806,13 @@ class LinearRegression(Regressor, LinearModel): model.register("model_v1") - Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for more details on model tracking - and versioning. + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for + more details on model tracking and versioning. Model Exporting ^^^^^^^^^^^^^^^^ - To Memmodel - """ """""" """""" """" + **To Memmodel** .. code-block:: python @@ -781,14 +820,15 @@ class LinearRegression(Regressor, LinearModel): .. note:: - ``Memmodel`` are in-memory representation of ML models. They can be used to do in-database and - in-memory prediction. They can pickled the same way that you pickle your ``scikit-learn`` model. - + ``MemModel`` objects serve as in-memory representations of machine + learning models. They can be used for both in-database and in-memory + prediction tasks. These objects can be pickled in the same way that + you would pickle a ``scikit-learn`` model. - The following ways of exporting the model uses memmodel and it is perfered to use the ``memmodels`` directly. + The following methods for exporting the model use ``MemModel``, and it + is recommended to use ``MemModel`` directly. - To SQL - """ """""" """""" """" + **To SQL** You can get the SQL code by: @@ -796,15 +836,21 @@ class LinearRegression(Regressor, LinearModel): model.to_sql() - To Python - """ """""" """""" """" + **To Python** - To get the prediction function in a python sytanx: + To obtain the prediction function in Python syntax, use the following code: .. ipython:: python X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]] model.to_python()(X) + + .. hint:: + + The ``LinearModel.to_python`` method is used to retrieve predictions, + probabilities, or cluster distances. 
For specific details on how to + use this method for different model types, refer to the relevant + documentation for each model. """ # Properties. From fff89deb5400c2ca30121f74afded8788b402674 Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman Date: Sat, 21 Oct 2023 12:00:18 -0500 Subject: [PATCH 3/4] updated references --- .../machine_learning/vertica/linear_model.py | 185 +++++++++--------- 1 file changed, 94 insertions(+), 91 deletions(-) diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index 8bef17fb6..104aad226 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -494,12 +494,12 @@ class LinearRegression(Regressor, LinearModel): used in training the model. Note that setting fit_intercept to false does not work well with the BFGS optimizer. - + Examples --------- - The following examples provide a basic understanding of usage. For more - detailed examples, please refer to the User Guide or the "Examples" + The following examples provide a basic understanding of usage. For more + detailed examples, please refer to the :ref:`user_guide.machine_learning` or the "Examples" section on the website. Load data for machine learning @@ -511,14 +511,14 @@ class LinearRegression(Regressor, LinearModel): import verticapy as vp - .. hint:: + .. hint:: - By assigning an alias to ``verticapy``, we mitigate the risk of code - collisions with other libraries. This precaution is necessary - because verticapy uses commonly known function names like "average" - and "median," which can potentially lead to naming conflicts. - The use of an alias ensures that the functions from verticapy are - used as intended without interfering with functions from other + By assigning an alias to ``verticapy``, we mitigate the risk of code + collisions with other libraries. This precaution is necessary + because verticapy uses commonly known function names like "average" + and "median," which can potentially lead to naming conflicts. + The use of an alias ensures that the functions from verticapy are + used as intended without interfering with functions from other libraries. For this example, we will use the winequality dataset. @@ -527,36 +527,36 @@ class LinearRegression(Regressor, LinearModel): import verticapy.datasets as vpd - data = vpd.load_winequality() + data = vpd.load_winequality() .. raw:: html - :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html - + :file: /project/data/VerticaPy/docs/figures/datasets_loaders_load_winequality.html + .. note:: - VerticaPy offers a wide range of sample datasets that are ideal for training - and testing purposes. You can explore the full list of available datasets in - the :ref: ``, which provides detailed information on each dataset and how to - use them effectively. These datasets are invaluable resources for honing your + VerticaPy offers a wide range of sample datasets that are ideal for training + and testing purposes. You can explore the full list of available datasets in + the :ref:`api.datasets`, which provides detailed information on each dataset and how to + use them effectively. These datasets are invaluable resources for honing your data analysis and machine learning skills within the VerticaPy environment. - You can easily divide your dataset into training and testing subsets using the - ``vDataFrame.train_test_split`` method. 
This is a crucial step when preparing - your data for machine learning, as it allows you to evaluate the performance of + You can easily divide your dataset into training and testing subsets using the + :py:mod:`vDataFrame.train_test_split` method. This is a crucial step when preparing + your data for machine learning, as it allows you to evaluate the performance of your models accurately. .. code-block:: python data = vpd.load_winequality() - train, test = data.train_test_split(test_size = 0.2) + train, test = data.train_test_split(test_size = 0.2) .. warning:: - In this case, VerticaPy utilizes seeded randomization to guarantee the - reproducibility of your data split. However, please be aware that this - approach may lead to reduced performance. For a more efficient data - split, you can use the ``vDataFrame.to_db`` method to save your results - into ``tables`` or ``temporary tables``. This will help enhance the + In this case, VerticaPy utilizes seeded randomization to guarantee the + reproducibility of your data split. However, please be aware that this + approach may lead to reduced performance. For a more efficient data + split, you can use the :py:mod:`vDataFrame.to_db` method to save your results + into ``tables`` or ``temporary tables``. This will help enhance the overall performance of the process. .. ipython:: python @@ -575,29 +575,29 @@ class LinearRegression(Regressor, LinearModel): .. code-block:: from verticapy.machine_learning.vertica import LinearRegression - + Then we can create the model: .. code-block:: model = LinearRegression( tol = 1e-6, - max_iter = 100, + max_iter = 100, solver = 'Newton', fit_intercept = True, ) - .. hint:: + .. hint:: - In ``verticapy`` 1.0.x and higher, you do not need to specify the - model name, as the name is automatically assigned. If you need to - re-use the model, you can fetch the model name from the model's + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's attributes. - .. important:: + .. important:: - The model name is crucial for the model management system and - versioning. It's highly recommended to provide a name if you + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you plan to reuse the model later. .. ipython:: python @@ -606,7 +606,7 @@ class LinearRegression(Regressor, LinearModel): from verticapy.machine_learning.vertica import LinearRegression model = LinearRegression( tol = 1e-6, - max_iter = 100, + max_iter = 100, solver = 'Newton', fit_intercept = True, ) @@ -617,27 +617,27 @@ class LinearRegression(Regressor, LinearModel): We can now fit the model: .. ipython:: python - + model.fit( train, [ - "fixed_acidity", - "volatile_acidity", - "citric_acid", - "residual_sugar", - "chlorides", + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", "density" ], "quality", test, ) - + .. important:: - To train a model, you can directly use the ``vDataFrame`` or the - name of the relation stored in the database. The test set is optional - and is only used to compute the test metrics. In ``verticapy``, we - don't work using ``X`` matrices and ``y`` vectors. Instead, we work + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. 
In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work directly with lists of predictors and the response name. Features Importance @@ -650,19 +650,19 @@ class LinearRegression(Regressor, LinearModel): vp.set_option("plotting_lib", "plotly") fig = model.features_importance() - fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html") + fig.write_html("/project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_feature.html") .. code-block:: python result = model.features_importance() .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html + :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_feature.html .. note:: - For ``LinearModel``, feature importance is computed using the coefficients. - These coefficients are then normalized using the feature distribution. An + For ``LinearModel``, feature importance is computed using the coefficients. + These coefficients are then normalized using the feature distribution. An activation function is applied to get the final score. Metrics @@ -674,7 +674,7 @@ class LinearRegression(Regressor, LinearModel): :suppress: result = model.report() - html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html", "w") + html_file = open("/project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report.html", "w") html_file.write(result._repr_html_()) html_file.close() @@ -683,22 +683,22 @@ class LinearRegression(Regressor, LinearModel): result = model.report() .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html + :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report.html .. important:: - Most metrics are computed using a single SQL query, but some of them might - require multiple SQL queries. Selecting only the necessary metrics in the - report can help optimize performance. + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. E.g. ``model.report(metrics = ["mse", "r2"])``. - + For ``LinearModel``, we can easily get the ANOVA table using: .. ipython:: python :suppress: result = model.report(metrics = "anova") - html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") + html_file = open("/project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") html_file.write(result._repr_html_()) html_file.close() @@ -707,19 +707,19 @@ class LinearRegression(Regressor, LinearModel): result = model.report(metrics = "anova") .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html + :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report_anova.html - You can also use the ``LinearModel.score`` function to compute the R-squared + You can also use the ``LinearModel.score`` function to compute the R-squared value: .. ipython:: python model.score() - + Prediction ^^^^^^^^^^^ - Prediction is straight-forward: + Prediction is straight-forward: .. 
ipython:: python :suppress: @@ -727,11 +727,11 @@ class LinearRegression(Regressor, LinearModel): result = model.predict( test, [ - "fixed_acidity", - "volatile_acidity", - "citric_acid", - "residual_sugar", - "chlorides", + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", "density" ], "prediction", @@ -745,41 +745,42 @@ class LinearRegression(Regressor, LinearModel): model.predict( test, [ - "fixed_acidity", - "volatile_acidity", - "citric_acid", - "residual_sugar", - "chlorides", + "fixed_acidity", + "volatile_acidity", + "citric_acid", + "residual_sugar", + "chlorides", "density" ], "prediction", ) .. raw:: html - :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html + :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_prediction.html - .. note:: + .. note:: - Predictions can be made automatically using the test set, in which - case you don't need to specify the predictors. Alternatively, you - can pass only the ``vDataFrame`` to the ``LinearModel.predict`` - function, but in this case, it's essential that the column names of - the ``vDataFrame`` match the predictors and response name in the + Predictions can be made automatically using the test set, in which + case you don't need to specify the predictors. Alternatively, you + can pass only the ``vDataFrame`` to the + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict` + function, but in this case, it's essential that the column names of + the ``vDataFrame`` match the predictors and response name in the model. Plots ^^^^^^ - If the model allows, you can also generate relevant plots. For example, + If the model allows, you can also generate relevant plots. For example, regression plots can be found in the :ref:`chart_gallery.regression_plot`. .. code-block:: python - model.plot() + model.plot() - .. important:: + .. important:: - The plotting feature is typically suitable for models with fewer than + The plotting feature is typically suitable for models with fewer than three predictors. Parameter Modification @@ -806,7 +807,7 @@ class LinearRegression(Regressor, LinearModel): model.register("model_v1") - Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for more details on model tracking and versioning. Model Exporting @@ -820,12 +821,12 @@ class LinearRegression(Regressor, LinearModel): .. note:: - ``MemModel`` objects serve as in-memory representations of machine - learning models. They can be used for both in-database and in-memory - prediction tasks. These objects can be pickled in the same way that + ``MemModel`` objects serve as in-memory representations of machine + learning models. They can be used for both in-database and in-memory + prediction tasks. These objects can be pickled in the same way that you would pickle a ``scikit-learn`` model. - The following methods for exporting the model use ``MemModel``, and it + The following methods for exporting the model use ``MemModel``, and it is recommended to use ``MemModel`` directly. **To SQL** @@ -847,9 +848,11 @@ class LinearRegression(Regressor, LinearModel): .. hint:: - The ``LinearModel.to_python`` method is used to retrieve predictions, - probabilities, or cluster distances. 
For specific details on how to - use this method for different model types, refer to the relevant + The + :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.to_python` + method is used to retrieve predictions, + probabilities, or cluster distances. For specific details on how to + use this method for different model types, refer to the relevant documentation for each model. """ From 33c91d61679ad3b6fb89fc73dc19261919560e1a Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman Date: Sat, 21 Oct 2023 12:38:35 -0500 Subject: [PATCH 4/4] Addressed comments --- .../machine_learning/vertica/linear_model.py | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index 104aad226..1332b9ca5 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -498,8 +498,10 @@ class LinearRegression(Regressor, LinearModel): Examples --------- - The following examples provide a basic understanding of usage. For more - detailed examples, please refer to the :ref:`user_guide.machine_learning` or the "Examples" + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples `_ section on the website. Load data for machine learning @@ -516,7 +518,7 @@ class LinearRegression(Regressor, LinearModel): By assigning an alias to ``verticapy``, we mitigate the risk of code collisions with other libraries. This precaution is necessary because verticapy uses commonly known function names like "average" - and "median," which can potentially lead to naming conflicts. + and "median", which can potentially lead to naming conflicts. The use of an alias ensures that the functions from verticapy are used as intended without interfering with functions from other libraries. @@ -530,15 +532,17 @@ class LinearRegression(Regressor, LinearModel): data = vpd.load_winequality() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/datasets_loaders_load_winequality.html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html .. note:: - VerticaPy offers a wide range of sample datasets that are ideal for training - and testing purposes. You can explore the full list of available datasets in - the :ref:`api.datasets`, which provides detailed information on each dataset and how to - use them effectively. These datasets are invaluable resources for honing your - data analysis and machine learning skills within the VerticaPy environment. + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. You can easily divide your dataset into training and testing subsets using the :py:mod:`vDataFrame.train_test_split` method. This is a crucial step when preparing @@ -552,12 +556,12 @@ class LinearRegression(Regressor, LinearModel): .. warning:: - In this case, VerticaPy utilizes seeded randomization to guarantee the - reproducibility of your data split. However, please be aware that this - approach may lead to reduced performance. 
For a more efficient data - split, you can use the :py:mod:`vDataFrame.to_db` method to save your results - into ``tables`` or ``temporary tables``. This will help enhance the - overall performance of the process. + In this case, VerticaPy utilizes seeded randomization to guarantee + the reproducibility of your data split. However, please be aware + that this approach may lead to reduced performance. For a more + efficient data split, you can use the :py:mod:`vDataFrame.to_db` + method to save your results into ``tables`` or ``temporary tables``. + This will help enhance the overall performance of the process. .. ipython:: python :suppress: @@ -650,14 +654,14 @@ class LinearRegression(Regressor, LinearModel): vp.set_option("plotting_lib", "plotly") fig = model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_feature.html") + fig.write_html("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html") .. code-block:: python result = model.features_importance() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_feature.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_feature.html .. note:: @@ -674,7 +678,7 @@ class LinearRegression(Regressor, LinearModel): :suppress: result = model.report() - html_file = open("/project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html", "w") html_file.write(result._repr_html_()) html_file.close() @@ -683,7 +687,7 @@ class LinearRegression(Regressor, LinearModel): result = model.report() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report.html .. important:: @@ -698,7 +702,7 @@ class LinearRegression(Regressor, LinearModel): :suppress: result = model.report(metrics = "anova") - html_file = open("/project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html", "w") html_file.write(result._repr_html_()) html_file.close() @@ -707,7 +711,7 @@ class LinearRegression(Regressor, LinearModel): result = model.report(metrics = "anova") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_report_anova.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html You can also use the ``LinearModel.score`` function to compute the R-squared value: @@ -756,7 +760,7 @@ class LinearRegression(Regressor, LinearModel): ) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/machine_learning_vertica_linear_model_lr_prediction.html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_prediction.html .. note::