diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py
index bec34749b..8bef17fb6 100755
--- a/verticapy/machine_learning/vertica/linear_model.py
+++ b/verticapy/machine_learning/vertica/linear_model.py
@@ -498,6 +498,9 @@ class LinearRegression(Regressor, LinearModel):
    Examples
    ---------

+    The following examples provide a basic understanding of usage. For more
+    detailed examples, please refer to the User Guide or the "Examples"
+    section on the website.

    Load data for machine learning
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -510,8 +513,13 @@ class LinearRegression(Regressor, LinearModel):

    .. hint::

-        By giving an alias to ``verticapy``, we avoid code colission with other libraries.
-        This is verticapy uses some well-known functions names like average, median etc.
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries. This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median," which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended without interfering with functions from other
+        libraries.

    For this example, we will use the winequality dataset.

@@ -521,29 +529,35 @@ class LinearRegression(Regressor, LinearModel):

        data = vpd.load_winequality()

-
    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html

+    .. note::

-    .. note::
-
-        VerticaPy provides many sample datasets that can be used for training/testing purposes. Check out
-        :ref:`` for all the available datasets.
+        VerticaPy offers a wide range of sample datasets that are ideal for training
+        and testing purposes. You can explore the full list of available datasets in
+        the :ref:``, which provides detailed information on each dataset and how to
+        use them effectively. These datasets are invaluable resources for honing your
+        data analysis and machine learning skills within the VerticaPy environment.

-    We can use the ``vDataFrame.train_test_split`` method to split-up the dataset into training and testing
-    portions.
+    You can easily divide your dataset into training and testing subsets using the
+    ``vDataFrame.train_test_split`` method. This is a crucial step when preparing
+    your data for machine learning, as it allows you to evaluate the performance of
+    your models accurately.

    .. code-block:: python

        data = vpd.load_winequality()
        train, test = data.train_test_split(test_size = 0.2)

-    .. warning::
+    .. warning::

-        In this case, ``verticapy`` will use seeded random to ensure reproducibility. However,
-        this process can drastically reduce the performance. In order to performa more efficient split
-        you can use ``vDataFrame.to_db`` to save your result in to ``tables`` or ``temporary tables``.
+        In this case, VerticaPy utilizes seeded randomization to guarantee the
+        reproducibility of your data split. However, please be aware that this
+        approach may lead to reduced performance. For a more efficient data
+        split, you can use the ``vDataFrame.to_db`` method to save your results
+        into ``tables`` or ``temporary tables``. This will help enhance the
+        overall performance of the process.
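+
+    As an illustration, a minimal sketch of that approach might look like the
+    following (the target table names are placeholders, and the exact
+    ``to_db`` options you need may differ):
+
+    .. code-block:: python
+
+        # Hypothetical example: materialize each split as its own table so that
+        # later queries read from a stored relation instead of re-evaluating
+        # the seeded-random split expression.
+        train.to_db("public.winequality_train", relation_type = "table")
+        test.to_db("public.winequality_test", relation_type = "table")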

    .. ipython:: python
        :suppress:

@@ -553,15 +567,11 @@ class LinearRegression(Regressor, LinearModel):
        data = vpd.load_winequality()
        train, test = data.train_test_split(test_size = 0.2)

-
-
-    Model Initialziation
+    Model Initialization
    ^^^^^^^^^^^^^^^^^^^^^

    First we import the ``LinearRegression`` model:
-
-

    .. code-block::

        from verticapy.machine_learning.vertica import LinearRegression
@@ -579,13 +589,16 @@ class LinearRegression(Regressor, LinearModel):

    .. hint::

-        In ``verticapy`` 1.0 and higher, you do not need to specify the model name as the name is automatically
-        assigned. If you need to re-use the model, you can fetch the model name from model attributes.
+        In ``verticapy`` 1.0.x and higher, you do not need to specify the
+        model name, as the name is automatically assigned. If you need to
+        re-use the model, you can fetch the model name from the model's
+        attributes.

    .. important::

-        The model name is really important for the model management system and versioning. It is highly recommended
-        to give a name if you plan to re-use the model later.
+        The model name is crucial for the model management system and
+        versioning. It's highly recommended to provide a name if you
+        plan to reuse the model later.

    .. ipython:: python
        :suppress:
@@ -598,7 +611,6 @@ class LinearRegression(Regressor, LinearModel):
        model = LinearRegression(
            fit_intercept = True,
        )

-
    Model Training
    ^^^^^^^^^^^^^^^
@@ -608,23 +620,31 @@ class LinearRegression(Regressor, LinearModel):
        model.fit(
            train,
-            ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"],
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "citric_acid",
+                "residual_sugar",
+                "chlorides",
+                "density"
+            ],
            "quality",
            test,
        )

    .. important::

-        To train a model you can use directly use the ``vDataFrame`` or the name of the relation stored in
-        the database. The test set is optional and is only used to compute the test metrics. In ``verticapy``
-        we do not work using ``X`` matrix and ``y`` vectors. We work directly with list of predictors and response name.
+        To train a model, you can directly use the ``vDataFrame`` or the
+        name of the relation stored in the database. The test set is optional
+        and is only used to compute the test metrics. In ``verticapy``, we
+        don't work using ``X`` matrices and ``y`` vectors. Instead, we work
+        directly with lists of predictors and the response name.

    Features Importance
    ^^^^^^^^^^^^^^^^^^^^

    We can conveniently get the features importance:

-
    .. ipython:: python
        :suppress:
@@ -641,8 +661,9 @@ class LinearRegression(Regressor, LinearModel):

    .. note::

-        For ``LinearModel`` the features importance is computed using the coefficients. They are normalized
-        using the feature distribution. Then an activation function is applied to get the final score.
+        For ``LinearModel``, feature importance is computed using the coefficients.
+        These coefficients are then normalized using the feature distribution. An
+        activation function is applied to get the final score.

    Metrics
    ^^^^^^^^
@@ -666,9 +687,10 @@ class LinearRegression(Regressor, LinearModel):

    .. important::

-        Most metrics are computed using a single SQL query but some of them might need multiple SQL queries.
-        By selecting only the necessary metric in the report
-        you may be able to optimize performance. E.g. ``model.report(metrics = ["mse", "r2"])``.
+        Most metrics are computed using a single SQL query, but some of them might
+        require multiple SQL queries. Selecting only the necessary metrics in the
+        report can help optimize performance.
+        E.g. ``model.report(metrics = ["mse", "r2"])``.
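+
+    For instance, a report restricted to just those two metrics could be
+    requested like this (output not shown here):
+
+    .. code-block:: python
+
+        # Compute only the mean squared error and the R-squared value.
+        model.report(metrics = ["mse", "r2"])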

    For ``LinearModel``, we can easily get the ANOVA table using:
@@ -687,7 +709,8 @@ class LinearRegression(Regressor, LinearModel):
    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_linear_model_lr_report_anova.html

-    You can also use the ``score`` function to compute the R-squared value:
+    You can also use the ``LinearModel.score`` function to compute the R-squared
+    value:

    .. ipython:: python
@@ -703,19 +726,32 @@ class LinearRegression(Regressor, LinearModel):
        result = model.predict(
            test,
-            ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"],
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "citric_acid",
+                "residual_sugar",
+                "chlorides",
+                "density"
+            ],
            "prediction",
        )
        html_file = open("figures/machine_learning_vertica_linear_model_lr_prediction.html", "w")
        html_file.write(result._repr_html_())
        html_file.close()

-
    .. code-block:: python

        model.predict(
            test,
-            ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "density"],
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "citric_acid",
+                "residual_sugar",
+                "chlorides",
+                "density"
+            ],
            "prediction",
        )
@@ -724,22 +760,27 @@ class LinearRegression(Regressor, LinearModel):

    .. note::

-        Prediction can be done automatically using the test set in which case you do not nee to
-        specify the predictors. It is also possible to pass only the the ``vDataFrame`` to the ``predict``
-        function. But in this case, it is important that the column names of the ``vDataFrame`` are
-        matching the predictors and response name in the model.
+        Predictions can be made automatically using the test set, in which
+        case you don't need to specify the predictors. Alternatively, you
+        can pass only the ``vDataFrame`` to the ``LinearModel.predict``
+        function, but in this case, it's essential that the column names of
+        the ``vDataFrame`` match the predictors and response name in the
+        model.

    Plots
    ^^^^^^

-    If the models allows, you can also draw the relevant plots. Example of regression plots can
-    be found in :ref:`chart_gallery.regression_plot`.
+    If the model allows, you can also generate relevant plots. For example,
+    regression plots can be found in the :ref:`chart_gallery.regression_plot`.

    .. code-block:: python

        model.plot()

-    .. important:: The plot generally works for models which have less than 3 predictors.
+    .. important::
+
+        The plotting feature is typically suitable for models with fewer than
+        three predictors.

    Parameter Modification
    ^^^^^^^^^^^^^^^^^^^^^^^
@@ -756,7 +797,6 @@ class LinearRegression(Regressor, LinearModel):
        model.set_params({'tol': 0.001})

-
    Model Register
    ^^^^^^^^^^^^^^
@@ -766,14 +806,13 @@ class LinearRegression(Regressor, LinearModel):

        model.register("model_v1")

-    Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for more details on model tracking
-    and versioning.
+    Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` for
+    more details on model tracking and versioning.

    Model Exporting
    ^^^^^^^^^^^^^^^^

-    To Memmodel
-    """ """""" """""" """"
+    **To Memmodel**

    .. code-block:: python
@@ -781,14 +820,15 @@ class LinearRegression(Regressor, LinearModel):

        model.to_memmodel()

    .. note::

-        ``Memmodel`` are in-memory representation of ML models. They can be used to do in-database and
-        in-memory prediction. They can pickled the same way that you pickle your ``scikit-learn`` model.
-
+        ``MemModel`` objects serve as in-memory representations of machine
+        learning models. They can be used for both in-database and in-memory
+        prediction tasks. These objects can be pickled in the same way that
+        you would pickle a ``scikit-learn`` model.
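+
+    As a rough sketch of that idea (the file name below is just a placeholder),
+    the exported object could be pickled and reloaded like any other Python
+    object:
+
+    .. code-block:: python
+
+        import pickle
+
+        # Export the in-memory representation of the trained model.
+        mm = model.to_memmodel()
+
+        # Serialize it to disk, much as you would a scikit-learn estimator.
+        with open("lr_memmodel.pkl", "wb") as f:
+            pickle.dump(mm, f)
+
+        # Later, restore it for in-memory use.
+        with open("lr_memmodel.pkl", "rb") as f:
+            mm = pickle.load(f)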

-
-    The following ways of exporting the model uses memmodel and it is perfered to use the ``memmodels`` directly.
+    The following methods for exporting the model use ``MemModel``, and it
+    is recommended to use ``MemModel`` directly.

-    To SQL
-    """ """""" """""" """"
+    **To SQL**

    You can get the SQL code by:

    .. ipython:: python

        model.to_sql()

@@ -796,15 +836,21 @@ class LinearRegression(Regressor, LinearModel):

-    To Python
-    """ """""" """""" """"
+    **To Python**

-    To get the prediction function in a python sytanx:
+    To obtain the prediction function in Python syntax, use the following code:

    .. ipython:: python

        X = [[4.2, 0.17, 0.36, 1.8, 0.029, 0.9899]]
        model.to_python()(X)
+
+    .. hint::
+
+        The ``LinearModel.to_python`` method is used to retrieve predictions,
+        probabilities, or cluster distances. For specific details on how to
+        use this method for different model types, refer to the relevant
+        documentation for each model.
    """

    # Properties.