diff --git a/verticapy/machine_learning/metrics/classification.py b/verticapy/machine_learning/metrics/classification.py
index c189e7743..cb74dec12 100755
--- a/verticapy/machine_learning/metrics/classification.py
+++ b/verticapy/machine_learning/metrics/classification.py
@@ -226,7 +226,10 @@ def _compute_final_score(
         and average != "binary"
     ):
         raise ValueError(
-            "Parameter 'pos_label' can only be used when parameter 'average' is set to 'binary' or undefined."
+            "The 'pos_label' parameter can only be used when the 'average' "
+            "parameter is set to 'binary' or left undefined. This error can "
+            "also occur when you are using a binary classifier; in that case, "
+            "the 'average' parameter can only be set to 'binary' or left undefined."
        )
     if not (isinstance(pos_label, NoneType)) and not (isinstance(labels, NoneType)):
         labels = None
diff --git a/verticapy/machine_learning/vertica/cluster.py b/verticapy/machine_learning/vertica/cluster.py
index 8871f7fea..d871e6a2c 100755
--- a/verticapy/machine_learning/vertica/cluster.py
+++ b/verticapy/machine_learning/vertica/cluster.py
@@ -1793,21 +1793,32 @@ class DBSCAN(VerticaModel):
     compute the distances and neighbors, and uses Python to
     compute the cluster propagation (non-scalable phase).
 
-    \u26A0 Warning : This algorithm uses a CROSS JOIN
-                 during computation and is therefore
-                 computationally expensive at O(n * n),
-                 where n is the total number of elements.
-                 This algorithm indexes elements of the
-                 table in order to be optimal (the CROSS
-                 JOIN will happen only with IDs which are
-                 integers).
-                 Since DBSCAN uses the p-distance, it
-                 is highly sensitive to unnormalized data.
-                 However, DBSCAN is robust to outliers and
-                 can find non-linear clusters. It is a very
-                 powerful algorithm for outlier detection
-                 and clustering. A table is created at
-                 the end of the learning phase.
+    .. warning::
+
+        This algorithm uses a CROSS JOIN during computation
+        and is therefore computationally expensive at
+        O(n * n), where n is the total number of elements.
+        It indexes the elements of the table in order to be
+        optimal (the CROSS JOIN only happens on IDs that
+        are integers).
+        Since DBSCAN uses the p-distance, it is highly
+        sensitive to unnormalized data.
+        However, DBSCAN is robust to outliers and can find
+        non-linear clusters. It is a very powerful algorithm
+        for outlier detection and clustering. A table is
+        created at the end of the learning phase.
+
+    .. important::
+
+        This algorithm is not Vertica Native and relies solely
+        on SQL for attribute computation. While this model does
+        not take advantage of the benefits provided by a model
+        management system, including versioning and tracking,
+        the SQL code it generates can still be used to create a
+        pipeline.
 
     Parameters
     ----------
@@ -1828,6 +1839,159 @@ class DBSCAN(VerticaModel):
     p: int, optional
         The p of the p-distance (distance metric used
         during the model computation).
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+
+    Load data for machine learning
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. ipython:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries.
+        This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median", which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended, without interfering with functions from other
+        libraries.
+
+    For this example, we will create a small dataset.
+
+    .. ipython:: python
+
+        data = vp.vDataFrame({"col":[1.2, 1.1, 1.3, 1.5, 2, 2.2, 1.09, 0.9, 100, 102]})
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
+
+    Model Initialization
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    First we import the ``DBSCAN`` model:
+
+    .. code-block::
+
+        from verticapy.machine_learning.vertica import DBSCAN
+
+    .. ipython:: python
+        :suppress:
+
+        from verticapy.machine_learning.vertica import DBSCAN
+
+    Then we can create the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model = DBSCAN(
+            eps = 0.5,
+            min_samples = 2,
+            p = 2,
+        )
+
+    .. important::
+
+        As this model is not native, it solely relies on SQL statements to
+        compute various attributes, storing them within the object. No data
+        is saved in the database.
+
+    Model Training
+    ^^^^^^^^^^^^^^^
+
+    We can now fit the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model.fit(data, X = ["col"])
+
+    .. important::
+
+        To train a model, you can directly use the ``vDataFrame`` or the
+        name of the relation stored in the database.
+
+    .. hint::
+
+        For clustering and anomaly detection, the use of predictors is
+        optional. In such cases, all available predictors are considered,
+        which can include solely numerical variables or a combination of
+        numerical and categorical variables, depending on the model's
+        capabilities.
+
+    Prediction
+    ^^^^^^^^^^^
+
+    Predicting or ranking the dataset is straightforward:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.predict()
+        html_file = open("figures/machine_learning_vertica_dbscan_prediction.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        model.predict()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_dbscan_prediction.html
+
+    As shown above, a new column has been created, containing
+    the clusters.
+
+    .. hint::
+
+        The name of the new column is optional. If not provided,
+        it is randomly assigned.
+
+    Parameter Modification
+    ^^^^^^^^^^^^^^^^^^^^^^^
+
+    In order to see the parameters:
+
+    .. ipython:: python
+
+        model.get_params()
+
+    And to manually change some of the parameters:
+
+    .. ipython:: python
+
+        model.set_params({'min_samples': 5})
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    As this model is not native, it does not support model management and
+    versioning. However, it is possible to use the SQL code it generates
+    for deployment.
     """
 
     # Properties.
diff --git a/verticapy/machine_learning/vertica/decomposition.py b/verticapy/machine_learning/vertica/decomposition.py
index 0593fb661..89af42336 100755
--- a/verticapy/machine_learning/vertica/decomposition.py
+++ b/verticapy/machine_learning/vertica/decomposition.py
@@ -542,7 +542,7 @@ class PCA(Decomposition):
         name of the relation stored in the database.
 
     Scores
-    ^^^^^^
+    ^^^^^^^
 
     The decomposition score on the dataset for each
     transformed column can be calculated by:
@@ -561,7 +561,7 @@ class PCA(Decomposition):
         model.explained_variance_
 
     Principal Components
-    ^^^^^^^^^^^^^^^^^^^^^^
+    ^^^^^^^^^^^^^^^^^^^^^
 
     To get the transformed dataset in the form of principal
     components:
@@ -784,6 +784,15 @@ class MCA(PCA):
     transformed to a TCDT (transformed complete disjunctive
     table) before applying the PCA.
 
+    .. important::
+
+        This algorithm is not Vertica Native and relies solely
+        on SQL for attribute computation. While this model does
+        not take advantage of the benefits provided by a model
+        management system, including versioning and tracking,
+        the SQL code it generates can still be used to create a
+        pipeline.
+
     Parameters
     ----------
     name: str, optional
@@ -793,6 +802,261 @@ class MCA(PCA):
         If set to True, training a model with the same
         name as an existing model overwrites the
         existing model.
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+
+    Load data for machine learning
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. ipython:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries. This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median", which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended, without interfering with functions from other
+        libraries.
+
+    For this example, we will use the Titanic dataset.
+
+    .. code-block:: python
+
+        import verticapy.datasets as vpd
+
+        data = vpd.load_titanic()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_titanic.html
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
+
+    .. ipython:: python
+        :suppress:
+
+        import verticapy.datasets as vpd
+        data = vpd.load_titanic()
+
+    Model Initialization
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    First we import the ``MCA`` model:
+
+    .. ipython:: python
+
+        from verticapy.machine_learning.vertica import MCA
+
+    Then we can create the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model = MCA()
+
+    You can choose the number of components with the ``n_component``
+    parameter. If it is not provided, all components are kept.
+
+    .. important::
+
+        As this model is not native, it solely relies on SQL statements to
+        compute various attributes, storing them within the object. No data
+        is saved in the database.
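+
+    For instance, assuming ``n_component`` accepts an integer as
+    described above (this snippet is illustrative only), a model
+    keeping just the first two components could be created with:
+
+    .. code-block:: python
+
+        # hypothetical: restrict the decomposition to two components
+        model_2c = MCA(n_component = 2)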
+
+    Model Training
+    ^^^^^^^^^^^^^^^
+
+    Before fitting the model, we need to compute the transformed
+    complete disjunctive table (TCDT):
+
+    .. ipython:: python
+        :okwarning:
+
+        tcdt = data[["survived", "pclass", "sex"]].cdt()
+
+    We can now fit the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model.fit(tcdt)
+
+    .. important::
+
+        To train a model, you can directly use the ``vDataFrame`` or the
+        name of the relation stored in the database.
+
+    Scores
+    ^^^^^^
+
+    The decomposition score on the dataset for each
+    transformed column can be calculated by:
+
+    .. ipython:: python
+
+        model.score()
+
+    For more details on the function, check out
+    :py:mod:`verticapy.machine_learning.MCA.score`.
+
+    You can also fetch the explained variance by:
+
+    .. ipython:: python
+
+        model.explained_variance_
+
+    Principal Components
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    To get the transformed dataset in the form of principal
+    components:
+
+    .. ipython:: python
+
+        model.transform(tcdt)
+
+    Please refer to :py:mod:`verticapy.machine_learning.MCA.transform`
+    for more details on transforming a ``vDataFrame``.
+
+    Similarly, you can perform the inverse transform to get
+    the original features using:
+
+    .. code-block:: python
+
+        model.inverse_transform(data_transformed)
+
+    The variable ``data_transformed`` includes the MCA components.
+
+    Plots - MCA
+    ^^^^^^^^^^^^
+
+    You can plot the first two dimensions conveniently using:
+
+    .. code-block:: python
+
+        model.plot()
+
+    .. ipython:: python
+        :suppress:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.plot()
+        fig.write_html("figures/machine_learning_vertica_mca_plot.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_mca_plot.html
+
+    Plots - Scree
+    ^^^^^^^^^^^^^^
+
+    You can also plot the Scree plot:
+
+    .. code-block:: python
+
+        model.plot_scree()
+
+    .. ipython:: python
+        :suppress:
+
+        vp.set_option("plotting_lib", "highcharts")
+        fig = model.plot_scree()
+        html_text = fig.htmlcontent.replace("container", "ml_vertica_MCA_scree")
+        with open("figures/machine_learning_vertica_mca_plot_scree.html", "w") as file:
+            file.write(html_text)
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_mca_plot_scree.html
+
+    Plots - Decomposition Circle
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    You can also plot the Decomposition Circles:
+
+    .. code-block:: python
+
+        model.plot_circle()
+
+    .. ipython:: python
+        :suppress:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.plot_circle()
+        fig.write_html("figures/machine_learning_vertica_mca_plot_circle.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_mca_plot_circle.html
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    As this model is not native, it does not support model management and
+    versioning. However, it is possible to use the SQL code it generates
+    for deployment.
+
+    Model Exporting
+    ^^^^^^^^^^^^^^^^
+
+    **To MemModel**
+
+    .. code-block:: python
+
+        model.to_memmodel()
+
+    .. note::
+
+        ``MemModel`` objects serve as in-memory representations of machine
+        learning models. They can be used for both in-database and in-memory
+        prediction tasks. These objects can be pickled in the same way that
+        you would pickle a ``scikit-learn`` model.
+
+    The preceding methods for exporting the model use ``MemModel``, and it
+    is recommended to use ``MemModel`` directly.
+
+    **SQL**
+
+    To get the SQL query, use the code below:
+
+    .. ipython:: python
+
+        model.to_sql()
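+
+    The generated expressions can be deployed directly in a query. A
+    minimal sketch, assuming (for illustration only) that ``to_sql``
+    returns one SQL expression per component and that a relation named
+    ``my_tcdt`` with matching column names exists in the database:
+
+    .. code-block:: python
+
+        # hypothetical: assemble a SELECT statement that applies the
+        # transformation in-database
+        exprs = model.to_sql()
+        query = f"SELECT {', '.join(exprs)} FROM my_tcdt"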
+
+    **To Python**
+
+    To obtain the prediction function in Python syntax, use the
+    following code:
+
+    .. ipython:: python
+
+        X = [[0, 1, 0, 1, 1, 0, 1]]
+        model.to_python()(X)
+
+    .. hint::
+
+        The
+        :py:mod:`verticapy.machine_learning.vertica.decomposition.MCA.to_python`
+        method is used to retrieve the Principal Component values.
+        For specific details on how to use this method for different
+        model types, refer to the relevant documentation for each model.
     """
 
     # Properties.
diff --git a/verticapy/machine_learning/vertica/neighbors.py b/verticapy/machine_learning/vertica/neighbors.py
index f64c490a1..81eaddaf0 100755
--- a/verticapy/machine_learning/vertica/neighbors.py
+++ b/verticapy/machine_learning/vertica/neighbors.py
@@ -282,13 +282,26 @@ class KNeighborsClassifier(MulticlassClassifier):
     pure SQL to compute all the distances and final
     score.
 
-    \u26A0 Warning : This algorithm uses a CROSS JOIN
-                 during computation and is therefore
-                 computationally expensive at O(n * n),
-                 where n is the total number of elements.
-                 Since KNeighborsClassifier uses the p-
-                 distance, it is highly sensitive to
-                 unnormalized data.
+    .. warning::
+
+        This algorithm uses a CROSS JOIN during computation
+        and is therefore computationally expensive at
+        O(n * n), where n is the total number of elements.
+        Since KNeighborsClassifier uses the p-distance, it
+        is highly sensitive to unnormalized data.
+
+    .. important::
+
+        This algorithm is not Vertica Native and relies solely
+        on SQL for attribute computation. While this model does
+        not take advantage of the benefits provided by a model
+        management system, including versioning and tracking,
+        the SQL code it generates can still be used to create a
+        pipeline.
 
     Parameters
     ----------
@@ -298,6 +311,427 @@ class KNeighborsClassifier(MulticlassClassifier):
     p: int, optional
         The p of the p-distances (distance metric used
         during the model computation).
+
+    Examples
+    ---------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+
+    Load data for machine learning
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. ipython:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries. This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median", which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended, without interfering with functions from other
+        libraries.
+
+    For this example, we will use the winequality dataset.
+
+    .. code-block:: python
+
+        import verticapy.datasets as vpd
+
+        data = vpd.load_winequality()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
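+
+    Because ``KNeighborsClassifier`` relies on p-distances, it is highly
+    sensitive to unnormalized data (see the warning above). As an optional
+    preprocessing sketch (assuming a z-score scaling method is exposed by
+    the ``vDataFrame``; it is named ``scale`` in recent VerticaPy versions):
+
+    .. code-block:: python
+
+        # hypothetical: z-score the predictors before fitting
+        data.scale(
+            columns = ["fixed_acidity", "volatile_acidity", "density", "pH"],
+            method = "zscore",
+        )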
+ + You can easily divide your dataset into training and testing subsets + using the :py:mod:`vDataFrame.train_test_split` method. This is a + crucial step when preparing your data for machine learning, as it + allows you to evaluate the performance of your models accurately. + + .. code-block:: python + + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + .. warning:: + + In this case, VerticaPy utilizes seeded randomization to guarantee + the reproducibility of your data split. However, please be aware + that this approach may lead to reduced performance. For a more + efficient data split, you can use the :py:mod:`vDataFrame.to_db` + method to save your results into ``tables`` or ``temporary tables``. + This will help enhance the overall performance of the process. + + .. ipython:: python + :suppress: + + import verticapy as vp + import verticapy.datasets as vpd + data = vpd.load_winequality() + train, test = data.train_test_split(test_size = 0.2) + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``KNeighborsClassifier`` model: + + .. code-block:: + + from verticapy.machine_learning.vertica import KNeighborsClassifier + + .. ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import KNeighborsClassifier + + Then we can create the model: + + .. ipython:: python + :okwarning: + + model = KNeighborsClassifier( + n_neighbors = 10, + p = 2, + ) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + :okwarning: + + model.fit( + train, + [ + "fixed_acidity", + "volatile_acidity", + "density", + "pH", + ], + "quality", + test, + ) + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + .. important:: + + As this model is not native, it solely relies on SQL statements to + compute various attributes, storing them within the object. No data + is saved in the database. + + Metrics + ^^^^^^^^ + + We can get the entire report using: + + .. ipython:: python + :suppress: + + result = model.report() + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.report() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report.html + + .. important:: + + Most metrics are computed using a single SQL query, but some of them might + require multiple SQL queries. Selecting only the necessary metrics in the + report can help optimize performance. + E.g. ``model.report(metrics = ["auc", "accuracy"])``. + + For classification models, we can easily modify the ``cutoff`` to observe + the effect on different metrics: + + .. ipython:: python + :suppress: + + result = model.report(cutoff = 0.2) + html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report_cutoff.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. 
code-block:: python

+        model.report(cutoff = 0.2)
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report_cutoff.html
+
+    You can also use the ``KNeighborsClassifier.score`` function to compute any
+    classification metric. The default metric is the accuracy:
+
+    .. ipython:: python
+
+        model.score(metric = "f1", average = "macro")
+
+    .. note::
+
+        For multi-class scoring, ``verticapy`` allows the
+        flexibility to use three averaging techniques:
+        micro, macro and weighted. Please refer to
+        `this link `_
+        for more details on how they are calculated.
+
+    Prediction
+    ^^^^^^^^^^^
+
+    Prediction is straightforward:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.predict(
+            test,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "density",
+                "pH",
+            ],
+            "prediction",
+        )
+        html_file = open("figures/machine_learning_vertica_neighbors_knc_prediction.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        model.predict(
+            test,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "density",
+                "pH",
+            ],
+            "prediction",
+        )
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_prediction.html
+
+    .. note::
+
+        Predictions can be made automatically using the test set, in which
+        case you don't need to specify the predictors. Alternatively, you
+        can pass only the ``vDataFrame`` to the
+        :py:mod:`verticapy.machine_learning.vertica.linear_model.LinearModel.predict`
+        function, but in this case, it's essential that the column names of
+        the ``vDataFrame`` match the predictors and response name in the
+        model.
+
+    Probabilities
+    ^^^^^^^^^^^^^^
+
+    It is also easy to get the model's probabilities:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.predict_proba(
+            test,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "density",
+                "pH",
+            ],
+            "prediction",
+        )
+        html_file = open("figures/machine_learning_vertica_neighbors_knc_proba.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        model.predict_proba(
+            test,
+            [
+                "fixed_acidity",
+                "volatile_acidity",
+                "density",
+                "pH",
+            ],
+            "prediction",
+        )
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_proba.html
+
+    .. note::
+
+        Probabilities are added to the vDataFrame, and VerticaPy uses the
+        corresponding probability function in SQL behind the scenes. You
+        can use the ``pos_label`` parameter to add only the probability
+        of the selected category.
+
+    Confusion Matrix
+    ^^^^^^^^^^^^^^^^^
+
+    You can obtain the confusion matrix of your choice by specifying
+    the desired cutoff.
+
+    .. ipython:: python
+
+        model.confusion_matrix(cutoff = 0.5)
+
+    .. hint::
+
+        In the context of multi-class classification, you typically work
+        with an overall confusion matrix that summarizes the classification
+        efficiency across all classes. However, you have the flexibility to
+        specify a ``pos_label`` and adjust the cutoff threshold. In this case,
+        a binary confusion matrix is computed, where the chosen class is treated
+        as the positive class, allowing you to evaluate its efficiency as if it
+        were a binary classification problem.
+
+    .. ipython:: python
+
+        model.confusion_matrix(pos_label = "5", cutoff = 0.6)
+
+    .. note::
+
+        In classification, the ``cutoff`` is a threshold value used to
+        determine class assignment based on predicted probabilities or
+        scores from a classification model. In binary classification,
+        if the predicted probability for a specific class is greater
+        than or equal to the cutoff, the instance is assigned to the
+        positive class; otherwise, it is assigned to the negative class.
+        Adjusting the cutoff allows for trade-offs between true positives
+        and false positives, enabling the model to be optimized for
+        specific objectives or to consider the relative costs of different
+        classification errors. The choice of cutoff is critical for
+        tailoring the model's performance to meet specific needs.
+
+    Main Plots (Classification Curves)
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    Classification models allow for the creation of various plots that
+    are very helpful in understanding the model, such as the ROC Curve,
+    PRC Curve, Cutoff Curve, Gain Curve, and more.
+
+    Most of the classification curves can be found in the
+    :ref:`chart_gallery.classification_curve`.
+
+    For example, let's draw the model's ROC curve.
+
+    .. code-block:: python
+
+        model.roc_curve(pos_label = "5")
+
+    .. ipython:: python
+        :suppress:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.roc_curve(pos_label = "5")
+        fig.write_html("figures/machine_learning_vertica_neighbors_knc_roc.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_roc.html
+
+    .. important::
+
+        Most of the curves have a parameter called ``nbins``, which is essential
+        for estimating metrics. The larger the ``nbins``, the more precise the
+        estimation, but it can significantly impact performance. Exercise caution
+        when increasing this parameter excessively.
+
+    .. hint::
+
+        In binary classification, various curves can be easily plotted. However,
+        in multi-class classification, it's important to select the ``pos_label``,
+        representing the class to be treated as positive when drawing the curve.
+
+    Other Plots
+    ^^^^^^^^^^^^
+
+    **Contour plot** is another useful plot that can be produced
+    for models with two predictors.
+
+    .. code-block:: python
+
+        model.contour(pos_label = "5")
+
+    .. important::
+
+        Machine learning models with two predictors can usually
+        benefit from their own contour plot. This visual representation
+        aids in exploring predictions and gaining a deeper understanding
+        of how these models perform in different scenarios.
+        Please refer to :ref:`chart_gallery.contour` for more examples.
+
+    Parameter Modification
+    ^^^^^^^^^^^^^^^^^^^^^^^
+
+    In order to see the parameters:
+
+    .. ipython:: python
+
+        model.get_params()
+
+    And to manually change some of the parameters:
+
+    .. ipython:: python
+
+        model.set_params({'n_neighbors': 8})
+
+    Model Register
+    ^^^^^^^^^^^^^^
+
+    As this model is not native, it does not support model management and
+    versioning. However, it is possible to use the SQL code it generates
+    for deployment.
+
+    Model Exporting
+    ^^^^^^^^^^^^^^^^
+
+    It is not possible to export this type of model, but you can still
+    examine the SQL code generated by using the
+    :py:mod:`verticapy.machine_learning.vertica.neighbors.KNeighborsClassifier.deploySQL`
+    method.
     """
 
     # Properties.
@@ -468,11 +902,14 @@ def _get_final_relation(
         """
         Returns the final relation used to do
        the predictions.
""" + filter_sql = "" + if not (isinstance(pos_label, NoneType)): + filter_sql = f"WHERE predict_neighbors = '{pos_label}'" return f""" (SELECT * - FROM {self.deploySQL()} - WHERE predict_neighbors = '{pos_label}') + FROM {self.deploySQL()} + {filter_sql}) final_centroids_relation""" def _get_y_proba( @@ -494,7 +931,20 @@ def _get_y_score( Returns the input that represents the model's scoring. """ cutoff = self._check_cutoff(cutoff=cutoff) - return f"(CASE WHEN proba_predict > {cutoff} THEN 1 ELSE 0 END)" + if isinstance(pos_label, NoneType) and not (self._is_binary_classifier()): + return "predict_neighbors" + elif self._is_binary_classifier(): + return f""" + (CASE + WHEN proba_predict > {cutoff} THEN '{self.classes_[1]}' + ELSE '{self.classes_[0]}' + END)""" + else: + return f""" + (CASE + WHEN proba_predict < {cutoff} AND predict_neighbors = '{pos_label}' THEN NULL + ELSE predict_neighbors + END)""" def _compute_accuracy(self) -> float: """ @@ -520,7 +970,7 @@ def _confusion_matrix( ORDER BY proba_predict DESC) AS pos FROM {self.deploySQL()}) neighbors_table WHERE pos = 1""" return mt.confusion_matrix( - self.y, "predict_neighbors", input_relation, classes=self.classes_ + self.y, "predict_neighbors", input_relation, labels=self.classes_ ) else: cutoff = self._check_cutoff(cutoff=cutoff) @@ -1144,15 +1594,26 @@ class LocalOutlierFactor(VerticaModel): Sander. This object is using pure SQL to compute all the distances and final score. - \u26A0 Warning : This algorithm uses a CROSS JOIN - during computation and is therefore - computationally expensive at O(n * n), - where n is the total number of elements. - Since LocalOutlierFactor uses the p- - distance, it is highly sensitive to - unnormalized data. - A table is created at the end of - the learning phase. + .. warning : + + This algorithm uses a CROSS JOIN + during computation and is therefore + computationally expensive at O(n * n), + where n is the total number of elements. + Since LocalOutlierFactor uses the p- + distance, it is highly sensitive to + unnormalized data. + A table is created at the end of + the learning phase. + + .. important:: + + This algorithm is not Vertica Native and relies solely + on SQL for attribute computation. While this model does + not take advantage of the benefits provided by a model + management system, including versioning and tracking, + the SQL code it generates can still be used to create a + pipeline. Parameters ---------- @@ -1170,6 +1631,192 @@ class LocalOutlierFactor(VerticaModel): p: int, optional The p of the p-distances (distance metric used during the model computation). + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples `_ + section on the website. + + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. ipython:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk of code + collisions with other libraries. This precaution is necessary + because verticapy uses commonly known function names like "average" + and "median", which can potentially lead to naming conflicts. + The use of an alias ensures that the functions from verticapy are + used as intended without interfering with functions from other + libraries. + + For this example, we will use the winequality dataset. + + .. 
+
+        import verticapy.datasets as vpd
+
+        data = vpd.load_winequality()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
+
+    .. ipython:: python
+        :suppress:
+
+        import verticapy.datasets as vpd
+        data = vpd.load_winequality()
+
+    Model Initialization
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    First we import the ``LocalOutlierFactor`` model:
+
+    .. code-block::
+
+        from verticapy.machine_learning.vertica import LocalOutlierFactor
+
+    .. ipython:: python
+        :suppress:
+
+        from verticapy.machine_learning.vertica import LocalOutlierFactor
+
+    Then we can create the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model = LocalOutlierFactor(
+            n_neighbors = 10,
+            p = 2,
+        )
+
+    .. important::
+
+        As this model is not native, it solely relies on SQL statements to
+        compute various attributes, storing them within the object. No data
+        is saved in the database.
+
+    Model Training
+    ^^^^^^^^^^^^^^^
+
+    We can now fit the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model.fit(data, X = ["density", "sulphates"])
+
+    .. important::
+
+        To train a model, you can directly use the ``vDataFrame`` or the
+        name of the relation stored in the database. The test set is optional
+        and is only used to compute the test metrics. In ``verticapy``, we
+        don't work using ``X`` matrices and ``y`` vectors. Instead, we work
+        directly with lists of predictors and the response name.
+
+    .. hint::
+
+        For clustering and anomaly detection, the use of predictors is
+        optional. In such cases, all available predictors are considered,
+        which can include solely numerical variables or a combination of
+        numerical and categorical variables, depending on the model's
+        capabilities.
+
+    Prediction
+    ^^^^^^^^^^^
+
+    To find out the LOF score for each datapoint:
+
+    .. ipython:: python
+        :suppress:
+
+        result = model.predict()
+        html_file = open("figures/machine_learning_vertica_lof_prediction.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        model.predict()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_lof_prediction.html
+
+    As shown above, a new column has been created, containing
+    the LOF score.
+
+    Plots - Outliers
+    ^^^^^^^^^^^^^^^^^
+
+    Plots highlighting the outliers can be easily drawn using:
+
+    .. code-block:: python
+
+        model.plot()
+
+    .. ipython:: python
+        :suppress:
+
+        vp.set_option("plotting_lib", "plotly")
+        fig = model.plot(width = 600)
+        fig.write_html("figures/machine_learning_vertica_lof_plot.html")
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_lof_plot.html
+
+    .. important::
+
+        Please refer to :ref:`chart_gallery.lof` for more examples.
+
+    Parameter Modification
+    ^^^^^^^^^^^^^^^^^^^^^^^
+
+    In order to see the parameters:
+
+    .. ipython:: python
+
+        model.get_params()
+
+    And to manually change some of the parameters:
+
+    .. 
ipython:: python + + model.set_params({'p': 3}) + + Model Register + ^^^^^^^^^^^^^^ + + As this model is not native, it does not support model management and + versioning. However, it is possible to use the SQL code it generates + for deployment. """ # Properties.