diff --git a/verticapy/machine_learning/metrics/classification.py b/verticapy/machine_learning/metrics/classification.py
index c189e7743..cb74dec12 100755
--- a/verticapy/machine_learning/metrics/classification.py
+++ b/verticapy/machine_learning/metrics/classification.py
@@ -226,7 +226,10 @@ def _compute_final_score(
and average != "binary"
):
raise ValueError(
- "Parameter 'pos_label' can only be used when parameter 'average' is set to 'binary' or undefined."
+ "The 'pos_label' parameter can only be used when the 'average' "
+ "parameter is set to 'binary' or left undefined. This error can "
+ "also occur when you are using a binary classifier; in that case, "
+ "the 'average' parameter can only be set to 'binary' or left undefined."
)
if not (isinstance(pos_label, NoneType)) and not (isinstance(labels, NoneType)):
labels = None
diff --git a/verticapy/machine_learning/vertica/cluster.py b/verticapy/machine_learning/vertica/cluster.py
index 8871f7fea..d871e6a2c 100755
--- a/verticapy/machine_learning/vertica/cluster.py
+++ b/verticapy/machine_learning/vertica/cluster.py
@@ -1793,21 +1793,32 @@ class DBSCAN(VerticaModel):
compute the distances and neighbors, and uses Python to
compute the cluster propagation (non-scalable phase).
- \u26A0 Warning : This algorithm uses a CROSS JOIN
- during computation and is therefore
- computationally expensive at O(n * n),
- where n is the total number of elements.
- This algorithm indexes elements of the
- table in order to be optimal (the CROSS
- JOIN will happen only with IDs which are
- integers).
- Since DBSCAN uses the p-distance, it
- is highly sensitive to unnormalized data.
- However, DBSCAN is robust to outliers and
- can find non-linear clusters. It is a very
- powerful algorithm for outlier detection
- and clustering. A table is created at
- the end of the learning phase.
+    .. warning::
+
+        This algorithm uses a CROSS JOIN during computation and is
+        therefore computationally expensive at O(n * n), where n is
+        the total number of elements. This algorithm indexes elements
+        of the table in order to be optimal (the CROSS JOIN will
+        happen only with IDs which are integers).
+        Since DBSCAN uses the p-distance, it is highly sensitive to
+        unnormalized data. However, DBSCAN is robust to outliers and
+        can find non-linear clusters. It is a very powerful algorithm
+        for outlier detection and clustering. A table is created at
+        the end of the learning phase.
+
+ .. important::
+
+ This algorithm is not Vertica Native and relies solely
+ on SQL for attribute computation. While this model does
+ not take advantage of the benefits provided by a model
+ management system, including versioning and tracking,
+ the SQL code it generates can still be used to create a
+ pipeline.
Parameters
----------
@@ -1828,6 +1839,159 @@ class DBSCAN(VerticaModel):
p: int, optional
The p of the p-distance (distance metric used
during the model computation).
+
+ Examples
+ ---------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will create a small dataset.
+
+ .. ipython:: python
+
+ data = vp.vDataFrame({"col":[1.2, 1.1, 1.3, 1.5, 2, 2.2, 1.09, 0.9, 100, 102]})
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
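+
+    For instance, loading a sample dataset takes a single call; a
+    minimal sketch using ``load_iris``:
+
+    .. code-block:: python
+
+        import verticapy.datasets as vpd
+
+        # Load the iris sample dataset as a vDataFrame.
+        iris = vpd.load_iris()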
+
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``DBSCAN`` model:
+
+    .. code-block:: python
+
+ from verticapy.machine_learning.vertica import DBSCAN
+
+ .. ipython:: python
+ :suppress:
+
+ from verticapy.machine_learning.vertica import DBSCAN
+
+ Then we can create the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model = DBSCAN(
+ eps = 0.5,
+ min_samples = 2,
+ p = 2,
+ )
+
+ .. important::
+
+ As this model is not native, it solely relies on SQL statements to
+ compute various attributes, storing them within the object. No data
+ is saved in the database.
+
+ Model Training
+ ^^^^^^^^^^^^^^^
+
+ We can now fit the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.fit(data, X = ["col"])
+
+ .. important::
+
+ To train a model, you can directly use the ``vDataFrame`` or the
+ name of the relation stored in the database.
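+
+    For instance, a minimal sketch, assuming the same data was saved
+    to a table named ``public.sample_data`` (hypothetical name):
+
+    .. code-block:: python
+
+        # Train directly from a relation stored in the database.
+        model.fit("public.sample_data", X = ["col"])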
+
+ .. hint::
+
+ For clustering and anomaly detection, the use of predictors is
+ optional. In such cases, all available predictors are considered,
+ which can include solely numerical variables or a combination of
+ numerical and categorical variables, depending on the model's
+ capabilities.
+
+ .. important::
+
+ As this model is not native, it solely relies on SQL statements to
+ compute various attributes, storing them within the object. No data
+ is saved in the database.
+
+ Prediction
+ ^^^^^^^^^^^
+
+    Predicting or ranking the dataset is straightforward:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.predict()
+ html_file = open("figures/machine_learning_vertica_dbscan_prediction.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.predict()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_dbscan_prediction.html
+
+ As shown above, a new column has been created, containing
+ the clusters.
+
+ .. hint::
+ The name of the new column is optional. If not provided,
+ it is randomly assigned.
+
+ Parameter Modification
+ ^^^^^^^^^^^^^^^^^^^^^^^
+
+ In order to see the parameters:
+
+ .. ipython:: python
+
+ model.get_params()
+
+ And to manually change some of the parameters:
+
+ .. ipython:: python
+
+ model.set_params({'min_samples': 5})
+
+ Model Register
+ ^^^^^^^^^^^^^^
+
+ As this model is not native, it does not support model management and
+ versioning. However, it is possible to use the SQL code it generates
+ for deployment.
"""
# Properties.
diff --git a/verticapy/machine_learning/vertica/decomposition.py b/verticapy/machine_learning/vertica/decomposition.py
index 0593fb661..89af42336 100755
--- a/verticapy/machine_learning/vertica/decomposition.py
+++ b/verticapy/machine_learning/vertica/decomposition.py
@@ -542,7 +542,7 @@ class PCA(Decomposition):
name of the relation stored in the database.
Scores
- ^^^^^^
+ ^^^^^^^
The decomposition score on the dataset for each
transformed column can be calculated by:
@@ -561,7 +561,7 @@ class PCA(Decomposition):
model.explained_variance_
Principal Components
- ^^^^^^^^^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^^^^^^^
To get the transformed dataset in the form of principal
components:
@@ -784,6 +784,15 @@ class MCA(PCA):
transformed to a TCDT (transformed complete disjunctive
table) before applying the PCA.
+ .. important::
+
+ This algorithm is not Vertica Native and relies solely
+ on SQL for attribute computation. While this model does
+ not take advantage of the benefits provided by a model
+ management system, including versioning and tracking,
+ the SQL code it generates can still be used to create a
+ pipeline.
+
Parameters
----------
name: str, optional
@@ -793,6 +802,261 @@ class MCA(PCA):
If set to True, training a model with the same
name as an existing model overwrites the
existing model.
+
+ Examples
+ ---------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will use the Titanic dataset.
+
+ .. code-block:: python
+
+ import verticapy.datasets as vpd
+
+ data = vpd.load_titanic()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_titanic.html
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+
+ .. ipython:: python
+ :suppress:
+
+ import verticapy.datasets as vpd
+ data = vpd.load_titanic()
+
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``MCA`` model:
+
+ .. ipython:: python
+
+ from verticapy.machine_learning.vertica import MCA
+
+ Then we can create the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model = MCA()
+
+    You can select the number of components with the ``n_component``
+    parameter. If it is not provided, all components are considered.
+
+ .. important::
+
+ As this model is not native, it solely relies on SQL statements to
+ compute various attributes, storing them within the object. No data
+ is saved in the database.
+
+ Model Training
+ ^^^^^^^^^^^^^^^
+
+    Before fitting the model, we need to compute the transformed
+    complete disjunctive table (TCDT):
+
+ .. ipython:: python
+ :okwarning:
+
+ tcdt = data[["survived", "pclass", "sex"]].cdt()
+
+ We can now fit the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.fit(tcdt)
+
+ .. important::
+
+ To train a model, you can directly use the ``vDataFrame`` or the
+ name of the relation stored in the database.
+
+ Scores
+ ^^^^^^
+
+ The decomposition score on the dataset for each
+ transformed column can be calculated by:
+
+ .. ipython:: python
+
+ model.score()
+
+    For more details on the function, check out
+    :py:mod:`verticapy.machine_learning.MCA.score`.
+
+ You can also fetch the explained variance by:
+
+ .. ipython:: python
+
+ model.explained_variance_
+
+ Principal Components
+ ^^^^^^^^^^^^^^^^^^^^^^
+
+ To get the transformed dataset in the form of principal
+ components:
+
+ .. ipython:: python
+
+ model.transform(tcdt)
+
+ Please refer to :py:mod:`verticapy.machine_learning.MCA.transform`
+ for more details on transforming a ``vDataFrame``.
+
+    Similarly, you can perform the inverse transform to get
+    the original features using:
+
+ .. code-block:: python
+
+ model.inverse_transform(data_transformed)
+
+ The variable ``data_transformed`` includes the MCA components.
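+
+    As a short sketch, reusing the transformation computed above:
+
+    .. code-block:: python
+
+        # Project onto the components, then recover the original
+        # TCDT columns.
+        data_transformed = model.transform(tcdt)
+        model.inverse_transform(data_transformed)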
+
+ Plots - MCA
+ ^^^^^^^^^^^^
+
+ You can plot the first two dimensions conveniently using:
+
+ .. code-block:: python
+
+ model.plot()
+
+ .. ipython:: python
+ :suppress:
+
+ vp.set_option("plotting_lib", "plotly")
+ fig = model.plot()
+ fig.write_html("figures/machine_learning_vertica_mca_plot.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_mca_plot.html
+
+ Plots - Scree
+ ^^^^^^^^^^^^^^
+
+ You can also plot the Scree plot:
+
+ .. code-block:: python
+
+ model.plot_scree()
+
+ .. ipython:: python
+ :suppress:
+
+ vp.set_option("plotting_lib", "highcharts")
+ fig = model.plot_scree()
+ html_text = fig.htmlcontent.replace("container", "ml_vertica_MCA_scree")
+ with open("figures/machine_learning_vertica_mca_plot_scree.html", "w") as file:
+ file.write(html_text)
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_mca_plot_scree.html
+
+ Plots - Decomposition Circle
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ You can also plot the Decomposition Circles:
+
+ .. code-block:: python
+
+ model.plot_circle()
+
+ .. ipython:: python
+ :suppress:
+
+ vp.set_option("plotting_lib", "plotly")
+ fig = model.plot_circle()
+ fig.write_html("figures/machine_learning_vertica_mca_plot_circle.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_mca_plot_circle.html
+
+ Model Register
+ ^^^^^^^^^^^^^^
+
+ As this model is not native, it does not support model management and
+ versioning. However, it is possible to use the SQL code it generates
+ for deployment.
+
+ Model Exporting
+ ^^^^^^^^^^^^^^^^
+
+    **To MemModel**
+
+ .. code-block:: python
+
+ model.to_memmodel()
+
+ .. note::
+
+ ``MemModel`` objects serve as in-memory representations of machine
+ learning models. They can be used for both in-database and in-memory
+ prediction tasks. These objects can be pickled in the same way that
+ you would pickle a ``scikit-learn`` model.
+
+ The preceding methods for exporting the model use ``MemModel``, and it
+ is recommended to use ``MemModel`` directly.
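+
+    For example, a minimal pickling sketch:
+
+    .. code-block:: python
+
+        import pickle
+
+        # A MemModel is a plain in-memory object, so it can be
+        # serialized like a scikit-learn model.
+        mm = model.to_memmodel()
+        payload = pickle.dumps(mm)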
+
+ **SQL**
+
+    To get the SQL query, use the following:
+
+ .. ipython:: python
+
+ model.to_sql()
+
+ **To Python**
+
+ To obtain the prediction function in Python syntax, use the following code:
+
+ .. ipython:: python
+
+ X = [[0, 1, 0, 1, 1, 0, 1]]
+ model.to_python()(X)
+
+ .. hint::
+
+ The
+ :py:mod:`verticapy.machine_learning.vertica.decomposition.MCA.to_python`
+ method is used to retrieve the Principal Component values.
+ For specific details on how to
+ use this method for different model types, refer to the relevant
+ documentation for each model.
"""
# Properties.
diff --git a/verticapy/machine_learning/vertica/neighbors.py b/verticapy/machine_learning/vertica/neighbors.py
index f64c490a1..81eaddaf0 100755
--- a/verticapy/machine_learning/vertica/neighbors.py
+++ b/verticapy/machine_learning/vertica/neighbors.py
@@ -282,13 +282,26 @@ class KNeighborsClassifier(MulticlassClassifier):
pure SQL to compute all the distances and final
score.
- \u26A0 Warning : This algorithm uses a CROSS JOIN
- during computation and is therefore
- computationally expensive at O(n * n),
- where n is the total number of elements.
- Since KNeighborsClassifier uses the p-
- distance, it is highly sensitive to
- unnormalized data.
+    .. warning::
+
+        This algorithm uses a CROSS JOIN during computation and is
+        therefore computationally expensive at O(n * n), where n is
+        the total number of elements. Since KNeighborsClassifier
+        uses the p-distance, it is highly sensitive to unnormalized
+        data.
+
+ .. important::
+
+ This algorithm is not Vertica Native and relies solely
+ on SQL for attribute computation. While this model does
+ not take advantage of the benefits provided by a model
+ management system, including versioning and tracking,
+ the SQL code it generates can still be used to create a
+ pipeline.
Parameters
----------
@@ -298,6 +311,427 @@ class KNeighborsClassifier(MulticlassClassifier):
p: int, optional
The p of the p-distances (distance metric used
during the model computation).
+
+ Examples
+ ---------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will use the winequality dataset.
+
+ .. code-block:: python
+
+ import verticapy.datasets as vpd
+
+ data = vpd.load_winequality()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+
+ You can easily divide your dataset into training and testing subsets
+ using the :py:mod:`vDataFrame.train_test_split` method. This is a
+ crucial step when preparing your data for machine learning, as it
+ allows you to evaluate the performance of your models accurately.
+
+ .. code-block:: python
+
+ data = vpd.load_winequality()
+ train, test = data.train_test_split(test_size = 0.2)
+
+ .. warning::
+
+ In this case, VerticaPy utilizes seeded randomization to guarantee
+ the reproducibility of your data split. However, please be aware
+ that this approach may lead to reduced performance. For a more
+ efficient data split, you can use the :py:mod:`vDataFrame.to_db`
+ method to save your results into ``tables`` or ``temporary tables``.
+ This will help enhance the overall performance of the process.
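+
+    A minimal sketch, assuming the hypothetical table names below:
+
+    .. code-block:: python
+
+        # Persist the split once so later queries do not re-evaluate
+        # the seeded random split.
+        train.to_db("train_data", relation_type = "table")
+        test.to_db("test_data", relation_type = "table")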
+
+ .. ipython:: python
+ :suppress:
+
+ import verticapy as vp
+ import verticapy.datasets as vpd
+ data = vpd.load_winequality()
+ train, test = data.train_test_split(test_size = 0.2)
+
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``KNeighborsClassifier`` model:
+
+    .. code-block:: python
+
+ from verticapy.machine_learning.vertica import KNeighborsClassifier
+
+ .. ipython:: python
+ :suppress:
+
+ from verticapy.machine_learning.vertica import KNeighborsClassifier
+
+ Then we can create the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model = KNeighborsClassifier(
+ n_neighbors = 10,
+ p = 2,
+ )
+
+ .. hint::
+
+ In ``verticapy`` 1.0.x and higher, you do not need to specify the
+ model name, as the name is automatically assigned. If you need to
+ re-use the model, you can fetch the model name from the model's
+ attributes.
+
+ Model Training
+ ^^^^^^^^^^^^^^^
+
+ We can now fit the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.fit(
+ train,
+ [
+ "fixed_acidity",
+ "volatile_acidity",
+ "density",
+ "pH",
+ ],
+ "quality",
+ test,
+ )
+
+ .. important::
+
+ To train a model, you can directly use the ``vDataFrame`` or the
+ name of the relation stored in the database. The test set is optional
+ and is only used to compute the test metrics. In ``verticapy``, we
+ don't work using ``X`` matrices and ``y`` vectors. Instead, we work
+ directly with lists of predictors and the response name.
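+
+    As an illustration, an equivalent call, assuming the training set
+    was saved to a table named ``public.wine_train`` (hypothetical):
+
+    .. code-block:: python
+
+        # Train from a relation name instead of a vDataFrame.
+        model.fit(
+            "public.wine_train",
+            ["fixed_acidity", "volatile_acidity", "density", "pH"],
+            "quality",
+        )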
+
+ .. important::
+
+ As this model is not native, it solely relies on SQL statements to
+ compute various attributes, storing them within the object. No data
+ is saved in the database.
+
+ Metrics
+ ^^^^^^^^
+
+ We can get the entire report using:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.report()
+ html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.report()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report.html
+
+ .. important::
+
+ Most metrics are computed using a single SQL query, but some of them might
+ require multiple SQL queries. Selecting only the necessary metrics in the
+ report can help optimize performance.
+ E.g. ``model.report(metrics = ["auc", "accuracy"])``.
+
+ For classification models, we can easily modify the ``cutoff`` to observe
+ the effect on different metrics:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.report(cutoff = 0.2)
+ html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report_cutoff.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.report(cutoff = 0.2)
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_report_cutoff.html
+
+
+ You can also use the ``KNeighborsClassifier.score`` function to compute any
+ classification metric. The default metric is the accuracy:
+
+ .. ipython:: python
+
+ model.score(metric = "f1", average = "macro")
+
+ .. note::
+
+ For multi-class scoring, ``verticapy`` allows the
+ flexibility to use three averaging techniques:
+ micro, macro and weighted. Please refer to
+ `this link `_
+ for more details on how they are calculated.
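+
+    For instance, the same metric under the other two averaging
+    strategies:
+
+    .. code-block:: python
+
+        model.score(metric = "f1", average = "micro")
+        model.score(metric = "f1", average = "weighted")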
+
+ Prediction
+ ^^^^^^^^^^^
+
+    Prediction is straightforward:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.predict(
+ test,
+ [
+ "fixed_acidity",
+ "volatile_acidity",
+ "density",
+ "pH",
+ ],
+ "prediction",
+ )
+ html_file = open("figures/machine_learning_vertica_neighbors_knc_prediction.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.predict(
+ test,
+ [
+ "fixed_acidity",
+ "volatile_acidity",
+ "density",
+ "pH",
+ ],
+ "prediction",
+ )
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_prediction.html
+
+ .. note::
+
+ Predictions can be made automatically using the test set, in which
+ case you don't need to specify the predictors. Alternatively, you
+ can pass only the ``vDataFrame`` to the
+    :py:mod:`verticapy.machine_learning.vertica.neighbors.KNeighborsClassifier.predict`
+ function, but in this case, it's essential that the column names of
+ the ``vDataFrame`` match the predictors and response name in the
+ model.
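+
+    For example, a minimal sketch that relies on ``test`` having the
+    model's predictor columns:
+
+    .. code-block:: python
+
+        # X is omitted: the predictors are taken from the model.
+        model.predict(test, name = "prediction")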
+
+ Probabilities
+ ^^^^^^^^^^^^^^
+
+ It is also easy to get the model's probabilities:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.predict_proba(
+ test,
+ [
+ "fixed_acidity",
+ "volatile_acidity",
+ "density",
+ "pH",
+ ],
+ "prediction",
+ )
+ html_file = open("figures/machine_learning_vertica_neighbors_knc_proba.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.predict_proba(
+ test,
+ [
+ "fixed_acidity",
+ "volatile_acidity",
+ "density",
+ "pH",
+ ],
+ "prediction",
+ )
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_proba.html
+
+ .. note::
+
+ Probabilities are added to the vDataFrame, and VerticaPy uses the
+ corresponding probability function in SQL behind the scenes. You
+ can use the ``pos_label`` parameter to add only the probability
+ of the selected category.
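+
+    For example, a minimal sketch keeping a single class:
+
+    .. code-block:: python
+
+        # Only add the probability of class '5'.
+        model.predict_proba(test, pos_label = "5")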
+
+ Confusion Matrix
+ ^^^^^^^^^^^^^^^^^
+
+ You can obtain the confusion matrix of your choice by specifying
+ the desired cutoff.
+
+ .. ipython:: python
+
+ model.confusion_matrix(cutoff = 0.5)
+
+ .. hint::
+
+ In the context of multi-class classification, you typically work
+ with an overall confusion matrix that summarizes the classification
+ efficiency across all classes. However, you have the flexibility to
+ specify a ``pos_label`` and adjust the cutoff threshold. In this case,
+ a binary confusion matrix is computed, where the chosen class is treated
+ as the positive class, allowing you to evaluate its efficiency as if it
+ were a binary classification problem.
+
+ .. ipython:: python
+
+ model.confusion_matrix(pos_label = "5", cutoff = 0.6)
+
+ .. note::
+
+ In classification, the ``cutoff`` is a threshold value used to
+ determine class assignment based on predicted probabilities or
+ scores from a classification model. In binary classification,
+ if the predicted probability for a specific class is greater
+ than or equal to the cutoff, the instance is assigned to the
+ positive class; otherwise, it is assigned to the negative class.
+ Adjusting the cutoff allows for trade-offs between true positives
+ and false positives, enabling the model to be optimized for
+ specific objectives or to consider the relative costs of different
+ classification errors. The choice of cutoff is critical for
+ tailoring the model's performance to meet specific needs.
+
+
+ Main Plots (Classification Curves)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ Classification models allow for the creation of various plots that
+ are very helpful in understanding the model, such as the ROC Curve,
+ PRC Curve, Cutoff Curve, Gain Curve, and more.
+
+ Most of the classification curves can be found in the
+ :ref:`chart_gallery.classification_curve`.
+
+ For example, let's draw the model's ROC curve.
+
+ .. code-block:: python
+
+ model.roc_curve(pos_label = "5")
+
+ .. ipython:: python
+ :suppress:
+
+ vp.set_option("plotting_lib", "plotly")
+ fig = model.roc_curve(pos_label = "5")
+ fig.write_html("figures/machine_learning_vertica_neighbors_knc_roc.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_neighbors_knc_roc.html
+
+ .. important::
+
+ Most of the curves have a parameter called ``nbins``, which is essential
+ for estimating metrics. The larger the ``nbins``, the more precise the
+ estimation, but it can significantly impact performance. Exercise caution
+ when increasing this parameter excessively.
+
+ .. hint::
+
+       In binary classification, various curves can be easily plotted.
+       However, in multi-class classification, you must select the
+       ``pos_label``, which represents the class to be treated as
+       positive when drawing the curve.
+
+ Other Plots
+ ^^^^^^^^^^^^
+
+ **Contour plot** is another useful plot that can be produced
+ for models with two predictors.
+
+ .. code-block:: python
+
+ model.contour(pos_label = "5")
+
+ .. important::
+
+ Machine learning models with two predictors can usually
+ benefit from their own contour plot. This visual representation
+ aids in exploring predictions and gaining a deeper understanding
+ of how these models perform in different scenarios.
+ Please refer to :ref:`chart_gallery.contour` for more examples.
+
+ Parameter Modification
+ ^^^^^^^^^^^^^^^^^^^^^^^
+
+ In order to see the parameters:
+
+ .. ipython:: python
+
+ model.get_params()
+
+ And to manually change some of the parameters:
+
+ .. ipython:: python
+
+ model.set_params({'n_neighbors': 8})
+
+ Model Register
+ ^^^^^^^^^^^^^^
+
+ As this model is not native, it does not support model management and
+ versioning. However, it is possible to use the SQL code it generates
+ for deployment.
+
+ Model Exporting
+ ^^^^^^^^^^^^^^^^
+
+ It is not possible to export this type of model, but you can still
+ examine the SQL code generated by using the
+ :py:mod:`verticapy.machine_learning.vertica.neighbors.KNeighborsClassifier.deploySQL`
+ method.
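+
+    For example:
+
+    .. code-block:: python
+
+        # SQL code used to deploy the model's predictions.
+        model.deploySQL()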
"""
# Properties.
@@ -468,11 +902,14 @@ def _get_final_relation(
"""
Returns the final relation used to do the predictions.
"""
+        filter_sql = ""
+        if not (isinstance(pos_label, NoneType)):
+            # Keep only the rows predicted as the requested positive class.
+            filter_sql = f"WHERE predict_neighbors = '{pos_label}'"
return f"""
(SELECT
*
- FROM {self.deploySQL()}
- WHERE predict_neighbors = '{pos_label}')
+ FROM {self.deploySQL()}
+ {filter_sql})
final_centroids_relation"""
def _get_y_proba(
@@ -494,7 +931,20 @@ def _get_y_score(
Returns the input that represents the model's scoring.
"""
cutoff = self._check_cutoff(cutoff=cutoff)
- return f"(CASE WHEN proba_predict > {cutoff} THEN 1 ELSE 0 END)"
+        if isinstance(pos_label, NoneType) and not (self._is_binary_classifier()):
+            # Multiclass scoring without a positive class: use the
+            # predicted class directly.
+            return "predict_neighbors"
+        elif self._is_binary_classifier():
+            # Binary case: the cutoff on the positive-class probability
+            # decides between the two classes.
+            return f"""
+        (CASE
+            WHEN proba_predict > {cutoff} THEN '{self.classes_[1]}'
+            ELSE '{self.classes_[0]}'
+        END)"""
+        else:
+            # Multiclass case with a positive class: predictions of
+            # 'pos_label' below the cutoff are nullified.
+            return f"""
+        (CASE
+            WHEN proba_predict < {cutoff} AND predict_neighbors = '{pos_label}' THEN NULL
+            ELSE predict_neighbors
+        END)"""
def _compute_accuracy(self) -> float:
"""
@@ -520,7 +970,7 @@ def _confusion_matrix(
ORDER BY proba_predict DESC) AS pos
FROM {self.deploySQL()}) neighbors_table WHERE pos = 1"""
return mt.confusion_matrix(
- self.y, "predict_neighbors", input_relation, classes=self.classes_
+ self.y, "predict_neighbors", input_relation, labels=self.classes_
)
else:
cutoff = self._check_cutoff(cutoff=cutoff)
@@ -1144,15 +1594,26 @@ class LocalOutlierFactor(VerticaModel):
Sander. This object is using pure SQL to compute all
the distances and final score.
- \u26A0 Warning : This algorithm uses a CROSS JOIN
- during computation and is therefore
- computationally expensive at O(n * n),
- where n is the total number of elements.
- Since LocalOutlierFactor uses the p-
- distance, it is highly sensitive to
- unnormalized data.
- A table is created at the end of
- the learning phase.
+    .. warning::
+
+        This algorithm uses a CROSS JOIN during computation and is
+        therefore computationally expensive at O(n * n), where n is
+        the total number of elements. Since LocalOutlierFactor uses
+        the p-distance, it is highly sensitive to unnormalized data.
+        A table is created at the end of the learning phase.
+
+ .. important::
+
+ This algorithm is not Vertica Native and relies solely
+ on SQL for attribute computation. While this model does
+ not take advantage of the benefits provided by a model
+ management system, including versioning and tracking,
+ the SQL code it generates can still be used to create a
+ pipeline.
Parameters
----------
@@ -1170,6 +1631,192 @@ class LocalOutlierFactor(VerticaModel):
p: int, optional
The p of the p-distances (distance metric used
during the model computation).
+
+ Examples
+ ---------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will use the winequality dataset.
+
+ .. code-block:: python
+
+ import verticapy.datasets as vpd
+
+ data = vpd.load_winequality()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+
+ .. ipython:: python
+ :suppress:
+
+ import verticapy.datasets as vpd
+ data = vpd.load_winequality()
+
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``LocalOutlierFactor`` model:
+
+    .. code-block:: python
+
+ from verticapy.machine_learning.vertica import LocalOutlierFactor
+
+ .. ipython:: python
+ :suppress:
+
+ from verticapy.machine_learning.vertica import LocalOutlierFactor
+
+ Then we can create the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model = LocalOutlierFactor(
+ n_neighbors = 10,
+ p = 2,
+ )
+
+ .. important::
+
+ As this model is not native, it solely relies on SQL statements to
+ compute various attributes, storing them within the object. No data
+ is saved in the database.
+
+ Model Training
+ ^^^^^^^^^^^^^^^
+
+ We can now fit the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.fit(data, X = ["density", "sulphates"])
+
+ .. important::
+
+ To train a model, you can directly use the ``vDataFrame`` or the
+ name of the relation stored in the database. The test set is optional
+ and is only used to compute the test metrics. In ``verticapy``, we
+ don't work using ``X`` matrices and ``y`` vectors. Instead, we work
+ directly with lists of predictors and the response name.
+
+ .. hint::
+
+ For clustering and anomaly detection, the use of predictors is
+ optional. In such cases, all available predictors are considered,
+ which can include solely numerical variables or a combination of
+ numerical and categorical variables, depending on the model's
+ capabilities.
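+
+    For example, a sketch that omits ``X`` entirely:
+
+    .. code-block:: python
+
+        # All available predictors of 'data' are used.
+        model.fit(data)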
+
+ .. important::
+
+ As this model is not native, it solely relies on SQL statements to
+ compute various attributes, storing them within the object. No data
+ is saved in the database.
+
+ Prediction
+ ^^^^^^^^^^^
+
+ To find out the LOF score for each datapoint:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.predict()
+ html_file = open("figures/machine_learning_vertica_lof_prediction.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.predict()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_lof_prediction.html
+
+    As shown above, a new column has been created, containing
+    the LOF score.
+
+ Plots - Outliers
+ ^^^^^^^^^^^^^^^^^
+
+ Plots highlighting the outliers can be easily drawn using:
+
+ .. code-block:: python
+
+ model.plot()
+
+ .. ipython:: python
+ :suppress:
+
+ vp.set_option("plotting_lib", "plotly")
+ fig = model.plot(width = 600)
+ fig.write_html("figures/machine_learning_vertica_lof_plot.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_lof_plot.html
+
+ .. important::
+
+ Please refer to :ref:`chart_gallery.lof` for more examples.
+
+ Parameter Modification
+ ^^^^^^^^^^^^^^^^^^^^^^^
+
+ In order to see the parameters:
+
+ .. ipython:: python
+
+ model.get_params()
+
+ And to manually change some of the parameters:
+
+ .. ipython:: python
+
+ model.set_params({'p': 3})
+
+ Model Register
+ ^^^^^^^^^^^^^^
+
+ As this model is not native, it does not support model management and
+ versioning. However, it is possible to use the SQL code it generates
+ for deployment.
"""
# Properties.