diff --git a/verticapy/machine_learning/vertica/preprocessing.py b/verticapy/machine_learning/vertica/preprocessing.py
index 64404e99d..535c9d746 100755
--- a/verticapy/machine_learning/vertica/preprocessing.py
+++ b/verticapy/machine_learning/vertica/preprocessing.py
@@ -57,8 +57,8 @@ def Balance(
ratio: float = 0.5,
) -> vDataFrame:
- Creates a view with an equal distribution of the
- input data based on the response_column.
+ Creates a view with an equal distribution of
+ the input data based on the response_column.
@@ -70,18 +70,25 @@ def Balance(
Response column.
method: str, optional
Method used to do the balancing.
- hybrid : Performs over-sampling and
- under-sampling on different
- classes so that each class is
- equally represented.
- over : Over-samples on all classes,
- except the most represented
- class, towards the most
- represented class's cardinality.
- under : Under-samples on all classes,
- except the least represented
- class, towards the least
- represented class's cardinality.
+ - hybrid :
+ Performs over-sampling and
+ under-sampling on different
+ classes so that each class is
+ equally represented.
+ - over :
+ Over-samples on all classes,
+ except the most represented
+ class, towards the most
+ represented class's cardinality.
+ - under:
+ Under-samples on all classes,
+ except the least represented
+ class, towards the least
+ represented class's cardinality.
ratio: float, optional
The desired ratio between the majority class
and the minority class. This value has no
@@ -92,6 +99,102 @@ def Balance(
vDataFrame of the created view.
+ Examples
+ --------
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples <https://www.vertica.com/python/examples/>`_
+ section on the website.
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ We import ``verticapy``:
+ .. ipython:: python
+ import verticapy as vp
+ .. hint::
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+ For this example, we will use the Titanic dataset.
+ .. code-block:: python
+ import verticapy.datasets as vpd
+ data = vpd.load_titanic()
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_titanic.html
+ .. ipython:: python
+ :suppress:
+ import verticapy.datasets as vpd
+ data = vpd.load_titanic()
+ .. note::
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+ Function Application
+ ^^^^^^^^^^^^^^^^^^^^^
+ First we import the ``Balance`` function:
+ .. ipython:: python
+ from verticapy.machine_learning.vertica import Balance
+ Then we can directly apply it to the dataset:
+ .. ipython:: python
+ :okwarning:
+ :suppress:
+ vp.drop("balance_model")
+ result = Balance(name = "balance_model",
+ input_relation = data,
+ y = "survived",
+ method = "under"
+ )
+ html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_preprocessing_balance.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+ .. code-block:: python
+ Balance(name = "balance_model",
+ input_relation = data,
+ y = "survived",
+ method = "under"
+ )
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_preprocessing_balance.html
+ .. seealso::
+ | :py:mod:`verticapy.vDataFrame.sample` :
+ Sampling the dataset.
@@ -657,12 +760,221 @@ class Scaler(Preprocessing):
existing model.
method: str, optional
Method used to scale the data.
- zscore : Scaling using the Z-Score.
- (x - avg) / std
- robust_zscore : Scaling using the Robust Z-Score.
- (x - median) / (1.4826 * mad)
- minmax : Normalization using the Min & Max.
- (x - min) / (max - min)
+ - zscore:
+ Scaling using the Z-Score.
+ .. math::
+ Z_{score} = (x - avg) / std
+ - robust_zscore:
+ Scaling using the Robust Z-Score.
+ .. math::
+ Z_{rscore} = (x - median) / (1.4826 * mad)
+ - minmax:
+ Normalization using the Min & Max.
+ .. math::
+ Z_{minmax} = (x - min) / (max - min)
+ Examples
+ --------
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples <https://www.vertica.com/python/examples/>`_
+ section on the website.
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ We import ``verticapy``:
+ .. ipython:: python
+ import verticapy as vp
+ .. hint::
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+ For this example, we will use a dummy dataset.
+ .. ipython:: python
+ data = vp.vDataFrame({"values": [1, 1.01, 1.02, 1.05, 1.024]})
+ .. note::
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+ First we import the ``Scaler`` model:
+ .. ipython:: python
+ from verticapy.machine_learning.vertica import Scaler
+ Then we can create the model:
+ .. ipython:: python
+ :okwarning:
+ model = Scaler(method = "zscore")
+ .. hint::
+ In ``verticapy`` 1.0.x and higher, you do not need to specify the
+ model name, as the name is automatically assigned. If you need to
+ re-use the model, you can fetch the model name from the model's
+ attributes.
+ .. important::
+ The model name is crucial for the model management system and
+ versioning. It's highly recommended to provide a name if you
+ plan to reuse the model later.
+ Model Fitting
+ ^^^^^^^^^^^^^^
+ We can now fit the model:
+ .. ipython:: python
+ :okwarning:
+ model.fit(data)
+ .. important::
+ To fit a model, you can directly use the ``vDataFrame``
+ or the name of the relation stored in the database.
+ Model Parameters
+ ^^^^^^^^^^^^^^^^^
+ To fetch the model parameter (mean) you can use:
+ .. ipython:: python
+ model.mean_
+ Similarly for standard deviation:
+ .. ipython:: python
+ model.std_
+ Conversion/Transformation
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
+ To get the scaled dataset, we can use the ``transform``
+ function. Let us transform the data:
+ .. ipython:: python
+ :okwarning:
+ model.transform(data)
+ Please refer to
+ :py:mod:`verticapy.machine_learning.Scaler.transform`
+ for more details on transforming a ``vDataFrame``.
+ Similarly, you can perform the inverse transform to get
+ the original features using:
+ .. code-block:: python
+ model.inverse_transform(data_transformed)
+ The variable ``data_transformed`` is the scaled dataset.
+ Model Register
+ ^^^^^^^^^^^^^^^
+ In order to register the model for tracking and versioning:
+ .. code-block:: python
+ model.register("model_v1")
+ Please refer to
+ :ref:`notebooks/ml/model_tracking_versioning/index.html`
+ for more details on model tracking and versioning.
+ Model Exporting
+ ^^^^^^^^^^^^^^^^
+ **To Memmodel**
+ .. code-block:: python
+ model.to_memmodel()
+ .. note::
+ ``MemModel`` objects serve as in-memory representations of
+ machine learning models. They can be used for both in-database
+ and in-memory prediction tasks. These objects can be pickled
+ in the same way that you would pickle a ``scikit-learn`` model.
+ The following methods for exporting the model use ``MemModel``,
+ and it is recommended to use ``MemModel`` directly.
+ **SQL**
+ To get the SQL query, use the following:
+ .. ipython:: python
+ model.to_sql()
+ **To Python**
+ To obtain the prediction function in Python syntax, use the
+ following code:
+ .. ipython:: python
+ X = [[1]]
+ model.to_python()(X)
+ .. hint::
+ The
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler.to_python`
+ method is used to scale the data. For specific details on how
+ to use this method for different model types, refer to the
+ relevant documentation for each model.
+ .. seealso::
+ | :py:mod:`verticapy.machine_learning.vertica.preprocessing.StandardScaler` :
+ Scaler with method set as ``zscore``.
+ | :py:mod:`verticapy.machine_learning.vertica.preprocessing.RobustScaler` :
+ Scaler with method set as ``robust_zscore``.
+ | :py:mod:`verticapy.machine_learning.vertica.preprocessing.MinMaxScaler` :
+ Scaler with method set as ``minmax``.
# Properties.
@@ -742,7 +1054,15 @@ def to_memmodel(self) -> mm.Scaler:
class StandardScaler(Scaler):
- """i.e. Scaler with param method = 'zscore'"""
+ """
+ i.e. Scaler with param method = 'zscore'
+ .. note::
+ This is a child class. See
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+ for more details and examples.
+ """
def _attributes(self) -> list[str]:
@@ -753,7 +1073,15 @@ def __init__(self, name: str = None, overwrite_model: bool = False) -> None:
class RobustScaler(Scaler):
- """i.e. Scaler with param method = 'robust_zscore'"""
+ """
+ i.e. Scaler with param method = 'robust_zscore'
+ .. note::
+ This is a child class. See
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+ for more details and examples.
+ """
def _attributes(self) -> list[str]:
@@ -764,7 +1092,15 @@ def __init__(self, name: str = None, overwrite_model: bool = False) -> None:
class MinMaxScaler(Scaler):
- """i.e. Scaler with param method = 'minmax'"""
+ """
+ i.e. Scaler with param method = 'minmax'
+ .. note::
+ This is a child class. See
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+ for more details and examples.
+ """
def _attributes(self) -> list[str]:
@@ -947,8 +1283,8 @@ class OneHotEncoder(Preprocessing):
To get the transformed dataset in the form that is encoded,
- we can use the ``transform`` function. Let us transform the data
- and display the first 20 datapoints.
+ we can use the ``transform`` function. Let us transform the
+ data and display the first 20 datapoints.
.. ipython:: python
@@ -970,7 +1306,7 @@ class OneHotEncoder(Preprocessing):
Model Register
- ^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^
In order to register the model for tracking and versioning: