diff --git a/verticapy/machine_learning/vertica/preprocessing.py b/verticapy/machine_learning/vertica/preprocessing.py
index 64404e99d..535c9d746 100755
--- a/verticapy/machine_learning/vertica/preprocessing.py
+++ b/verticapy/machine_learning/vertica/preprocessing.py
@@ -57,8 +57,8 @@ def Balance(
ratio: float = 0.5,
) -> vDataFrame:
"""
- Creates a view with an equal distribution of the
- input data based on the response_column.
+ Creates a view with an equal distribution of
+ the input data based on the response_column.
Parameters
----------
@@ -70,18 +70,25 @@ def Balance(
Response column.
method: str, optional
Method used to do the balancing.
- hybrid : Performs over-sampling and
- under-sampling on different
- classes so that each class is
- equally represented.
- over : Over-samples on all classes,
- except the most represented
- class, towards the most
- represented class's cardinality.
- under : Under-samples on all classes,
- except the least represented
- class, towards the least
- represented class's cardinality.
+
+ - hybrid:
+ Performs over-sampling and
+ under-sampling on different
+ classes so that each class is
+ equally represented.
+
+ - over:
+ Over-samples on all classes,
+ except the most represented
+ class, towards the most
+ represented class's cardinality.
+
+ - under:
+ Under-samples on all classes,
+ except the least represented
+ class, towards the least
+ represented class's cardinality.
+
ratio: float, optional
The desired ratio between the majority class
and the minority class. This value has no
@@ -92,6 +99,102 @@ def Balance(
-------
vDataFrame
vDataFrame of the created view.
+
+ Examples
+ --------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will use the Titanic dataset.
+
+ .. code-block:: python
+
+ import verticapy.datasets as vpd
+
+ data = vpd.load_titanic()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_titanic.html
+
+ .. ipython:: python
+ :suppress:
+
+ import verticapy.datasets as vpd
+ data = vpd.load_titanic()
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+
+ Function Application
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``Balance`` function:
+
+ .. ipython:: python
+
+ from verticapy.machine_learning.vertica import Balance
+
+ Then we can directly apply it to the dataset:
+
+ .. ipython:: python
+ :okwarning:
+ :suppress:
+
+ vp.drop("balance_model")
+ result = Balance(name = "balance_model",
+ input_relation = data,
+ y = "survived",
+ method = "under"
+ )
+ html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_preprocessing_balance.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ Balance(name = "balance_model",
+ input_relation = data,
+ y = "survived",
+ method = "under"
+ )
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_preprocessing_balance.html
+
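+ We can check that the classes of the response column are now
+ balanced (a quick sketch; ``result`` is the ``vDataFrame``
+ returned above):
+
+ .. code-block:: python
+
+     # frequency of each class in the balanced view
+     result["survived"].topk()
+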
+ .. seealso::
+ | :py:mod:`verticapy.vDataFrame.sample` :
+ Sampling the dataset.
"""
_executeSQL(
query=f"""
@@ -657,12 +760,221 @@ class Scaler(Preprocessing):
existing model.
method: str, optional
Method used to scale the data.
- zscore : Scaling using the Z-Score.
- (x - avg) / std
- robust_zscore : Scaling using the Robust Z-Score.
- (x - median) / (1.4826 * mad)
- minmax : Normalization using the Min & Max.
- (x - min) / (max - min)
+
+ - zscore:
+ Scaling using the Z-Score.
+
+ .. math::
+
+ Z_{\text{score}} = \frac{x - \text{avg}}{\text{std}}
+
+ - robust_zscore:
+ Scaling using the Robust Z-Score.
+
+ .. math::
+
+ Z_{\text{rscore}} = \frac{x - \text{median}}{1.4826 \times \text{mad}}
+
+ - minmax:
+ Normalization using the Min & Max.
+
+ .. math::
+
+ Z_{\text{minmax}} = \frac{x - \min}{\max - \min}
+
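+ For intuition, the three methods can be sketched in plain
+ Python (a toy illustration only; the actual computation is
+ performed in-database by Vertica):
+
+ .. code-block:: python
+
+     import statistics
+
+     x = [1.0, 2.0, 4.0, 7.0]
+
+     # zscore: (x - avg) / std
+     avg, std = statistics.mean(x), statistics.stdev(x)
+     zscore = [(v - avg) / std for v in x]
+
+     # robust_zscore: (x - median) / (1.4826 * mad)
+     med = statistics.median(x)
+     mad = statistics.median([abs(v - med) for v in x])
+     robust = [(v - med) / (1.4826 * mad) for v in x]
+
+     # minmax: (x - min) / (max - min)
+     lo, hi = min(x), max(x)
+     minmax = [(v - lo) / (hi - lo) for v in x]
+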
+ Examples
+ --------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples `_
+ section on the website.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will use a dummy dataset.
+
+ .. ipython:: python
+
+ data = vp.vDataFrame({"values": [1, 1.01, 1.02, 1.05, 1.024]})
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``Scaler`` model:
+
+ .. ipython:: python
+
+ from verticapy.machine_learning.vertica import Scaler
+
+ Then we can create the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model = Scaler(method = "zscore")
+
+ .. hint::
+
+ In ``verticapy`` 1.0.x and higher, you do not need to specify the
+ model name, as the name is automatically assigned. If you need to
+ re-use the model, you can fetch the model name from the model's
+ attributes.
+
+ .. important::
+
+ The model name is crucial for the model management system and
+ versioning. It's highly recommended to provide a name if you
+ plan to reuse the model later.
+
+ Model Fitting
+ ^^^^^^^^^^^^^^
+
+ We can now fit the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.fit(data)
+
+ .. important::
+
+ To fit a model, you can directly use the ``vDataFrame``
+ or the name of the relation stored in the database.
+
+ Model Parameters
+ ^^^^^^^^^^^^^^^^^
+
+ To fetch the model parameter (the mean), you can use:
+
+ .. ipython:: python
+
+ model.mean_
+
+ Similarly for standard deviation:
+
+ .. ipython:: python
+
+ model.std_
+
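+ These fitted attributes can be used to reproduce the scaling
+ by hand (a sketch; we assume ``mean_`` and ``std_`` hold one
+ entry per input column):
+
+ .. code-block:: python
+
+     # manually apply (x - avg) / std to the "values" column
+     avg, std = model.mean_[0], model.std_[0]
+     scaled = [(v - avg) / std for v in [1, 1.01, 1.02, 1.05, 1.024]]
+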
+ Conversion/Transformation
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ To get the scaled dataset, we can use the ``transform``
+ function. Let us transform the data:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.transform(data)
+
+ Please refer to
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler.transform`
+ for more details on transforming a ``vDataFrame``.
+
+ Similarly, you can perform the inverse transform to get
+ the original features using:
+
+ .. code-block:: python
+
+ model.inverse_transform(data_transformed)
+
+ The variable ``data_transformed`` is the scaled dataset.
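+
+ For example, the two operations can be chained (a short
+ sketch using the objects from above):
+
+ .. code-block:: python
+
+     # scale, then invert the scaling to recover the original values
+     data_transformed = model.transform(data)
+     data_original = model.inverse_transform(data_transformed)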
+
+ Model Register
+ ^^^^^^^^^^^^^^^
+
+ In order to register the model for tracking and versioning:
+
+ .. code-block:: python
+
+ model.register("model_v1")
+
+ Please refer to
+ :ref:`notebooks/ml/model_tracking_versioning/index.html`
+ for more details on model tracking and versioning.
+
+ Model Exporting
+ ^^^^^^^^^^^^^^^^
+
+ **To Memmodel**
+
+ .. code-block:: python
+
+ model.to_memmodel()
+
+ .. note::
+
+ ``MemModel`` objects serve as in-memory representations of
+ machine learning models. They can be used for both in-database
+ and in-memory prediction tasks. These objects can be pickled
+ in the same way that you would pickle a ``scikit-learn`` model.
+
+ The preceding methods for exporting the model use ``MemModel``,
+ and it is recommended to use ``MemModel`` directly.
+
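+ For example, a ``MemModel`` can be persisted with ``pickle``
+ (a minimal sketch; the file name is illustrative):
+
+ .. code-block:: python
+
+     import pickle
+
+     # persist the in-memory representation of the model
+     mmodel = model.to_memmodel()
+     with open("scaler_memmodel.pkl", "wb") as f:
+         pickle.dump(mmodel, f)
+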
+ **SQL**
+
+ To get the SQL query, use the following:
+
+ .. ipython:: python
+
+ model.to_sql()
+
+ **To Python**
+
+ To obtain the prediction function in Python syntax, use the
+ following code:
+
+ .. ipython:: python
+
+ X = [[1]]
+ model.to_python()(X)
+
+ .. hint::
+
+ The
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler.to_python`
+ method is used to scale the data. For specific details on how
+ to use this method for different model types, refer to the
+ relevant documentation for each model.
+
+ .. seealso::
+ | :py:mod:`verticapy.machine_learning.vertica.preprocessing.StandardScaler` :
+     ``Scaler`` with ``method`` set to ``zscore``.
+ | :py:mod:`verticapy.machine_learning.vertica.preprocessing.RobustScaler` :
+     ``Scaler`` with ``method`` set to ``robust_zscore``.
+ | :py:mod:`verticapy.machine_learning.vertica.preprocessing.MinMaxScaler` :
+     ``Scaler`` with ``method`` set to ``minmax``.
+
"""
# Properties.
@@ -742,7 +1054,15 @@ def to_memmodel(self) -> mm.Scaler:
class StandardScaler(Scaler):
- """i.e. Scaler with param method = 'zscore'"""
+ """
+ i.e. Scaler with param method = 'zscore'
+
+ .. note::
+
+ This is a child class. See
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+ for more details and examples.
+ """
@property
def _attributes(self) -> list[str]:
@@ -753,7 +1073,15 @@ def __init__(self, name: str = None, overwrite_model: bool = False) -> None:
class RobustScaler(Scaler):
- """i.e. Scaler with param method = 'robust_zscore'"""
+ """
+ i.e. Scaler with param method = 'robust_zscore'
+
+ .. note::
+
+ This is a child class. See
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+ for more details and examples.
+ """
@property
def _attributes(self) -> list[str]:
@@ -764,7 +1092,15 @@ def __init__(self, name: str = None, overwrite_model: bool = False) -> None:
class MinMaxScaler(Scaler):
- """i.e. Scaler with param method = 'minmax'"""
+ """
+ i.e. Scaler with param method = 'minmax'
+
+ .. note::
+
+ This is a child class. See
+ :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+ for more details and examples.
+ """
@property
def _attributes(self) -> list[str]:
@@ -947,8 +1283,8 @@ class OneHotEncoder(Preprocessing):
^^^^^^^^^^^^^^^^^^^^^^^^^^
To get the transformed dataset in the form that is encoded,
- we can use the ``transform`` function. Let us transform the data
- and display the first 20 datapoints.
+ we can use the ``transform`` function. Let us transform the
+ data and display the first 20 datapoints.
.. ipython:: python
:okwarning:
@@ -970,7 +1306,7 @@ class OneHotEncoder(Preprocessing):
components.
Model Register
- ^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^
In order to register the model for tracking and versioning: