diff --git a/verticapy/machine_learning/vertica/preprocessing.py b/verticapy/machine_learning/vertica/preprocessing.py
index 64404e99d..535c9d746 100755
--- a/verticapy/machine_learning/vertica/preprocessing.py
+++ b/verticapy/machine_learning/vertica/preprocessing.py
@@ -57,8 +57,8 @@ def Balance(
     ratio: float = 0.5,
 ) -> vDataFrame:
     """
-    Creates a view with an equal distribution of the
-    input data based on the response_column.
+    Creates a view with an equal distribution of
+    the input data based on the response_column.
 
     Parameters
     ----------
@@ -70,18 +70,25 @@ def Balance(
         Response column.
     method: str, optional
         Method used to do the balancing.
-        hybrid : Performs over-sampling and
-                 under-sampling on different
-                 classes so that each class is
-                 equally represented.
-        over   : Over-samples on all classes,
-                 except the most represented
-                 class, towards the most
-                 represented class's cardinality.
-        under  : Under-samples on all classes,
-                 except the least represented
-                 class, towards the least
-                 represented class's cardinality.
+
+        - hybrid:
+            Performs over-sampling and
+            under-sampling on different
+            classes so that each class is
+            equally represented.
+
+        - over:
+            Over-samples on all classes,
+            except the most represented
+            class, towards the most
+            represented class's cardinality.
+
+        - under:
+            Under-samples on all classes,
+            except the least represented
+            class, towards the least
+            represented class's cardinality.
+
     ratio: float, optional
         The desired ratio between the
         majority class and the minority class. This value has no
@@ -92,6 +99,102 @@
 
     Returns
     -------
     vDataFrame
         vDataFrame of the created view.
+
+    Examples
+    --------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+
+    Load data for machine learning
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. ipython:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries. This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median", which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended without interfering with functions from other
+        libraries.
+
+    For this example, we will use the Titanic dataset.
+
+    .. code-block:: python
+
+        import verticapy.datasets as vpd
+
+        data = vpd.load_titanic()
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_titanic.html
+
+    .. ipython:: python
+        :suppress:
+
+        import verticapy.datasets as vpd
+        data = vpd.load_titanic()
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
+
+    Function Application
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    First, we import the ``Balance`` function:
+
+    .. ipython:: python
+
+        from verticapy.machine_learning.vertica import Balance
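+
+    To build intuition for the three balancing methods described above,
+    here is a rough sketch in plain Python of the target cardinalities
+    each method aims for, assuming hypothetical class counts (the actual
+    resampling happens in-database):
+
+    .. code-block:: python
+
+        # Hypothetical class counts for a binary response column.
+        counts = {0: 800, 1: 300}
+
+        # 'over': over-sample every class towards the most represented
+        # class's cardinality -> {0: 800, 1: 800}.
+        over_target = {c: max(counts.values()) for c in counts}
+
+        # 'under': under-sample every class towards the least represented
+        # class's cardinality -> {0: 300, 1: 300}.
+        under_target = {c: min(counts.values()) for c in counts}
+
+        # 'hybrid': mixes over- and under-sampling so that every class
+        # ends up equally represented.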
+
+    Then we can directly apply it to the dataset:
+
+    .. ipython:: python
+        :okwarning:
+        :suppress:
+
+        vp.drop("balance_model")
+        result = Balance(name = "balance_model",
+                         input_relation = data,
+                         y = "survived",
+                         method = "under"
+        )
+        html_file = open("SPHINX_DIRECTORY/figures/machine_learning_vertica_preprocessing_balance.html", "w")
+        html_file.write(result._repr_html_())
+        html_file.close()
+
+    .. code-block:: python
+
+        Balance(name = "balance_model",
+                input_relation = data,
+                y = "survived",
+                method = "under"
+        )
+
+    .. raw:: html
+        :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_preprocessing_balance.html
+
+    .. seealso::
+
+        | :py:mod:`verticapy.vDataFrame.sample` :
+            Sampling the dataset.
     """
     _executeSQL(
         query=f"""
@@ -657,12 +760,221 @@ class Scaler(Preprocessing):
         existing model.
     method: str, optional
         Method used to scale the data.
-        zscore        : Scaling using the Z-Score.
-                        (x - avg) / std
-        robust_zscore : Scaling using the Robust Z-Score.
-                        (x - median) / (1.4826 * mad)
-        minmax        : Normalization using the Min & Max.
-                        (x - min) / (max - min)
+
+        - zscore:
+            Scaling using the Z-Score.
+
+            .. math::
+
+                Z_{score} = \frac{x - \text{avg}}{\text{std}}
+
+        - robust_zscore:
+            Scaling using the Robust Z-Score.
+
+            .. math::
+
+                Z_{rscore} = \frac{x - \text{median}}{1.4826 \times \text{mad}}
+
+        - minmax:
+            Normalization using the Min & Max.
+
+            .. math::
+
+                Z_{minmax} = \frac{x - \min}{\max - \min}
+
+    Examples
+    --------
+
+    The following examples provide a basic understanding of usage.
+    For more detailed examples, please refer to the
+    :ref:`user_guide.machine_learning` or the
+    `Examples `_
+    section on the website.
+
+    Load data for machine learning
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    We import ``verticapy``:
+
+    .. ipython:: python
+
+        import verticapy as vp
+
+    .. hint::
+
+        By assigning an alias to ``verticapy``, we mitigate the risk of code
+        collisions with other libraries. This precaution is necessary
+        because verticapy uses commonly known function names like "average"
+        and "median", which can potentially lead to naming conflicts.
+        The use of an alias ensures that the functions from verticapy are
+        used as intended without interfering with functions from other
+        libraries.
+
+    For this example, we will use a dummy dataset.
+
+    .. ipython:: python
+
+        data = vp.vDataFrame({"values": [1, 1.01, 1.02, 1.05, 1.024]})
+
+    .. note::
+
+        VerticaPy offers a wide range of sample datasets that are
+        ideal for training and testing purposes. You can explore
+        the full list of available datasets in the :ref:`api.datasets`,
+        which provides detailed information on each dataset
+        and how to use them effectively. These datasets are invaluable
+        resources for honing your data analysis and machine learning
+        skills within the VerticaPy environment.
+
+    Model Initialization
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    First, we import the ``Scaler`` model:
+
+    .. ipython:: python
+
+        from verticapy.machine_learning.vertica import Scaler
+
+    Then we can create the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model = Scaler(method = "zscore")
+
+    .. hint::
+
+        In ``verticapy`` 1.0.x and higher, you do not need to specify the
+        model name, as the name is automatically assigned. If you need to
+        reuse the model, you can fetch the model name from the model's
+        attributes.
+
+    .. important::
+
+        The model name is crucial for the model management system and
+        versioning. It's highly recommended to provide a name if you
+        plan to reuse the model later.
+
+    Model Fitting
+    ^^^^^^^^^^^^^^
+
+    We can now fit the model:
+
+    .. ipython:: python
+        :okwarning:
+
+        model.fit(data)
+
+    .. important::
+
+        To fit a model, you can directly use the ``vDataFrame``
+        or the name of the relation stored in the database.
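+
+    As a quick sanity check, the statistics the model has just learned
+    can be reproduced with the standard library (a minimal sketch on the
+    dummy ``values`` column above; depending on the exact estimator used
+    in-database, e.g. sample vs. population standard deviation, the
+    values may differ slightly):
+
+    .. code-block:: python
+
+        import statistics
+
+        values = [1, 1.01, 1.02, 1.05, 1.024]
+        mu = statistics.mean(values)      # compare with model.mean_ below
+        sigma = statistics.stdev(values)  # sample standard deviation
+        scaled = [(x - mu) / sigma for x in values]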
+
+    Model Parameters
+    ^^^^^^^^^^^^^^^^^
+
+    To fetch a model parameter, such as the mean, you can use:
+
+    .. ipython:: python
+
+        model.mean_
+
+    Similarly, for the standard deviation:
+
+    .. ipython:: python
+
+        model.std_
+
+    Conversion/Transformation
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    To get the scaled dataset, we can use the ``transform``
+    function. Let us transform the data:
+
+    .. ipython:: python
+        :okwarning:
+
+        model.transform(data)
+
+    Please refer to
+    :py:mod:`verticapy.machine_learning.Scaler.transform`
+    for more details on transforming a ``vDataFrame``.
+
+    Similarly, you can perform the inverse transform to recover
+    the original features using:
+
+    .. code-block:: python
+
+        model.inverse_transform(data_transformed)
+
+    The variable ``data_transformed`` is the scaled dataset.
+
+    Model Register
+    ^^^^^^^^^^^^^^^
+
+    In order to register the model for tracking and versioning:
+
+    .. code-block:: python
+
+        model.register("model_v1")
+
+    Please refer to
+    :ref:`notebooks/ml/model_tracking_versioning/index.html`
+    for more details on model tracking and versioning.
+
+    Model Exporting
+    ^^^^^^^^^^^^^^^^
+
+    **To MemModel**
+
+    .. code-block:: python
+
+        model.to_memmodel()
+
+    .. note::
+
+        ``MemModel`` objects serve as in-memory representations of
+        machine learning models. They can be used for both in-database
+        and in-memory prediction tasks. These objects can be pickled
+        in the same way that you would pickle a ``scikit-learn`` model.
+
+        The preceding methods for exporting the model use ``MemModel``,
+        and it is recommended to use ``MemModel`` directly.
+
+    **SQL**
+
+    To get the SQL query, use the following:
+
+    .. ipython:: python
+
+        model.to_sql()
+
+    **To Python**
+
+    To obtain the prediction function in Python syntax, use the
+    following code:
+
+    .. ipython:: python
+
+        X = [[1]]
+        model.to_python()(X)
+
+    .. hint::
+
+        The
+        :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler.to_python`
+        method is used to scale the data. For specific details on how
+        to use this method for different model types, refer to the
+        relevant documentation for each model.
+
+    .. seealso::
+
+        | :py:mod:`verticapy.machine_learning.vertica.preprocessing.StandardScaler` :
+            Scaler with method set to ``zscore``.
+        | :py:mod:`verticapy.machine_learning.vertica.preprocessing.RobustScaler` :
+            Scaler with method set to ``robust_zscore``.
+        | :py:mod:`verticapy.machine_learning.vertica.preprocessing.MinMaxScaler` :
+            Scaler with method set to ``minmax``.
     """
 
     # Properties.
@@ -742,7 +1054,15 @@ def to_memmodel(self) -> mm.Scaler:
 
 
 class StandardScaler(Scaler):
-    """i.e. Scaler with param method = 'zscore'"""
+    """
+    i.e. Scaler with param method = 'zscore'
+
+    .. note::
+
+        This is a child class. See
+        :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+        for more details and examples.
+    """
 
     @property
     def _attributes(self) -> list[str]:
@@ -753,7 +1073,15 @@ def __init__(self, name: str = None, overwrite_model: bool = False) -> None:
 
 
 class RobustScaler(Scaler):
-    """i.e. Scaler with param method = 'robust_zscore'"""
+    """
+    i.e. Scaler with param method = 'robust_zscore'
+
+    .. note::
+
+        This is a child class. See
+        :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler`
+        for more details and examples.
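+
+    A minimal plain-Python sketch of the robust Z-score formula from the
+    parent class, using a small sample list (illustrative only; the
+    actual computation happens in-database):
+
+    .. code-block:: python
+
+        import statistics
+
+        values = [1, 1.01, 1.02, 1.05, 1.024]
+        med = statistics.median(values)
+        mad = statistics.median([abs(x - med) for x in values])
+        scaled = [(x - med) / (1.4826 * mad) for x in values]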
+ """ @property def _attributes(self) -> list[str]: @@ -764,7 +1092,15 @@ def __init__(self, name: str = None, overwrite_model: bool = False) -> None: class MinMaxScaler(Scaler): - """i.e. Scaler with param method = 'minmax'""" + """ + i.e. Scaler with param method = 'minmax' + + .. note:: + + This is a child class. See + :py:mod:`verticapy.machine_learning.vertica.preprocessing.Scaler` + for more details and examples. + """ @property def _attributes(self) -> list[str]: @@ -947,8 +1283,8 @@ class OneHotEncoder(Preprocessing): ^^^^^^^^^^^^^^^^^^^^^^^^^^ To get the transformed dataset in the form that is encoded, - we can use the ``transform`` function. Let us transform the data - and display the first 20 datapoints. + we can use the ``transform`` function. Let us transform the + data and display the first 20 datapoints. .. ipython:: python :okwarning: @@ -970,7 +1306,7 @@ class OneHotEncoder(Preprocessing): components. Model Register - ^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^ In order to register the model for tracking and versioning: