Correcting balance function and adding docstrings (#859)

- Balance -> balance (it is not a class but a function) - docstring to explain what to do. - formatting.
vertica · Nov 14, 2023 · d039706 · d039706
1 parent ef0138f
commit d039706
Show file tree

Hide file tree

Showing 13 changed files with 458 additions and 34 deletions.
diff --git a/docs/source/machine_learning_vertica_decomposition.rst b/docs/source/machine_learning_vertica_decomposition.rst
@@ -402,8 +402,7 @@ Balance
 .. autosummary::
    :toctree: api/
 
-   preprocessing.Balance
-
+   preprocessing.balance
 
 Count Vectorizer (Beta)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/verticapy/_utils/_sql/_vertica_version.py b/verticapy/_utils/_sql/_vertica_version.py
@@ -25,7 +25,7 @@
     "ARIMA": [23, 4, 0],
     "AR": [11, 0, 0],
     "ARMA": [12, 0, 4],
-    "Balance": [8, 1, 1],
+    "balance": [8, 1, 1],
     "BernoulliNB": [8, 0, 0],
     "BisectingKMeans": [9, 3, 1],
     "CategoricalNB": [8, 0, 0],

diff --git a/verticapy/learn/preprocessing/__init__.py b/verticapy/learn/preprocessing/__init__.py
@@ -15,7 +15,7 @@
 permissions and limitations under the License.
 """
 from verticapy.machine_learning.vertica.preprocessing import (
-    Balance,
+    balance,
     CountVectorizer,
     MinMaxScaler,
     Scaler,

diff --git a/verticapy/machine_learning/vertica/__init__.py b/verticapy/machine_learning/vertica/__init__.py
@@ -58,7 +58,7 @@
 from verticapy.machine_learning.vertica.pipeline import Pipeline
 from verticapy.machine_learning.vertica.pmml import PMMLModel
 from verticapy.machine_learning.vertica.preprocessing import (
-    Balance,
+    balance,
     CountVectorizer,
     MinMaxScaler,
     Scaler,

diff --git a/verticapy/machine_learning/vertica/cluster.py b/verticapy/machine_learning/vertica/cluster.py
@@ -2590,6 +2590,59 @@ class NearestCentroid(MulticlassClassifier):
         data = vpd.load_iris()
         train, test = data.train_test_split(test_size = 0.2)
 
+    Balancing the Dataset
+    ^^^^^^^^^^^^^^^^^^^^^^
+
+    In VerticaPy, balancing a dataset to address class imbalances
+    is made straightforward through the
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function within the ``preprocessing`` module. This function
+    enables users to rectify skewed class distributions efficiently.
+    By specifying the target variable and setting parameters like
+    the method for balancing, users can effortlessly achieve a more
+    equitable representation of classes in their dataset.
+    Whether opting for over-sampling, under-sampling, or a combination
+    of both, VerticaPy's
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function streamlines the process, empowering users to enhance the
+    performance and fairness of their machine learning models trained
+    on imbalanced data.
+
+    To balance the dataset, use the following syntax.
+
+    .. code-block:: python
+
+        from verticapy.machine_learning.vertica.preprocessing import balance
+
+        balanced_train = balance(
+            name = "my_schema.train_balanced",
+            input_relation = train,
+            y = "Species",
+            method = "hybrid",
+        )
+
+    .. note::
+
+        With this code, a view named ``train_balanced`` is created in the
+        ``my_schema`` schema. It can then be used to train the model. In the
+        rest of the example, we will work with the full dataset.
+
+    .. hint::
+
+        Balancing the dataset is a crucial step in improving the accuracy
+        of machine learning models, particularly when faced with imbalanced
+        class distributions. By addressing disparities in the number of
+        instances across different classes, the model becomes more adept at
+        learning patterns from all classes rather than being biased towards
+        the majority class. This, in turn, enhances the model's ability to
+        make accurate predictions for under-represented classes. The balanced
+        dataset ensures that the model is not dominated by the majority class
+        and, as a result, leads to more robust and unbiased model performance.
+        Therefore, by employing techniques such as over-sampling, under-sampling,
+        or a combination of both during dataset preparation, practitioners can
+        significantly contribute to achieving higher accuracy and better
+        generalization of their machine learning models.
+
     Model Initialization
     ^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py
@@ -1723,6 +1723,59 @@ class RandomForestClassifier(MulticlassClassifier, RandomForest):
         data = vpd.load_winequality()
         train, test = data.train_test_split(test_size = 0.2)
 
+    Balancing the Dataset
+    ^^^^^^^^^^^^^^^^^^^^^^
+
+    In VerticaPy, balancing a dataset to address class imbalances
+    is made straightforward through the
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function within the ``preprocessing`` module. This function
+    enables users to rectify skewed class distributions efficiently.
+    By specifying the target variable and setting parameters like
+    the method for balancing, users can effortlessly achieve a more
+    equitable representation of classes in their dataset.
+    Whether opting for over-sampling, under-sampling, or a combination
+    of both, VerticaPy's
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function streamlines the process, empowering users to enhance the
+    performance and fairness of their machine learning models trained
+    on imbalanced data.
+
+    To balance the dataset, use the following syntax.
+
+    .. code-block:: python
+
+        from verticapy.machine_learning.vertica.preprocessing import balance
+
+        balanced_train = balance(
+            name = "my_schema.train_balanced",
+            input_relation = train,
+            y = "good",
+            method = "hybrid",
+        )
+
+    .. note::
+
+        With this code, a view named ``train_balanced`` is created in the
+        ``my_schema`` schema. It can then be used to train the model. In the
+        rest of the example, we will work with the full dataset.
+
+    .. hint::
+
+        Balancing the dataset is a crucial step in improving the accuracy
+        of machine learning models, particularly when faced with imbalanced
+        class distributions. By addressing disparities in the number of
+        instances across different classes, the model becomes more adept at
+        learning patterns from all classes rather than being biased towards
+        the majority class. This, in turn, enhances the model's ability to
+        make accurate predictions for under-represented classes. The balanced
+        dataset ensures that the model is not dominated by the majority class
+        and, as a result, leads to more robust and unbiased model performance.
+        Therefore, by employing techniques such as over-sampling, under-sampling,
+        or a combination of both during dataset preparation, practitioners can
+        significantly contribute to achieving higher accuracy and better
+        generalization of their machine learning models.
+
     Model Initialization
     ^^^^^^^^^^^^^^^^^^^^^
 
@@ -2474,6 +2527,59 @@ class XGBClassifier(MulticlassClassifier, XGBoost):
         data = vpd.load_winequality()
         train, test = data.train_test_split(test_size = 0.2)
 
+    Balancing the Dataset
+    ^^^^^^^^^^^^^^^^^^^^^^
+
+    In VerticaPy, balancing a dataset to address class imbalances
+    is made straightforward through the
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function within the ``preprocessing`` module. This function
+    enables users to rectify skewed class distributions efficiently.
+    By specifying the target variable and setting parameters like
+    the method for balancing, users can effortlessly achieve a more
+    equitable representation of classes in their dataset.
+    Whether opting for over-sampling, under-sampling, or a combination
+    of both, VerticaPy's
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function streamlines the process, empowering users to enhance the
+    performance and fairness of their machine learning models trained
+    on imbalanced data.
+
+    To balance the dataset, use the following syntax.
+
+    .. code-block:: python
+
+        from verticapy.machine_learning.vertica.preprocessing import balance
+
+        balanced_train = balance(
+            name = "my_schema.train_balanced",
+            input_relation = train,
+            y = "good",
+            method = "hybrid",
+        )
+
+    .. note::
+
+        With this code, a view named ``train_balanced`` is created in the
+        ``my_schema`` schema. It can then be used to train the model. In the
+        rest of the example, we will work with the full dataset.
+
+    .. hint::
+
+        Balancing the dataset is a crucial step in improving the accuracy
+        of machine learning models, particularly when faced with imbalanced
+        class distributions. By addressing disparities in the number of
+        instances across different classes, the model becomes more adept at
+        learning patterns from all classes rather than being biased towards
+        the majority class. This, in turn, enhances the model's ability to
+        make accurate predictions for under-represented classes. The balanced
+        dataset ensures that the model is not dominated by the majority class
+        and, as a result, leads to more robust and unbiased model performance.
+        Therefore, by employing techniques such as over-sampling, under-sampling,
+        or a combination of both during dataset preparation, practitioners can
+        significantly contribute to achieving higher accuracy and better
+        generalization of their machine learning models.
+
     Model Initialization
     ^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py
@@ -2847,6 +2847,59 @@ class LogisticRegression(LinearModelClassifier, BinaryClassifier):
         data = vpd.load_winequality()
         train, test = data.train_test_split(test_size = 0.2)
 
+    Balancing the Dataset
+    ^^^^^^^^^^^^^^^^^^^^^^
+
+    In VerticaPy, balancing a dataset to address class imbalances
+    is made straightforward through the
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function within the ``preprocessing`` module. This function
+    enables users to rectify skewed class distributions efficiently.
+    By specifying the target variable and setting parameters like
+    the method for balancing, users can effortlessly achieve a more
+    equitable representation of classes in their dataset.
+    Whether opting for over-sampling, under-sampling, or a combination
+    of both, VerticaPy's
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function streamlines the process, empowering users to enhance the
+    performance and fairness of their machine learning models trained
+    on imbalanced data.
+
+    To balance the dataset, use the following syntax.
+
+    .. code-block:: python
+
+        from verticapy.machine_learning.vertica.preprocessing import balance
+
+        balanced_train = balance(
+            name = "my_schema.train_balanced",
+            input_relation = train,
+            y = "good",
+            method = "hybrid",
+        )
+
+    .. note::
+
+        With this code, a view named ``train_balanced`` is created in the
+        ``my_schema`` schema. It can then be used to train the model. In the
+        rest of the example, we will work with the full dataset.
+
+    .. hint::
+
+        Balancing the dataset is a crucial step in improving the accuracy
+        of machine learning models, particularly when faced with imbalanced
+        class distributions. By addressing disparities in the number of
+        instances across different classes, the model becomes more adept at
+        learning patterns from all classes rather than being biased towards
+        the majority class. This, in turn, enhances the model's ability to
+        make accurate predictions for under-represented classes. The balanced
+        dataset ensures that the model is not dominated by the majority class
+        and, as a result, leads to more robust and unbiased model performance.
+        Therefore, by employing techniques such as over-sampling, under-sampling,
+        or a combination of both during dataset preparation, practitioners can
+        significantly contribute to achieving higher accuracy and better
+        generalization of their machine learning models.
+
     Model Initialization
     ^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/verticapy/machine_learning/vertica/naive_bayes.py b/verticapy/machine_learning/vertica/naive_bayes.py
@@ -188,6 +188,59 @@ class NaiveBayes(MulticlassClassifier):
         data = vpd.load_iris()
         train, test = data.train_test_split(test_size = 0.2)
 
+    Balancing the Dataset
+    ^^^^^^^^^^^^^^^^^^^^^^
+
+    In VerticaPy, balancing a dataset to address class imbalances
+    is made straightforward through the
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function within the ``preprocessing`` module. This function
+    enables users to rectify skewed class distributions efficiently.
+    By specifying the target variable and setting parameters like
+    the method for balancing, users can effortlessly achieve a more
+    equitable representation of classes in their dataset.
+    Whether opting for over-sampling, under-sampling, or a combination
+    of both, VerticaPy's
+    :py:func:`verticapy.machine_learning.vertica.preprocessing.balance`
+    function streamlines the process, empowering users to enhance the
+    performance and fairness of their machine learning models trained
+    on imbalanced data.
+
+    To balance the dataset, use the following syntax.
+
+    .. code-block:: python
+
+        from verticapy.machine_learning.vertica.preprocessing import balance
+
+        balanced_train = balance(
+            name = "my_schema.train_balanced",
+            input_relation = train,
+            y = "Species",
+            method = "hybrid",
+        )
+
+    .. note::
+
+        With this code, a view named ``train_balanced`` is created in the
+        ``my_schema`` schema. It can then be used to train the model. In the
+        rest of the example, we will work with the full dataset.
+
+    .. hint::
+
+        Balancing the dataset is a crucial step in improving the accuracy
+        of machine learning models, particularly when faced with imbalanced
+        class distributions. By addressing disparities in the number of
+        instances across different classes, the model becomes more adept at
+        learning patterns from all classes rather than being biased towards
+        the majority class. This, in turn, enhances the model's ability to
+        make accurate predictions for under-represented classes. The balanced
+        dataset ensures that the model is not dominated by the majority class
+        and, as a result, leads to more robust and unbiased model performance.
+        Therefore, by employing techniques such as over-sampling, under-sampling,
+        or a combination of both during dataset preparation, practitioners can
+        significantly contribute to achieving higher accuracy and better
+        generalization of their machine learning models.
+
     Model Initialization
     ^^^^^^^^^^^^^^^^^^^^^