diff --git a/verticapy/machine_learning/model_selection/statistical_tests/norm.py b/verticapy/machine_learning/model_selection/statistical_tests/norm.py
index 4ca8c18b0..25c4d32f8 100755
--- a/verticapy/machine_learning/model_selection/statistical_tests/norm.py
+++ b/verticapy/machine_learning/model_selection/statistical_tests/norm.py
@@ -43,6 +43,7 @@ def jarque_bera(input_relation: SQLRelation, column: str) -> tuple[float, float]
    -------
    tuple
        statistic, p_value
+
    """
    if isinstance(input_relation, vDataFrame):
        vdf = input_relation.copy()
@@ -93,8 +94,8 @@ def kurtosistest(input_relation: SQLRelation, column: str) -> tuple[float, float]
 @save_verticapy_logs
 def normaltest(input_relation: SQLRelation, column: str) -> tuple[float, float]:
     """
-    Test whether a sample differs from a normal
-    distribution.
+    This function tests the null hypothesis that a
+    sample comes from a normal distribution.

     Parameters
     ----------
@@ -107,6 +108,161 @@ def normaltest(input_relation: SQLRelation, column: str) -> tuple[float, float]:
    -------
    tuple
        statistic, p_value

    Examples
    ---------

    Let's try this test on two datasets to observe the contrast in the test results:

    - a normally distributed dataset
    - a uniformly distributed dataset

    Normal Distribution
    ^^^^^^^^^^^^^^^^^^^^^

    Import the necessary libraries:

    .. code-block:: python

        import verticapy as vp
        import numpy as np
        import random

    .. ipython:: python
        :suppress:

        import verticapy as vp
        import numpy as np
        import random
        N = 100
        mean = 0
        std_dev = 1
        data = np.random.normal(mean, std_dev, N)

    Then we can define the basic parameters of the normal distribution:

    .. code-block:: python

        # Distribution parameters
        N = 100  # Number of rows
        mean = 0
        std_dev = 1

        # Dataset
        data = np.random.normal(mean, std_dev, N)

    Now we can create the ``vDataFrame``:

    .. ipython:: python

        vdf = vp.vDataFrame({"col": data})

    We can visualize the distribution:

    .. code-block:: python

        vdf["col"].hist()

    .. ipython:: python
        :suppress:

        vp.set_option("plotting_lib", "plotly")
        fig = vdf["col"].hist()
        fig.write_html("figures/plotting_machine_learning_model_selection_norm_normaltest_1.html")

    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_norm_normaltest_1.html

    To obtain the test p-value, we can import the test function:

    .. ipython:: python

        from verticapy.machine_learning.model_selection.statistical_tests import normaltest

    And simply apply it to the ``vDataFrame``:

    .. ipython:: python

        normaltest(vdf, column = "col")

    We can see that the p-value is high, meaning that we cannot reject the null hypothesis.

    .. note::

        A ``p_value`` in statistics represents the probability of obtaining results as extreme as, or more extreme than, the observed data, assuming the null hypothesis is true. A *smaller* p-value typically suggests stronger evidence against the null hypothesis, i.e. that the sample does not come from a normal distribution.

        However, *small* is a relative term, and the threshold that counts as "small" should be chosen before analyzing the data.

        Generally, a ``p-value`` below 0.05 is considered the threshold for rejecting the null hypothesis, but this is not always the case -
        `read more `_
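    As a purely illustrative sketch (the 0.05 threshold and the printed messages are assumptions, not part of the API), the returned tuple can be unpacked and compared against a significance level chosen in advance:

    .. code-block:: python

        # Hypothetical decision rule: alpha is a conventional choice made
        # before looking at the data, not a value returned by the test.
        statistic, p_value = normaltest(vdf, column = "col")
        alpha = 0.05
        if p_value < alpha:
            print("Reject H0: the sample is unlikely to come from a normal distribution.")
        else:
            print("Cannot reject H0: no evidence against normality.")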
    Uniform Distribution
    ^^^^^^^^^^^^^^^^^^^^^

    .. ipython:: python
        :suppress:

        low = 0
        high = 1
        data = np.random.uniform(low, high, N)
        vdf = vp.vDataFrame({"col": data})

    We can define the basic parameters of the uniform distribution:

    .. code-block:: python

        # Distribution parameters
        low = 0
        high = 1

        # Dataset
        data = np.random.uniform(low, high, N)

        # vDataFrame
        vdf = vp.vDataFrame({"col": data})

    We can visualize the distribution:

    .. code-block:: python

        vdf["col"].hist()

    .. ipython:: python
        :suppress:

        fig = vdf["col"].hist()
        fig.write_html("figures/plotting_machine_learning_model_selection_norm_normaltest_2.html")

    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_norm_normaltest_2.html

    And simply apply the test to this ``vDataFrame``:

    .. ipython:: python

        normaltest(vdf, column = "col")

    In this case, the p-value is quite low, meaning that the data is most likely not normally distributed.
    """
    if isinstance(input_relation, vDataFrame):
        vdf = input_relation.copy()
diff --git a/verticapy/machine_learning/model_selection/statistical_tests/ols.py b/verticapy/machine_learning/model_selection/statistical_tests/ols.py
index 6a4ed036c..39b8496d8 100755
--- a/verticapy/machine_learning/model_selection/statistical_tests/ols.py
+++ b/verticapy/machine_learning/model_selection/statistical_tests/ols.py
@@ -58,6 +58,235 @@ def het_breuschpagan(
    tuple
        Lagrange Multiplier statistic, LM pvalue,
        F statistic, F pvalue

    Examples
    ---------

    Initialization
    ^^^^^^^^^^^^^^^

    Let's try this test on a dummy dataset that has the following elements:

    - x (a predictor)
    - y (the response)
    - Random noise

    .. note::

        This test requires ``eps``, which represents the difference between the true value and the predicted value, i.e. the residual. If you already have ``eps`` available, you can use it directly instead of recomputing it, as demonstrated in the example below.

    Before we begin, we can import the necessary libraries:

    .. ipython:: python

        import verticapy as vp
        import numpy as np
        from verticapy.learn.linear_model import LinearRegression

    Example 1: Homoscedasticity
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Next, we can create some values with random noise:

    .. ipython:: python

        y_vals = [0, 2, 4, 6, 8, 10] + np.random.normal(0, 0.4, 6)

    We can use those values to create the ``vDataFrame``:

    .. ipython:: python

        vdf = vp.vDataFrame(
            {
                "x": [0, 1, 2, 3, 4, 5],
                "y": y_vals,
            }
        )

    We can initialize a regression model:

    .. ipython:: python

        model = LinearRegression()

    Fit that model on the dataset:

    .. ipython:: python

        model.fit(input_relation = vdf, X = "x", y = "y")

    We can create a column in the ``vDataFrame`` that has the predictions:

    .. ipython:: python

        model.predict(vdf, X = "x", name = "y_pred")

    Then we can compute the residuals, i.e. ``eps``:

    .. ipython:: python

        vdf["eps"] = vdf["y"] - vdf["y_pred"]

    We can plot the residuals to see the trend:

    .. code-block:: python

        vdf.scatter(["x", "eps"])

    .. ipython:: python
        :suppress:

        vp.set_option("plotting_lib", "plotly")
        fig = vdf.scatter(["x", "eps"])
        fig.write_html("figures/plotting_machine_learning_model_selection_ols_het_breuschpagan.html")

    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_ols_het_breuschpagan.html

    Notice the randomness of the residuals with respect to x. This shows that the noise is homoscedastic.
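    Before running the test itself, it may help to see roughly what Breusch-Pagan computes. The following is a ``numpy``-only sketch of one common formulation (an auxiliary regression of the squared residuals on the predictors, with LM = n * R^2); it is an illustration of the idea on made-up data, not VerticaPy's SQL implementation:

    .. code-block:: python

        import numpy as np
        from scipy.stats import chi2

        # Made-up data with the same shape as the example above.
        x = np.arange(6, dtype=float)
        y = 2 * x + np.random.normal(0, 0.4, 6)

        # Ordinary least squares fit and residuals (eps).
        X = np.column_stack([np.ones_like(x), x])
        beta, *_ = np.linalg.lstsq(X, y, rcond=None)
        eps = y - X @ beta

        # Auxiliary regression: squared residuals on the same regressors.
        gamma, *_ = np.linalg.lstsq(X, eps ** 2, rcond=None)
        fitted = X @ gamma
        ss_res = np.sum((eps ** 2 - fitted) ** 2)
        ss_tot = np.sum((eps ** 2 - np.mean(eps ** 2)) ** 2)
        r_squared = 1 - ss_res / ss_tot

        # LM statistic and its chi-squared p-value (df = number of regressors, here 1).
        lm_statistic = len(x) * r_squared
        lm_pvalue = chi2.sf(lm_statistic, df=1)
        print(lm_statistic, lm_pvalue)

    A large LM value (and hence a small p-value) indicates that the squared residuals are explained by ``x``, i.e. heteroscedasticity.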
    To run the test, we can import the test function:

    .. ipython:: python

        from verticapy.stats import het_breuschpagan

    And simply apply it to the ``vDataFrame``:

    .. ipython:: python

        lm_statistic, lm_pvalue, f_statistic, f_pvalue = het_breuschpagan(vdf, eps = "eps", X = "x")

    .. ipython:: python

        print(lm_statistic, lm_pvalue, f_statistic, f_pvalue)

    Since the noise is not heteroscedastic, we get high p-values and low statistic values.

    .. note::

        A ``p_value`` in statistics represents the probability of obtaining results as extreme as, or more extreme than, the observed data, assuming the null hypothesis is true. A *smaller* p-value typically suggests stronger evidence against the null hypothesis, i.e., in this case, evidence that the noise is heteroscedastic.

        However, *small* is a relative term, and the threshold that counts as "small" should be chosen before analyzing the data.

        Generally, a ``p-value`` below 0.05 is considered the threshold for rejecting the null hypothesis, but this is not always the case -
        `read more `_

    .. note::

        The F-statistic tests the overall significance of a model, while the LM statistic tests the validity of linear restrictions on the model parameters. In this case, high values indicate heteroscedastic noise.

    Example 2: Heteroscedasticity
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    We can contrast the above result with a dataset that has **heteroscedastic noise**:

    .. ipython:: python

        # y values
        y_vals = np.array([0, 2, 4, 6, 8, 10])

        # Adding some heteroscedastic noise
        y_vals = y_vals + [0.5, 0.3, 0.2, 0.1, 0.05, 0]

    .. ipython:: python

        vdf = vp.vDataFrame(
            {
                "x": [0, 1, 2, 3, 4, 5],
                "y": y_vals,
            }
        )

    We can initialize a regression model:

    .. ipython:: python

        model = LinearRegression()

    Fit that model on the dataset:

    .. ipython:: python

        model.fit(input_relation = vdf, X = "x", y = "y")

    We can create a column in the ``vDataFrame`` that has the predictions:

    .. ipython:: python

        model.predict(vdf, X = "x", name = "y_pred")

    Then we can compute the residuals, i.e. ``eps``:

    .. ipython:: python

        vdf["eps"] = vdf["y"] - vdf["y_pred"]

    We can plot the residuals to see the trend:

    .. code-block:: python

        vdf.scatter(["x", "eps"])

    .. ipython:: python
        :suppress:

        fig = vdf.scatter(["x", "eps"])
        fig.write_html("figures/plotting_machine_learning_model_selection_ols_het_breuschpagan_2.html")

    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_ols_het_breuschpagan_2.html

    Notice the pattern of the residuals with respect to x. This shows that the noise is heteroscedastic.

    Now we can perform the test on this dataset:

    .. ipython:: python

        lm_statistic, lm_pvalue, f_statistic, f_pvalue = het_breuschpagan(vdf, eps = "eps", X = "x")

    .. ipython:: python

        print(lm_statistic, lm_pvalue, f_statistic, f_pvalue)

    Notice the contrast between the two test results. In this dataset the noise is heteroscedastic, so we get very low p-values and higher statistic values, confirming that the noise is indeed heteroscedastic.

    For more information, check out
    `this link `_.
    """
    if isinstance(input_relation, vDataFrame):
        vdf = input_relation.copy()
diff --git a/verticapy/machine_learning/model_selection/statistical_tests/tsa.py b/verticapy/machine_learning/model_selection/statistical_tests/tsa.py
index dbb2f958d..8bb415d6a 100755
--- a/verticapy/machine_learning/model_selection/statistical_tests/tsa.py
+++ b/verticapy/machine_learning/model_selection/statistical_tests/tsa.py
@@ -273,12 +273,14 @@ def mkt(
     """
     Mann Kendall test (Time Series trend).
 
-    \u26A0 Warning : This Test is computationally expensive
-                   because it uses a CROSS JOIN during the
-                   computation. The complexity is O(n * k),
-                   n being the total count of the vDataFrame
-                   and k the number of rows to use to do the
-                   test.
+    .. warning::
+
+        This test is computationally expensive
+        because it uses a CROSS JOIN during the
+        computation. The complexity is O(n * k),
+        n being the total count of the vDataFrame
+        and k the number of rows used to do the
+        test.
 
     Parameters
     ----------
@@ -297,6 +299,141 @@ def mkt(
    -------
    TableSample
        result of the test.

    Examples
    ---------

    Initialization
    ^^^^^^^^^^^^^^^

    Let's try this test on a dummy dataset that has the following elements:

    - A value of interest
    - Time-stamp data

    Before we begin, we can import the necessary libraries:

    .. ipython:: python

        import verticapy as vp

    Example 1: Trend
    ^^^^^^^^^^^^^^^^^

    Now we can create the dummy dataset:

    .. ipython:: python

        vdf = vp.vDataFrame(
            {
                "X": [0, 1, 2, 3, 4, 5, 6],
                "year": [1990, 1991, 1992, 1993, 1994, 1995, 1996],
            }
        )

    We can visually inspect the trend by drawing the appropriate graph:

    .. code-block:: python

        vdf["X"].plot(ts="year")

    .. ipython:: python
        :suppress:

        vp.set_option("plotting_lib", "plotly")
        fig = vdf["X"].plot(ts="year")
        fig.write_html("figures/plotting_machine_learning_model_selection_tsa_mkt.html")

    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_tsa_mkt.html

    Though the increasing trend is obvious, we can confirm it with ``mkt`` by first importing the function:

    .. ipython:: python

        from verticapy.stats import mkt

    And then simply applying it to the ``vDataFrame``:

    .. ipython:: python

        mkt(vdf, column = "X", ts = "year")

    In the above context, the low p-value is evidence of the presence of a trend. The function also gives us information about the nature of the trend: in this case, we can see that it is a monotonically increasing trend, which agrees with the plot we observed above.

    .. note::

        A ``p_value`` in statistics represents the probability of obtaining results as extreme as, or more extreme than, the observed data, assuming the null hypothesis is true. A *smaller* p-value typically suggests stronger evidence against the null hypothesis, i.e., in this case, evidence that the data does have a trend with respect to time.

        However, *small* is a relative term, and the threshold that counts as "small" should be chosen before analyzing the data.

        Generally, a ``p-value`` below 0.05 is considered the threshold for rejecting the null hypothesis, but this is not always the case -
        `read more `_

    Example 2: No Trend
    ^^^^^^^^^^^^^^^^^^^^

    We can contrast the results with a dataset that has barely any trend:
    .. ipython:: python

        vdf = vp.vDataFrame(
            {
                "X": [1, 1, 1, 1, 1, 1, 1],
                "year": [1990, 1991, 1992, 1993, 1994, 1995, 1996],
            }
        )

    We can visually inspect the absence of trend by drawing the appropriate graph:

    .. code-block:: python

        vdf["X"].plot(ts="year")

    .. ipython:: python
        :suppress:

        fig = vdf["X"].plot(ts="year")
        fig.write_html("figures/plotting_machine_learning_model_selection_tsa_mkt_2.html")

    .. raw:: html
        :file: SPHINX_DIRECTORY/figures/plotting_machine_learning_model_selection_tsa_mkt_2.html

    Now we can perform the test on this dataset:

    .. ipython:: python

        mkt(vdf, column = "X", ts = "year")

    This time the p-value does not provide evidence against the null hypothesis, which is consistent with the absence of any trend in the data.

    For more information, check out
    `this link `_.
    """
    if isinstance(input_relation, vDataFrame):
        vdf = input_relation.copy()
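As an aside on the CROSS JOIN warning above: the Mann-Kendall statistic is built from every pair of observations, which is what makes the test O(n * k). Here is a rough ``numpy`` sketch of the pairwise S statistic and its normal approximation on made-up data; it illustrates the idea only and is not VerticaPy's implementation:

.. code-block:: python

    import numpy as np
    from scipy.stats import norm

    # Made-up series: the same increasing values as in Example 1 above.
    x = np.array([0, 1, 2, 3, 4, 5, 6], dtype=float)
    n = len(x)

    # S sums the signs of all pairwise differences; comparing every row with
    # every other row is why the SQL implementation needs a CROSS JOIN.
    s = sum(np.sign(x[j] - x[i]) for i in range(n - 1) for j in range(i + 1, n))

    # Variance of S when there are no ties in the data.
    var_s = n * (n - 1) * (2 * n + 5) / 18

    # Continuity-corrected Z statistic and a two-sided p-value.
    if s > 0:
        z = (s - 1) / np.sqrt(var_s)
    elif s < 0:
        z = (s + 1) / np.sqrt(var_s)
    else:
        z = 0.0
    p_value = 2 * norm.sf(abs(z))
    print(s, z, p_value)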