From ede0589668fd004c4c09c33e3bcf6d0da005e1d3 Mon Sep 17 00:00:00 2001 From: Badr Date: Sun, 29 Oct 2023 15:42:41 -0400 Subject: [PATCH 1/3] Correcting Cramer's V --- verticapy/core/vdataframe/_corr.py | 81 ++++++++++++++++-------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/verticapy/core/vdataframe/_corr.py b/verticapy/core/vdataframe/_corr.py index b39124a7f..c50fe8491 100755 --- a/verticapy/core/vdataframe/_corr.py +++ b/verticapy/core/vdataframe/_corr.py @@ -165,37 +165,12 @@ def _aggregate_matrix( elif method == "cramer": if columns[1] == columns[0]: return 1 - table_0_1 = f""" - SELECT - {columns[0]}, - {columns[1]}, - COUNT(*) AS nij - FROM {self} - WHERE {columns[0]} IS NOT NULL - AND {columns[1]} IS NOT NULL - GROUP BY 1, 2""" - table_0 = f""" - SELECT - {columns[0]}, - COUNT(*) AS ni - FROM {self} - WHERE {columns[0]} IS NOT NULL - AND {columns[1]} IS NOT NULL - GROUP BY 1""" - table_1 = f""" - SELECT - {columns[1]}, - COUNT(*) AS nj - FROM {self} - WHERE {columns[0]} IS NOT NULL - AND {columns[1]} IS NOT NULL - GROUP BY 1""" n, k, r = _executeSQL( query=f""" SELECT /*+LABEL('vDataframe._aggregate_matrix')*/ COUNT(*) AS n, - APPROXIMATE_COUNT_DISTINCT({columns[0]}) AS k, - APPROXIMATE_COUNT_DISTINCT({columns[1]}) AS r + COUNT(DISTINCT {columns[0]}) AS k, + COUNT(DISTINCT {columns[1]}) AS r FROM {self} WHERE {columns[0]} IS NOT NULL AND {columns[1]} IS NOT NULL""", @@ -205,17 +180,45 @@ def _aggregate_matrix( symbol=self._vars["symbol"], ) chi2 = f""" - SELECT /*+LABEL('vDataframe._aggregate_matrix')*/ - SUM((nij - ni * nj / {n}) * (nij - ni * nj / {n}) - / ((ni * nj) / {n})) AS chi2 - FROM + WITH all_categories AS ( + SELECT * FROM + (SELECT + DISTINCT {columns[0]} + FROM {self} WHERE {columns[0]} IS NOT NULL) table_0 + CROSS JOIN (SELECT - * - FROM ({table_0_1}) table_0_1 - LEFT JOIN ({table_0}) table_0 - ON table_0_1.{columns[0]} = table_0.{columns[0]}) x - LEFT JOIN ({table_1}) table_1 - ON x.{columns[1]} = table_1.{columns[1]}""" + DISTINCT {columns[1]} + FROM {self} WHERE {columns[1]} IS NOT NULL) table_1 + ), categories_counts AS ( + SELECT + {columns[0]}, + {columns[1]}, + COUNT(*) AS _nij + FROM {self} + WHERE + {columns[0]} IS NOT NULL + AND {columns[1]} IS NOT NULL + GROUP BY 1, 2 + ), contingent_table AS ( + SELECT + all_categories.{columns[0]}, + all_categories.{columns[1]}, + COALESCE(_nij, 0) AS _nij + FROM all_categories + LEFT JOIN categories_counts + ON all_categories.{columns[0]} = categories_counts.{columns[0]} + AND all_categories.{columns[1]} = categories_counts.{columns[1]} + ), expected_values AS ( + SELECT + _nij AS O, + SUM(_nij) OVER(PARTITION BY {columns[0]}) + * SUM(_nij) OVER(PARTITION BY {columns[1]}) + / SUM(_nij) OVER () AS E + FROM contingent_table + ) + SELECT + SUM((POWER(O - E, 2) / E)) + FROM expected_values;""" result = _executeSQL( chi2, title=( @@ -1122,8 +1125,8 @@ def corr_pvalue( k, r = _executeSQL( query=f""" SELECT /*+LABEL('vDataframe.corr_pvalue')*/ - APPROXIMATE_COUNT_DISTINCT({column1}) AS k, - APPROXIMATE_COUNT_DISTINCT({column2}) AS r + COUNT(DISTINCT {column1}) AS k, + COUNT(DISTINCT {column2}) AS r FROM {self} WHERE {column1} IS NOT NULL AND {column2} IS NOT NULL""", From f4b9c18b88bcfe92eac4819a53eac50c386db6eb Mon Sep 17 00:00:00 2001 From: Badr Date: Sun, 29 Oct 2023 16:02:50 -0400 Subject: [PATCH 2/3] Update _corr.py --- verticapy/core/vdataframe/_corr.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/verticapy/core/vdataframe/_corr.py b/verticapy/core/vdataframe/_corr.py index c50fe8491..c8d2fee30 100755 --- a/verticapy/core/vdataframe/_corr.py +++ b/verticapy/core/vdataframe/_corr.py @@ -179,7 +179,7 @@ def _aggregate_matrix( sql_push_ext=self._vars["sql_push_ext"], symbol=self._vars["symbol"], ) - chi2 = f""" + chi2_sql = f""" WITH all_categories AS ( SELECT * FROM (SELECT @@ -219,8 +219,8 @@ def _aggregate_matrix( SELECT SUM((POWER(O - E, 2) / E)) FROM expected_values;""" - result = _executeSQL( - chi2, + chi2 = _executeSQL( + chi2_sql, title=( f"Computing the CramerV correlation between {columns[0]} " f"and {columns[1]} (Chi2 Statistic)." @@ -229,13 +229,11 @@ def _aggregate_matrix( sql_push_ext=self._vars["sql_push_ext"], symbol=self._vars["symbol"], ) - if min(k - 1, r - 1) == 0: - result = np.nan - else: - result = float(math.sqrt(result / n / min(k - 1, r - 1))) - if result > 1 or result < 0: - result = np.nan - return result + phi2 = chi2 / n + phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + rcorr = r - ((r - 1) ** 2) / (n - 1) + kcorr = k - ((k - 1) ** 2) / (n - 1) + return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) elif method == "kendall": if columns[1] == columns[0]: return 1 From d985c857f02e10e6abc1957b47e98503b1e1a7f9 Mon Sep 17 00:00:00 2001 From: Badr Date: Sun, 29 Oct 2023 20:46:55 -0400 Subject: [PATCH 3/3] test correction --- verticapy/core/vdataframe/_corr.py | 5 ++++- .../tests/vDataFrame/test_vDF_correlation.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/verticapy/core/vdataframe/_corr.py b/verticapy/core/vdataframe/_corr.py index c8d2fee30..913d6ebe7 100755 --- a/verticapy/core/vdataframe/_corr.py +++ b/verticapy/core/vdataframe/_corr.py @@ -233,7 +233,10 @@ def _aggregate_matrix( phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) rcorr = r - ((r - 1) ** 2) / (n - 1) kcorr = k - ((k - 1) ** 2) / (n - 1) - return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) + div = min((kcorr - 1), (rcorr - 1)) + if div == 0: + return np.nan + return np.sqrt(phi2corr / div) elif method == "kendall": if columns[1] == columns[0]: return 1 diff --git a/verticapy/tests/vDataFrame/test_vDF_correlation.py b/verticapy/tests/vDataFrame/test_vDF_correlation.py index 8a72de8f2..f6d4f51ee 100755 --- a/verticapy/tests/vDataFrame/test_vDF_correlation.py +++ b/verticapy/tests/vDataFrame/test_vDF_correlation.py @@ -296,13 +296,13 @@ def test_vDF_corr(self, titanic_vd): ) # plt.close("all") assert result5["survived"][0] == 1.0 - assert result5["survived"][1] == pytest.approx(0.3358661117846154, 1e-2) - assert result5["survived"][2] == pytest.approx(0.18608072188932145, 1e-2) - assert result5["pclass"][0] == pytest.approx(0.3358661117846154, 1e-2) + assert result5["survived"][1] == pytest.approx(0.3335779245846321, 1e-2) + assert result5["survived"][2] == pytest.approx(0.1817365402078066, 1e-2) + assert result5["pclass"][0] == pytest.approx(0.3335779245846321, 1e-2) assert result5["pclass"][1] == 1.0 - assert result5["pclass"][2] == pytest.approx(0.27453049870161333, 1e-2) - assert result5["embarked"][0] == pytest.approx(0.18608072188932145, 1e-2) - assert result5["embarked"][1] == pytest.approx(0.27453049870161333, 1e-2) + assert result5["pclass"][2] == pytest.approx(0.27177620037368444, 1e-2) + assert result5["embarked"][0] == pytest.approx(0.1817365402078066, 1e-2) + assert result5["embarked"][1] == pytest.approx(0.27177620037368444, 1e-2) assert result5["embarked"][2] == 1.0 # testing vDataFrame.corr (method = 'cramer') with focus @@ -312,8 +312,8 @@ def test_vDF_corr(self, titanic_vd): show=False, ) # plt.close("all") - assert result5_f["survived"][1] == pytest.approx(0.5531019147439457, 1e-2) - assert result5_f["survived"][2] == pytest.approx(0.3358661117846154, 1e-2) + assert result5_f["survived"][1] == pytest.approx(0.46510584526020793, 1e-2) + assert result5_f["survived"][2] == pytest.approx(0.3335779245846321, 1e-2) # # DENSE SPEARMAN