Correcting Cramer's V (#783)
* Correcting Cramer's V

* Update _corr.py

* test correction
oualib authored Oct 30, 2023
1 parent 7890f01 commit e929cda
Showing 2 changed files with 61 additions and 57 deletions.
102 changes: 53 additions & 49 deletions verticapy/core/vdataframe/_corr.py
@@ -165,37 +165,12 @@ def _aggregate_matrix(
         elif method == "cramer":
             if columns[1] == columns[0]:
                 return 1
-            table_0_1 = f"""
-                SELECT
-                    {columns[0]},
-                    {columns[1]},
-                    COUNT(*) AS nij
-                FROM {self}
-                WHERE {columns[0]} IS NOT NULL
-                  AND {columns[1]} IS NOT NULL
-                GROUP BY 1, 2"""
-            table_0 = f"""
-                SELECT
-                    {columns[0]},
-                    COUNT(*) AS ni
-                FROM {self}
-                WHERE {columns[0]} IS NOT NULL
-                  AND {columns[1]} IS NOT NULL
-                GROUP BY 1"""
-            table_1 = f"""
-                SELECT
-                    {columns[1]},
-                    COUNT(*) AS nj
-                FROM {self}
-                WHERE {columns[0]} IS NOT NULL
-                  AND {columns[1]} IS NOT NULL
-                GROUP BY 1"""
             n, k, r = _executeSQL(
                 query=f"""
                     SELECT /*+LABEL('vDataframe._aggregate_matrix')*/
                         COUNT(*) AS n,
-                        APPROXIMATE_COUNT_DISTINCT({columns[0]}) AS k,
-                        APPROXIMATE_COUNT_DISTINCT({columns[1]}) AS r
+                        COUNT(DISTINCT {columns[0]}) AS k,
+                        COUNT(DISTINCT {columns[1]}) AS r
                     FROM {self}
                     WHERE {columns[0]} IS NOT NULL
                       AND {columns[1]} IS NOT NULL""",
@@ -204,20 +179,48 @@ def _aggregate_matrix(
                 sql_push_ext=self._vars["sql_push_ext"],
                 symbol=self._vars["symbol"],
             )
-            chi2 = f"""
-                SELECT /*+LABEL('vDataframe._aggregate_matrix')*/
-                    SUM((nij - ni * nj / {n}) * (nij - ni * nj / {n})
-                        / ((ni * nj) / {n})) AS chi2
-                FROM
+            chi2_sql = f"""
+                WITH all_categories AS (
+                    SELECT * FROM
                     (SELECT
-                        *
-                     FROM ({table_0_1}) table_0_1
-                     LEFT JOIN ({table_0}) table_0
-                     ON table_0_1.{columns[0]} = table_0.{columns[0]}) x
-                     LEFT JOIN ({table_1}) table_1
-                     ON x.{columns[1]} = table_1.{columns[1]}"""
-            result = _executeSQL(
-                chi2,
+                        DISTINCT {columns[0]}
+                    FROM {self} WHERE {columns[0]} IS NOT NULL) table_0
+                    CROSS JOIN
+                    (SELECT
+                        DISTINCT {columns[1]}
+                    FROM {self} WHERE {columns[1]} IS NOT NULL) table_1
+                ), categories_counts AS (
+                    SELECT
+                        {columns[0]},
+                        {columns[1]},
+                        COUNT(*) AS _nij
+                    FROM {self}
+                    WHERE
+                        {columns[0]} IS NOT NULL
+                        AND {columns[1]} IS NOT NULL
+                    GROUP BY 1, 2
+                ), contingent_table AS (
+                    SELECT
+                        all_categories.{columns[0]},
+                        all_categories.{columns[1]},
+                        COALESCE(_nij, 0) AS _nij
+                    FROM all_categories
+                    LEFT JOIN categories_counts
+                    ON all_categories.{columns[0]} = categories_counts.{columns[0]}
+                    AND all_categories.{columns[1]} = categories_counts.{columns[1]}
+                ), expected_values AS (
+                    SELECT
+                        _nij AS O,
+                        SUM(_nij) OVER(PARTITION BY {columns[0]})
+                        * SUM(_nij) OVER(PARTITION BY {columns[1]})
+                        / SUM(_nij) OVER () AS E
+                    FROM contingent_table
+                )
+                SELECT
+                    SUM((POWER(O - E, 2) / E))
+                FROM expected_values;"""
+            chi2 = _executeSQL(
+                chi2_sql,
                 title=(
                     f"Computing the CramerV correlation between {columns[0]} "
                     f"and {columns[1]} (Chi2 Statistic)."
@@ -226,13 +229,14 @@ def _aggregate_matrix(
                 sql_push_ext=self._vars["sql_push_ext"],
                 symbol=self._vars["symbol"],
             )
-            if min(k - 1, r - 1) == 0:
-                result = np.nan
-            else:
-                result = float(math.sqrt(result / n / min(k - 1, r - 1)))
-                if result > 1 or result < 0:
-                    result = np.nan
-            return result
+            phi2 = chi2 / n
+            phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
+            rcorr = r - ((r - 1) ** 2) / (n - 1)
+            kcorr = k - ((k - 1) ** 2) / (n - 1)
+            div = min((kcorr - 1), (rcorr - 1))
+            if div == 0:
+                return np.nan
+            return np.sqrt(phi2corr / div)
         elif method == "kendall":
             if columns[1] == columns[0]:
                 return 1
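
The Python side then turns chi-square into the bias-corrected Cramér's V: phi^2 = chi^2 / n is shrunk by (k - 1)(r - 1)/(n - 1), the category counts k and r get the matching correction, and the result is NaN when either corrected dimension collapses to 1. A standalone sketch of the same formula on a toy table, with scipy used only to obtain the chi-square statistic (again illustrative, not part of the commit):

# Standalone sketch of the bias-corrected Cramer's V computed above; scipy is
# used only to obtain the chi2 statistic, and the table is illustrative.
import numpy as np
from scipy.stats import chi2_contingency

observed = np.array([[25, 5, 10], [10, 15, 35]])
chi2, _, _, _ = chi2_contingency(observed, correction=False)

n = observed.sum()
r, k = observed.shape  # number of categories of each column

phi2 = chi2 / n
phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))  # shrunk phi^2
kcorr = k - (k - 1) ** 2 / (n - 1)                       # corrected column count
rcorr = r - (r - 1) ** 2 / (n - 1)                       # corrected row count

div = min(kcorr - 1, rcorr - 1)
cramers_v = float("nan") if div == 0 else float(np.sqrt(phi2corr / div))
print(cramers_v)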
@@ -1122,8 +1126,8 @@ def corr_pvalue(
             k, r = _executeSQL(
                 query=f"""
                     SELECT /*+LABEL('vDataframe.corr_pvalue')*/
-                        APPROXIMATE_COUNT_DISTINCT({column1}) AS k,
-                        APPROXIMATE_COUNT_DISTINCT({column2}) AS r
+                        COUNT(DISTINCT {column1}) AS k,
+                        COUNT(DISTINCT {column2}) AS r
                     FROM {self}
                     WHERE {column1} IS NOT NULL
                       AND {column2} IS NOT NULL""",
16 changes: 8 additions & 8 deletions verticapy/tests/vDataFrame/test_vDF_correlation.py
@@ -296,13 +296,13 @@ def test_vDF_corr(self, titanic_vd):
         )
         # plt.close("all")
         assert result5["survived"][0] == 1.0
-        assert result5["survived"][1] == pytest.approx(0.3358661117846154, 1e-2)
-        assert result5["survived"][2] == pytest.approx(0.18608072188932145, 1e-2)
-        assert result5["pclass"][0] == pytest.approx(0.3358661117846154, 1e-2)
+        assert result5["survived"][1] == pytest.approx(0.3335779245846321, 1e-2)
+        assert result5["survived"][2] == pytest.approx(0.1817365402078066, 1e-2)
+        assert result5["pclass"][0] == pytest.approx(0.3335779245846321, 1e-2)
         assert result5["pclass"][1] == 1.0
-        assert result5["pclass"][2] == pytest.approx(0.27453049870161333, 1e-2)
-        assert result5["embarked"][0] == pytest.approx(0.18608072188932145, 1e-2)
-        assert result5["embarked"][1] == pytest.approx(0.27453049870161333, 1e-2)
+        assert result5["pclass"][2] == pytest.approx(0.27177620037368444, 1e-2)
+        assert result5["embarked"][0] == pytest.approx(0.1817365402078066, 1e-2)
+        assert result5["embarked"][1] == pytest.approx(0.27177620037368444, 1e-2)
         assert result5["embarked"][2] == 1.0

         # testing vDataFrame.corr (method = 'cramer') with focus
@@ -312,8 +312,8 @@ def test_vDF_corr(self, titanic_vd):
             show=False,
         )
         # plt.close("all")
-        assert result5_f["survived"][1] == pytest.approx(0.5531019147439457, 1e-2)
-        assert result5_f["survived"][2] == pytest.approx(0.3358661117846154, 1e-2)
+        assert result5_f["survived"][1] == pytest.approx(0.46510584526020793, 1e-2)
+        assert result5_f["survived"][2] == pytest.approx(0.3335779245846321, 1e-2)

         #
         # DENSE SPEARMAN
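
For context, a hypothetical reproduction of the calls these assertions exercise; the column list and the corr() arguments (columns, method, focus, show) are assumptions inferred from the surrounding test code and VerticaPy's public API, not part of the commit:

# Hypothetical usage sketch matching the updated test expectations.
from verticapy.datasets import load_titanic

titanic_vd = load_titanic()

# Bias-corrected Cramer's V matrix between three categorical columns.
result5 = titanic_vd.corr(
    columns=["survived", "pclass", "embarked"],
    method="cramer",
    show=False,
)

# Same statistic focused on a single column against the others.
result5_f = titanic_vd.corr(
    columns=["survived", "pclass", "embarked"],
    method="cramer",
    focus="survived",
    show=False,
)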
