Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IMP: more verbose error handling for merge method use cases #64

Merged
merged 11 commits into from
May 1, 2024
37 changes: 28 additions & 9 deletions q2_metadata/_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,40 @@ def merge(metadata1: qiime2.Metadata,
n_overlapping_ids = len(overlapping_ids)
n_overlapping_columns = len(overlapping_columns)

if len(overlapping_ids) > 0 and len(overlapping_columns) > 0:
raise ValueError(f"Merging can currently handle overlapping ids "
f"or overlapping columns, but not both. "
f"{n_overlapping_ids} overlapping ids were "
f"identified ({', '.join(overlapping_ids)}) and"
f"{n_overlapping_columns} overlapping columns "
f"were identified {', '.join(overlapping_columns)}.")
if n_overlapping_ids and n_overlapping_columns:
raise ValueError(
"Merging can currently handle overlapping ids or overlapping"
hagenjp marked this conversation as resolved.
Show resolved Hide resolved
f"but not both. {n_overlapping_ids} overlapping ids were "
f"identified ({', '.join(overlapping_ids)}) and"
hagenjp marked this conversation as resolved.
Show resolved Hide resolved
f"{n_overlapping_columns} overlapping columns were identified "
f"{', '.join(overlapping_columns)}."
hagenjp marked this conversation as resolved.
Show resolved Hide resolved
)

df1 = metadata1.to_dataframe()
df2 = metadata2.to_dataframe()

if n_overlapping_columns == 0:
if df1.index.name != df2.index.name:
raise ValueError(
"Metadata files contain different ID column names. "
f"Metadata1 file contains '{df1.index.name}' and metadata2 "
f"contains '{df2.index.name}'. These column names must match."
)

if not n_overlapping_columns:
result = pd.merge(df1, df2, how='outer', left_index=True,
right_index=True)
else: # i.e., n_overlapping_ids == 0

else:
for column in overlapping_columns:
if df1[column].dtype != df2[column].dtype:
raise ValueError(
"Metadata files contain identically named columns "
hagenjp marked this conversation as resolved.
Show resolved Hide resolved
f"with different data-types. The column {column} is of "
f"type {df1[column].dtype} in metadata1 and of type "
f"{df2[column].dtype} in metadata2. These data-types must "
"match."
)

result = pd.merge(df1, df2, how='outer', left_index=True,
right_index=True, suffixes=('', '_'))
for c in overlapping_columns:
Expand Down
54 changes: 50 additions & 4 deletions q2_metadata/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,6 @@ def test_merge_all_columns_overlapping(self):
columns=['col1', 'col2', 'col3']))

obs1 = merge(md1, md2)
print(obs1.to_dataframe())
hagenjp marked this conversation as resolved.
Show resolved Hide resolved

index_exp1 = pd.Index(['sample1', 'sample2', 'sample3',
'sample4', 'sample5', 'sample6'], name='id')
data_exp1 = [['a', 'd', 'h'],
Expand All @@ -135,8 +133,6 @@ def test_merge_all_columns_overlapping(self):
exp1 = qiime2.Metadata(
pd.DataFrame(data_exp1, index=index_exp1, dtype=object,
columns=['col1', 'col2', 'col3']))

print(exp1.to_dataframe())
self.assertEqual(obs1, exp1)

def test_merge_some_columns_overlapping(self):
Expand Down Expand Up @@ -201,3 +197,53 @@ def test_merge_no_samples_or_columns_overlapping(self):
'col4', 'col5', 'col6']))

self.assertEqual(obs1, exp1)

def test_invalid_index_name_in_error_message(self):
    """Merging metadata whose ID column names differ raises ValueError.

    The first metadata names its index ``id`` and the second names it
    ``sample-id``; neither sample IDs nor column names overlap. The
    error message is expected to mention both index names.
    """
    frame_a = pd.DataFrame(
        [['a', 'd', 'h'],
         ['b', 'e', 'i'],
         ['c', 'f', 'j']],
        index=pd.Index(['sample1', 'sample2', 'sample3'], name='id'),
        dtype=object,
        columns=['col1', 'col2', 'col3'],
    )
    md1 = qiime2.Metadata(frame_a)

    frame_b = pd.DataFrame(
        [['k', 'n', 'q'],
         ['l', 'o', 'r'],
         ['m', 'p', 's']],
        index=pd.Index(['sample4', 'sample5', 'sample6'],
                       name='sample-id'),
        dtype=object,
        columns=['col4', 'col5', 'col6'],
    )
    md2 = qiime2.Metadata(frame_b)

    # Both index names must show up after the fixed message prefix.
    expected_pattern = (
        "Metadata files contain different ID column names.*id.*sample-id"
    )
    with self.assertRaisesRegex(ValueError, expected_pattern):
        merge(md1, md2)

def test_merge_file_name_in_error_message_float_in_categorical_md(self):
    """Same-named columns with different data types raise ValueError.

    Both metadata share the ID name ``id`` and the column names
    ``col1``..``col3`` but have disjoint sample IDs. The first frame is
    built with ``dtype=object``; the second forces no dtype and holds
    floats in ``col3``. The error message is expected to name ``col3``
    together with the ``object`` and ``float`` types.
    """
    shared_columns = ['col1', 'col2', 'col3']

    categorical_frame = pd.DataFrame(
        [['a', 'd', 'h'],
         ['b', 'e', 'i'],
         ['c', 'f', 'j']],
        index=pd.Index(['sample1', 'sample2', 'sample3'], name='id'),
        dtype=object,
        columns=shared_columns,
    )
    md1 = qiime2.Metadata(categorical_frame)

    # No dtype is passed here, so the float values in col3 are left to
    # pandas' own type inference rather than being stored as object.
    mixed_frame = pd.DataFrame(
        [['k', 'n', 40.0],
         ['l', 'o', 41.0],
         ['m', 'p', 42.0]],
        index=pd.Index(['sample4', 'sample5', 'sample6'], name='id'),
        columns=shared_columns,
    )
    md2 = qiime2.Metadata(mixed_frame)

    expected_pattern = (
        "Metadata files contain identically named columns.*col3.*object"
        ".*float"
    )
    with self.assertRaisesRegex(ValueError, expected_pattern):
        merge(md1, md2)
Loading