Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IMP: more verbose error handling for merge method use cases #64

Merged
merged 11 commits into from
May 1, 2024
43 changes: 34 additions & 9 deletions q2_metadata/_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,46 @@ def merge(metadata1: qiime2.Metadata,
n_overlapping_ids = len(overlapping_ids)
n_overlapping_columns = len(overlapping_columns)

if len(overlapping_ids) > 0 and len(overlapping_columns) > 0:
raise ValueError(f"Merging can currently handle overlapping ids "
f"or overlapping columns, but not both. "
f"{n_overlapping_ids} overlapping ids were "
f"identified ({', '.join(overlapping_ids)}) and"
f"{n_overlapping_columns} overlapping columns "
f"were identified {', '.join(overlapping_columns)}.")
if n_overlapping_ids and n_overlapping_columns:
raise ValueError(
"Merging can currently handle overlapping ids or overlapping "
f"columns but not both. {n_overlapping_ids} overlapping ids were "
f"identified ({', '.join(overlapping_ids)}) and "
f"{n_overlapping_columns} overlapping columns were identified "
f"({', '.join(overlapping_columns)})."
)

df1 = metadata1.to_dataframe()
df2 = metadata2.to_dataframe()

if n_overlapping_columns == 0:
if df1.index.name != df2.index.name:
raise ValueError(
"Metadata files contain different ID column names. "
f"Metadata1 file contains '{df1.index.name}' and metadata2 "
f"contains '{df2.index.name}'. These column names must match."
)

if not n_overlapping_columns:
result = pd.merge(df1, df2, how='outer', left_index=True,
right_index=True)
else: # i.e., n_overlapping_ids == 0

else:
for column in overlapping_columns:
if df1[column].dtype != df2[column].dtype:
column_type1 = type(
qiime2.Metadata(df1[[column]]).get_column(column))
column_type2 = type(
qiime2.Metadata(df2[[column]]).get_column(column))
raise ValueError(
f"Metadata files contain the shared column '{column}' "
"with different type designations. "
f"In 'metadata1', the column '{column}' is of type "
f"'{column_type1.__name__}', "
f"and in 'metadata2', it is of type "
f"'{column_type2.__name__}'. These type designations must "
"match."
)

result = pd.merge(df1, df2, how='outer', left_index=True,
right_index=True, suffixes=('', '_'))
for c in overlapping_columns:
Expand Down
59 changes: 55 additions & 4 deletions q2_metadata/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,6 @@ def test_merge_all_columns_overlapping(self):
columns=['col1', 'col2', 'col3']))

obs1 = merge(md1, md2)
print(obs1.to_dataframe())
hagenjp marked this conversation as resolved.
Show resolved Hide resolved

index_exp1 = pd.Index(['sample1', 'sample2', 'sample3',
'sample4', 'sample5', 'sample6'], name='id')
data_exp1 = [['a', 'd', 'h'],
Expand All @@ -135,8 +133,6 @@ def test_merge_all_columns_overlapping(self):
exp1 = qiime2.Metadata(
pd.DataFrame(data_exp1, index=index_exp1, dtype=object,
columns=['col1', 'col2', 'col3']))

print(exp1.to_dataframe())
self.assertEqual(obs1, exp1)

def test_merge_some_columns_overlapping(self):
Expand Down Expand Up @@ -201,3 +197,58 @@ def test_merge_no_samples_or_columns_overlapping(self):
'col4', 'col5', 'col6']))

self.assertEqual(obs1, exp1)

def test_merge_mismatched_columnID_names_in_error_message(self):
    """Merging metadata with differing ID column names raises ValueError.

    The first metadata names its index 'id' and the second names it
    'sample-id'. Neither sample IDs nor columns overlap. The raised
    message is expected to mention both index names, in that order.
    """
    left_frame = pd.DataFrame(
        [['a', 'd', 'h'],
         ['b', 'e', 'i'],
         ['c', 'f', 'j']],
        index=pd.Index(['sample1', 'sample2', 'sample3'], name='id'),
        columns=['col1', 'col2', 'col3'],
        dtype=object,
    )
    right_frame = pd.DataFrame(
        [['k', 'n', 'q'],
         ['l', 'o', 'r'],
         ['m', 'p', 's']],
        index=pd.Index(['sample4', 'sample5', 'sample6'],
                       name='sample-id'),
        columns=['col4', 'col5', 'col6'],
        dtype=object,
    )
    left_md = qiime2.Metadata(left_frame)
    right_md = qiime2.Metadata(right_frame)

    expected_message = (
        "Metadata files contain different ID column names.*id.*sample-id"
    )
    with self.assertRaisesRegex(ValueError, expected_message):
        merge(left_md, right_md)

def test_merge_mismatched_md_column_type_designations(self):
    """Merging a shared column with mismatched types raises ValueError.

    Both metadata objects share the columns 'col1', 'col2' and 'col3' and
    have disjoint sample IDs. 'col3' holds strings in the first metadata
    and floats in the second. The raised message is expected to name the
    column and report 'CategoricalMetadataColumn' for 'metadata1' and
    'NumericMetadataColumn' for 'metadata2'.
    """
    shared_columns = ['col1', 'col2', 'col3']

    categorical_frame = pd.DataFrame(
        [['a', 'd', 'h'],
         ['b', 'e', 'i'],
         ['c', 'f', 'j']],
        index=pd.Index(['sample1', 'sample2', 'sample3'], name='id'),
        dtype=object,
        columns=shared_columns,
    )
    # No dtype is forced here, so the float values in 'col3' keep their
    # inferred dtype.
    numeric_frame = pd.DataFrame(
        [['k', 'n', 40.0],
         ['l', 'o', 41.0],
         ['m', 'p', 42.0]],
        index=pd.Index(['sample4', 'sample5', 'sample6'], name='id'),
        columns=shared_columns,
    )
    categorical_md = qiime2.Metadata(categorical_frame)
    numeric_md = qiime2.Metadata(numeric_frame)

    expected_message = (
        "Metadata files contain the shared column 'col3' with different "
        "type designations. In 'metadata1', the column 'col3' is of type "
        r"\'CategoricalMetadataColumn\', and in 'metadata2', it is of "
        r"type \'NumericMetadataColumn\'. These type designations must "
        "match."
    )
    with self.assertRaisesRegex(ValueError, expected_message):
        merge(categorical_md, numeric_md)
Loading