Skip to content

Commit

Permalink
IMP: better error handling for merge method use cases (#64)
Browse files Browse the repository at this point in the history
  • Loading branch information
hagenjp authored May 1, 2024
1 parent 952133e commit 2766cad
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 13 deletions.
43 changes: 34 additions & 9 deletions q2_metadata/_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,46 @@ def merge(metadata1: qiime2.Metadata,
n_overlapping_ids = len(overlapping_ids)
n_overlapping_columns = len(overlapping_columns)

if len(overlapping_ids) > 0 and len(overlapping_columns) > 0:
raise ValueError(f"Merging can currently handle overlapping ids "
f"or overlapping columns, but not both. "
f"{n_overlapping_ids} overlapping ids were "
f"identified ({', '.join(overlapping_ids)}) and"
f"{n_overlapping_columns} overlapping columns "
f"were identified {', '.join(overlapping_columns)}.")
if n_overlapping_ids and n_overlapping_columns:
raise ValueError(
"Merging can currently handle overlapping ids or overlapping "
f"columns but not both. {n_overlapping_ids} overlapping ids were "
f"identified ({', '.join(overlapping_ids)}) and "
f"{n_overlapping_columns} overlapping columns were identified "
f"({', '.join(overlapping_columns)})."
)

df1 = metadata1.to_dataframe()
df2 = metadata2.to_dataframe()

if n_overlapping_columns == 0:
if df1.index.name != df2.index.name:
raise ValueError(
"Metadata files contain different ID column names. "
f"Metadata1 file contains '{df1.index.name}' and metadata2 "
f"contains '{df2.index.name}'. These column names must match."
)

if not n_overlapping_columns:
result = pd.merge(df1, df2, how='outer', left_index=True,
right_index=True)
else: # i.e., n_overlapping_ids == 0

else:
for column in overlapping_columns:
if df1[column].dtype != df2[column].dtype:
column_type1 = type(
qiime2.Metadata(df1[[column]]).get_column(column))
column_type2 = type(
qiime2.Metadata(df2[[column]]).get_column(column))
raise ValueError(
f"Metadata files contain the shared column '{column}' "
"with different type designations. "
f"In 'metadata1', the column '{column}' is of type "
f"'{column_type1.__name__}', "
f"and in 'metadata2', it is of type "
f"'{column_type2.__name__}'. These type designations must "
"match."
)

result = pd.merge(df1, df2, how='outer', left_index=True,
right_index=True, suffixes=('', '_'))
for c in overlapping_columns:
Expand Down
59 changes: 55 additions & 4 deletions q2_metadata/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,6 @@ def test_merge_all_columns_overlapping(self):
columns=['col1', 'col2', 'col3']))

obs1 = merge(md1, md2)
print(obs1.to_dataframe())

index_exp1 = pd.Index(['sample1', 'sample2', 'sample3',
'sample4', 'sample5', 'sample6'], name='id')
data_exp1 = [['a', 'd', 'h'],
Expand All @@ -135,8 +133,6 @@ def test_merge_all_columns_overlapping(self):
exp1 = qiime2.Metadata(
pd.DataFrame(data_exp1, index=index_exp1, dtype=object,
columns=['col1', 'col2', 'col3']))

print(exp1.to_dataframe())
self.assertEqual(obs1, exp1)

def test_merge_some_columns_overlapping(self):
Expand Down Expand Up @@ -201,3 +197,58 @@ def test_merge_no_samples_or_columns_overlapping(self):
'col4', 'col5', 'col6']))

self.assertEqual(obs1, exp1)

def test_merge_mismatched_columnID_names_in_error_message(self):
    """Merging must fail when the two ID column names differ.

    The first table's index is named 'id' and the second's is named
    'sample-id'. Neither sample ids nor columns overlap, so the only
    conflict is the index name; the ValueError raised by ``merge`` is
    expected to mention both names.
    """
    left_frame = pd.DataFrame(
        [['a', 'd', 'h'],
         ['b', 'e', 'i'],
         ['c', 'f', 'j']],
        index=pd.Index(['sample1', 'sample2', 'sample3'], name='id'),
        dtype=object,
        columns=['col1', 'col2', 'col3'],
    )
    right_frame = pd.DataFrame(
        [['k', 'n', 'q'],
         ['l', 'o', 'r'],
         ['m', 'p', 's']],
        index=pd.Index(['sample4', 'sample5', 'sample6'],
                       name='sample-id'),
        dtype=object,
        columns=['col4', 'col5', 'col6'],
    )
    left_md = qiime2.Metadata(left_frame)
    right_md = qiime2.Metadata(right_frame)

    # Regex: fixed prefix, then both index names in order.
    expected_pattern = (
        "Metadata files contain different ID column names.*id.*sample-id"
    )
    with self.assertRaisesRegex(ValueError, expected_pattern):
        merge(left_md, right_md)

def test_merge_mismatched_md_column_type_designations(self):
    """Merging must fail when a shared column has conflicting types.

    Both tables share 'col1', 'col2' and 'col3' and have disjoint sample
    ids. 'col3' holds strings in the first table (forced to object dtype)
    and floats in the second (dtype left for pandas to infer), so the
    ValueError raised by ``merge`` is expected to name 'col3' and the
    metadata column type seen on each side.
    """
    shared_columns = ['col1', 'col2', 'col3']

    categorical_frame = pd.DataFrame(
        [['a', 'd', 'h'],
         ['b', 'e', 'i'],
         ['c', 'f', 'j']],
        index=pd.Index(['sample1', 'sample2', 'sample3'], name='id'),
        dtype=object,
        columns=shared_columns,
    )
    # No dtype here on purpose: the third column must stay numeric.
    numeric_frame = pd.DataFrame(
        [['k', 'n', 40.0],
         ['l', 'o', 41.0],
         ['m', 'p', 42.0]],
        index=pd.Index(['sample4', 'sample5', 'sample6'], name='id'),
        columns=shared_columns,
    )
    categorical_md = qiime2.Metadata(categorical_frame)
    numeric_md = qiime2.Metadata(numeric_frame)

    expected_pattern = (
        "Metadata files contain the shared column 'col3' with different "
        "type designations. In 'metadata1', the column 'col3' is of type "
        r"\'CategoricalMetadataColumn\', and in 'metadata2', it is of "
        r"type \'NumericMetadataColumn\'. These type designations must "
        "match."
    )
    with self.assertRaisesRegex(ValueError, expected_pattern):
        merge(categorical_md, numeric_md)

0 comments on commit 2766cad

Please sign in to comment.