diff --git a/q2_metadata/_merge.py b/q2_metadata/_merge.py index 82a2e8c..0497e46 100644 --- a/q2_metadata/_merge.py +++ b/q2_metadata/_merge.py @@ -19,21 +19,46 @@ def merge(metadata1: qiime2.Metadata, n_overlapping_ids = len(overlapping_ids) n_overlapping_columns = len(overlapping_columns) - if len(overlapping_ids) > 0 and len(overlapping_columns) > 0: - raise ValueError(f"Merging can currently handle overlapping ids " - f"or overlapping columns, but not both. " - f"{n_overlapping_ids} overlapping ids were " - f"identified ({', '.join(overlapping_ids)}) and" - f"{n_overlapping_columns} overlapping columns " - f"were identified {', '.join(overlapping_columns)}.") + if n_overlapping_ids and n_overlapping_columns: + raise ValueError( + "Merging can currently handle overlapping ids or overlapping " + f"columns but not both. {n_overlapping_ids} overlapping ids were " + f"identified ({', '.join(overlapping_ids)}) and " + f"{n_overlapping_columns} overlapping columns were identified " + f"({', '.join(overlapping_columns)})." + ) df1 = metadata1.to_dataframe() df2 = metadata2.to_dataframe() - if n_overlapping_columns == 0: + if df1.index.name != df2.index.name: + raise ValueError( + "Metadata files contain different ID column names. " + f"Metadata1 file contains '{df1.index.name}' and metadata2 " + f"contains '{df2.index.name}'. These column names must match." + ) + + if not n_overlapping_columns: result = pd.merge(df1, df2, how='outer', left_index=True, right_index=True) - else: # i.e., n_overlapping_ids == 0 + + else: + for column in overlapping_columns: + if df1[column].dtype != df2[column].dtype: + column_type1 = type( + qiime2.Metadata(df1[[column]]).get_column(column)) + column_type2 = type( + qiime2.Metadata(df2[[column]]).get_column(column)) + raise ValueError( + f"Metadata files contain the shared column '{column}' " + "with different type designations. " + f"In 'metadata1', the column '{column}' is of type " + f"'{column_type1.__name__}', " + f"and in 'metadata2', it is of type " + f"'{column_type2.__name__}'. These type designations must " + "match." + ) + result = pd.merge(df1, df2, how='outer', left_index=True, right_index=True, suffixes=('', '_')) for c in overlapping_columns: diff --git a/q2_metadata/tests/test_merge.py b/q2_metadata/tests/test_merge.py index b0c3a79..8f82e1e 100644 --- a/q2_metadata/tests/test_merge.py +++ b/q2_metadata/tests/test_merge.py @@ -122,8 +122,6 @@ def test_merge_all_columns_overlapping(self): columns=['col1', 'col2', 'col3'])) obs1 = merge(md1, md2) - print(obs1.to_dataframe()) - index_exp1 = pd.Index(['sample1', 'sample2', 'sample3', 'sample4', 'sample5', 'sample6'], name='id') data_exp1 = [['a', 'd', 'h'], @@ -135,8 +133,6 @@ def test_merge_all_columns_overlapping(self): exp1 = qiime2.Metadata( pd.DataFrame(data_exp1, index=index_exp1, dtype=object, columns=['col1', 'col2', 'col3'])) - - print(exp1.to_dataframe()) self.assertEqual(obs1, exp1) def test_merge_some_columns_overlapping(self): @@ -201,3 +197,58 @@ def test_merge_no_samples_or_columns_overlapping(self): 'col4', 'col5', 'col6'])) self.assertEqual(obs1, exp1) + + def test_merge_mismatched_columnID_names_in_error_message(self): + index1 = pd.Index(['sample1', 'sample2', 'sample3'], name='id') + data1 = [['a', 'd', 'h'], + ['b', 'e', 'i'], + ['c', 'f', 'j']] + md1 = qiime2.Metadata(pd.DataFrame(data1, index=index1, dtype=object, + columns=['col1', 'col2', 'col3'])) + + index2 = pd.Index(['sample4', 'sample5', 'sample6'], name='sample-id') + data2 = [['k', 'n', 'q'], + ['l', 'o', 'r'], + ['m', 'p', 's']] + md2 = qiime2.Metadata(pd.DataFrame(data2, index=index2, dtype=object, + columns=['col4', 'col5', 'col6'])) + + with self.assertRaisesRegex( + ValueError, + "Metadata files contain different ID column names.*id.*sample-id" + ): + merge(md1, md2) + + def test_merge_mismatched_md_column_type_designations(self): + index1 = pd.Index(['sample1', 'sample2', 'sample3'], name='id') + data1 = [['a', 'd', 'h'], + ['b', 'e', 'i'], + ['c', 'f', 'j']] + md1 = qiime2.Metadata( + pd.DataFrame( + data1, + index=index1, + dtype=object, + columns=['col1', 'col2', 'col3'] + ) + ) + index2 = pd.Index(['sample4', 'sample5', 'sample6'], name='id') + data2 = [['k', 'n', 40.0], + ['l', 'o', 41.0], + ['m', 'p', 42.0]] + md2 = qiime2.Metadata( + pd.DataFrame( + data2, + index=index2, + columns=['col1', 'col2', 'col3'] + ) + ) + with self.assertRaisesRegex( + ValueError, + "Metadata files contain the shared column 'col3' with different " + "type designations. In 'metadata1', the column 'col3' is of type " + r"\'CategoricalMetadataColumn\', and in 'metadata2', it is of " + r"type \'NumericMetadataColumn\'. These type designations must " + "match." + ): + merge(md1, md2)