From e061f4c402c8332d6b5296cacfc739f108855d1a Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Fri, 17 Jan 2025 13:03:04 -0500
Subject: [PATCH 1/6] Support array-type metadata fields.

---
 cubids/cubids.py | 56 +++++++++++++++++++++++++++++++++++-------------
 cubids/utils.py  | 16 ++++++++++++++
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/cubids/cubids.py b/cubids/cubids.py
index 158e97d05..f5401b2ab 100644
--- a/cubids/cubids.py
+++ b/cubids/cubids.py
@@ -1664,23 +1664,49 @@ def format_params(param_group_df, config, modality):
             continue
 
         if "tolerance" in column_fmt and len(param_group_df) > 1:
-            array = param_group_df[column_name].to_numpy().reshape(-1, 1)
-
-            for i in range(len(array)):
-                if np.isnan(array[i, 0]):
-                    array[i, 0] = -999
-
-            tolerance = to_format[column_name]["tolerance"]
-            clustering = AgglomerativeClustering(
-                n_clusters=None, distance_threshold=tolerance, linkage="complete"
-            ).fit(array)
+            column_data = param_group_df[column_name].to_numpy()
+
+            if any(isinstance(x, list) for x in column_data):
+                # For array/list data, we should first define "clusters" based on the number of
+                # elements, then apply the clustering within each set of lengths.
+                # For example, if there are four runs with five elements and 10 runs with four
+                # elements, we should cluster the four-element runs separately from the
+                # five-element runs, and account for that in the clustering labels.
+                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
+                unique_lengths = np.unique(lengths)
+                cluster_idx = 0
+                for unique_length in unique_lengths:
+                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
+                    array = np.array([np.array(x) for x in column_data[sel_rows]])
+
+                    if array.shape[0] > 1:
+                        # clustering requires at least two samples
+                        array[np.isnan(array)] = -999
+
+                        tolerance = to_format[column_name]["tolerance"]
+                        clustering = AgglomerativeClustering(
+                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
+                        ).fit(array)
+
+                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
+                            clustering.labels_ + cluster_idx
+                        )
+                        cluster_idx += max(clustering.labels_) + 1
+                    else:
+                        # single-file cluster
+                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
+                        cluster_idx += 1
+            else:
+                array = column_data.reshape(-1, 1)
+                array[np.isnan(array)] = -999
 
-            for i in range(len(array)):
-                if array[i, 0] == -999:
-                    array[i, 0] = np.nan
+                tolerance = to_format[column_name]["tolerance"]
+                clustering = AgglomerativeClustering(
+                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
+                ).fit(array)
 
-            # now add clustering_labels as a column
-            param_group_df[f"Cluster_{column_name}"] = clustering.labels_
+                # now add clustering_labels as a column
+                param_group_df[f"Cluster_{column_name}"] = clustering.labels_
 
     return param_group_df
 
diff --git a/cubids/utils.py b/cubids/utils.py
index c4cdb457f..358aa772e 100644
--- a/cubids/utils.py
+++ b/cubids/utils.py
@@ -34,3 +34,19 @@ def _get_container_type(image_name):
         return "docker"
 
     raise Exception("Unable to determine the container type of " + image_name)
+
+
+def _compress_lists(df):
+    """Compress lists in a DataFrame to strings."""
+    for col in df.columns:
+        if isinstance(df[col].values[0], list):
+            df[col] = df[col].apply(lambda x: "|&|".join(x))
+    return df
+
+
+def _expand_lists(df):
+    """Expand strings in a DataFrame to lists."""
+    for col in df.columns:
+        if isinstance(df[col].values[0], str):
+            df[col] = df[col].apply(lambda x: x.split("|&|"))
+    return df

From ce5dea8657253b931db997b691d1a0bd4e39b617 Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Fri, 17 Jan 2025 13:25:50 -0500
Subject: [PATCH 2/6] Try supporting lists of strings too.

---
 cubids/cubids.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/cubids/cubids.py b/cubids/cubids.py
index f5401b2ab..df15d7bee 100644
--- a/cubids/cubids.py
+++ b/cubids/cubids.py
@@ -1669,9 +1669,9 @@ def format_params(param_group_df, config, modality):
             if any(isinstance(x, list) for x in column_data):
                 # For array/list data, we should first define "clusters" based on the number of
                 # elements, then apply the clustering within each set of lengths.
-                # For example, if there are four runs with five elements and 10 runs with four
-                # elements, we should cluster the four-element runs separately from the
-                # five-element runs, and account for that in the clustering labels.
+                # For example, if there are four runs with five elements and 10 runs with three
+                # elements, we should cluster the five-element runs separately from the
+                # three-element runs, and account for that in the clustering labels.
                 lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
                 unique_lengths = np.unique(lengths)
                 cluster_idx = 0
@@ -1708,6 +1708,21 @@ def format_params(param_group_df, config, modality):
                 # now add clustering_labels as a column
                 param_group_df[f"Cluster_{column_name}"] = clustering.labels_
 
+        else:
+            # We can rely on string matching for string-type fields,
+            # but arrays of strings need to be handled differently.
+            column_data = param_group_df[column_name].tolist()
+
+            if any(isinstance(x, list) for x in column_data):
+                cluster_idx = 0
+
+                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
+                unique_vals = np.unique(column_data)
+                for val in unique_vals:
+                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
+                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
+                    cluster_idx += 1
+
     return param_group_df
 
 

From 144a4926780281e80d16b0c15622872c1f8c3c77 Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Fri, 17 Jan 2025 14:20:29 -0500
Subject: [PATCH 3/6] Add test for format_params clustering of scalar and list-valued fields.

---
 cubids/tests/test_utils.py | 94 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 cubids/tests/test_utils.py

diff --git a/cubids/tests/test_utils.py b/cubids/tests/test_utils.py
new file mode 100644
index 000000000..3375b3cd4
--- /dev/null
+++ b/cubids/tests/test_utils.py
@@ -0,0 +1,94 @@
+"""Tests for the utils module."""
+
+import numpy as np
+import pandas as pd
+
+from cubids.cubids import format_params
+
+
+def test_format_params():
+    config = {
+        "sidecar_params": {
+            "func": {
+                "RepetitionTime": {"tolerance": 0.01, "suggest_variant_rename": True},
+                "TaskName": {"suggest_variant_rename": True},
+                "SliceTiming": {"tolerance": 0.01, "suggest_variant_rename": True},
+                "ImageType": {"suggest_variant_rename": True},
+            },
+        },
+        "derived_params": {
+            "func": {},
+        },
+    }
+
+    # Mock up the input
+    params = [
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes open",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 1.9,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 0.5, 1.0, 1.5, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 1.9],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M", "NORM"],
+        },
+    ]
+    param_group_df = pd.DataFrame(params)
+    modality = "func"
+
+    # Run the function
+    formatted_params = format_params(
+        param_group_df=param_group_df,
+        config=config,
+        modality=modality,
+    )
+    assert isinstance(formatted_params, pd.DataFrame)
+    assert "Cluster_RepetitionTime" in formatted_params.columns
+    assert "Cluster_TaskName" not in formatted_params.columns
+    assert "Cluster_SliceTiming" in formatted_params.columns
+    assert "Cluster_ImageType" in formatted_params.columns
+    assert np.array_equal(
+        formatted_params["Cluster_RepetitionTime"].values,
+        [0, 0, 0, 1, 0, 0, 0],
+    )
+    assert np.array_equal(
+        formatted_params["Cluster_SliceTiming"].values,
+        [0, 0, 0, 0, 2, 1, 0],
+    )
+    assert np.array_equal(
+        formatted_params["Cluster_ImageType"].values,
+        [0, 0, 0, 0, 0, 0, 1],
+    )

From 52d02d048f535a28afa93a0ca2eac0e433fdd4a4 Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Fri, 17 Jan 2025 14:38:22 -0500
Subject: [PATCH 4/6] Extend format_params test with label-agnostic cluster comparison.

---
 cubids/tests/test_utils.py | 129 ++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 16 deletions(-)

diff --git a/cubids/tests/test_utils.py b/cubids/tests/test_utils.py
index 3375b3cd4..db50e107e 100644
--- a/cubids/tests/test_utils.py
+++ b/cubids/tests/test_utils.py
@@ -7,6 +7,11 @@
 
 
 def test_format_params():
+    """Test the format_params function.
+
+    We want to test that the function correctly clusters parameters based on the
+    configuration dictionary.
+    """
     config = {
         "sidecar_params": {
             "func": {
@@ -21,7 +26,7 @@ def test_format_params():
         },
     }
 
-    # Mock up the input
+    # Mock up the input. The variants are explicitly prepared.
     params = [
         {
             "RepetitionTime": 2.0,
@@ -37,11 +42,13 @@ def test_format_params():
         },
         {
             "RepetitionTime": 2.0,
+            # TaskName variant
             "TaskName": "rest eyes open",
             "SliceTiming": [0.0, 1.0, 2.0],
             "ImageType": ["ORIGINAL", "NONE", "M"],
         },
         {
+            # RepetitionTime variant
             "RepetitionTime": 1.9,
             "TaskName": "rest eyes closed",
             "SliceTiming": [0.0, 1.0, 2.0],
@@ -50,12 +57,14 @@ def test_format_params():
         {
             "RepetitionTime": 2.0,
             "TaskName": "rest eyes closed",
+            # SliceTiming variant (length)
             "SliceTiming": [0.0, 0.5, 1.0, 1.5, 2.0],
             "ImageType": ["ORIGINAL", "NONE", "M"],
         },
         {
             "RepetitionTime": 2.0,
             "TaskName": "rest eyes closed",
+            # SliceTiming variant (values)
             "SliceTiming": [0.0, 1.0, 1.9],
             "ImageType": ["ORIGINAL", "NONE", "M"],
         },
@@ -63,32 +72,120 @@ def test_format_params():
             "RepetitionTime": 2.0,
             "TaskName": "rest eyes closed",
             "SliceTiming": [0.0, 1.0, 2.0],
+            # ImageType variant (length)
             "ImageType": ["ORIGINAL", "NONE", "M", "NORM"],
         },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            # ImageType variant (values)
+            "ImageType": ["ORIGINAL", "NONE", "P"],
+        },
     ]
     param_group_df = pd.DataFrame(params)
     modality = "func"
 
     # Run the function
-    formatted_params = format_params(
+    out_df = format_params(
         param_group_df=param_group_df,
         config=config,
         modality=modality,
     )
-    assert isinstance(formatted_params, pd.DataFrame)
-    assert "Cluster_RepetitionTime" in formatted_params.columns
-    assert "Cluster_TaskName" not in formatted_params.columns
-    assert "Cluster_SliceTiming" in formatted_params.columns
-    assert "Cluster_ImageType" in formatted_params.columns
-    assert np.array_equal(
-        formatted_params["Cluster_RepetitionTime"].values,
-        [0, 0, 0, 1, 0, 0, 0],
+    assert isinstance(out_df, pd.DataFrame)
+    assert "Cluster_RepetitionTime" in out_df.columns
+    assert "Cluster_SliceTiming" in out_df.columns
+    assert "Cluster_ImageType" in out_df.columns
+    # Non-list columns without tolerance don't get clustered
+    assert "Cluster_TaskName" not in out_df.columns
+
+    assert compare_group_assignments(
+        out_df["Cluster_RepetitionTime"].values.astype(int),
+        [0, 0, 0, 1, 0, 0, 0, 0],
     )
-    assert np.array_equal(
-        formatted_params["Cluster_SliceTiming"].values,
-        [0, 0, 0, 0, 2, 1, 0],
+    assert compare_group_assignments(
+        out_df["Cluster_SliceTiming"].values.astype(int),
+        [0, 0, 0, 0, 2, 1, 0, 0],
     )
-    assert np.array_equal(
-        formatted_params["Cluster_ImageType"].values,
-        [0, 0, 0, 0, 0, 0, 1],
+    assert compare_group_assignments(
+        out_df["Cluster_ImageType"].values.astype(int),
+        [0, 0, 0, 0, 0, 0, 1, 2],
     )
+
+    # Change the tolerance for SliceTiming
+    config["sidecar_params"]["func"]["SliceTiming"]["tolerance"] = 0.5
+    out_df = format_params(
+        param_group_df=param_group_df,
+        config=config,
+        modality=modality,
+    )
+    assert isinstance(out_df, pd.DataFrame)
+    assert "Cluster_RepetitionTime" in out_df.columns
+    assert "Cluster_SliceTiming" in out_df.columns
+    assert "Cluster_ImageType" in out_df.columns
+    # Non-list columns without tolerance don't get clustered
+    assert "Cluster_TaskName" not in out_df.columns
+
+    assert compare_group_assignments(
+        out_df["Cluster_RepetitionTime"].values.astype(int),
+        [0, 0, 0, 1, 0, 0, 0, 0],
+    )
+    # Different lengths still produce different clusters,
+    # but the value-based variants are now the same
+    assert compare_group_assignments(
+        out_df["Cluster_SliceTiming"].values.astype(int),
+        [0, 0, 0, 0, 1, 0, 0, 0],
+    )
+    assert compare_group_assignments(
+        out_df["Cluster_ImageType"].values.astype(int),
+        [0, 0, 0, 0, 0, 0, 1, 2],
+    )
+
+
+def compare_group_assignments(list1, list2):
+    """
+    Compare two lists for equality based on group assignments.
+
+    This function checks if two lists can be considered equal based on their group assignments.
+    The actual values in the lists do not matter, only the group assignments do. Each unique value
+    in the first list is mapped to a unique value in the second list, and the function checks if
+    this mapping is consistent throughout the lists.
+
+    Parameters
+    ----------
+    list1 : list
+        The first list to compare.
+    list2 : list
+        The second list to compare.
+
+    Returns
+    -------
+    bool
+        True if the lists are equal based on group assignments, False otherwise.
+
+    Examples
+    --------
+    >>> list1 = [1, 2, 1, 3, 2]
+    >>> list2 = ['a', 'b', 'a', 'c', 'b']
+    >>> compare_group_assignments(list1, list2)
+    True
+
+    >>> list1 = [1, 2, 1, 3, 2]
+    >>> list2 = ['a', 'b', 'a', 'c', 'd']
+    >>> compare_group_assignments(list1, list2)
+    False
+    """
+    if len(list1) != len(list2):
+        return False
+
+    mapping = {}
+    for a, b in zip(list1, list2):
+        if a in mapping:
+            if mapping[a] != b:
+                return False
+        else:
+            if b in mapping.values():
+                return False
+            mapping[a] = b
+
+    return True

From 7a546b691aa6443296d793ca15a2710f55b4cccb Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Fri, 17 Jan 2025 14:39:50 -0500
Subject: [PATCH 5/6] Expand list-helper docstrings and drop unused numpy import.

---
 cubids/tests/test_utils.py |  1 -
 cubids/utils.py            | 31 +++++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/cubids/tests/test_utils.py b/cubids/tests/test_utils.py
index db50e107e..184e1d250 100644
--- a/cubids/tests/test_utils.py
+++ b/cubids/tests/test_utils.py
@@ -1,6 +1,5 @@
 """Tests for the utils module."""
 
-import numpy as np
 import pandas as pd
 
 from cubids.cubids import format_params
diff --git a/cubids/utils.py b/cubids/utils.py
index 358aa772e..6cccbf98c 100644
--- a/cubids/utils.py
+++ b/cubids/utils.py
@@ -37,7 +37,20 @@ def _get_container_type(image_name):
 
 
 def _compress_lists(df):
-    """Compress lists in a DataFrame to strings."""
+    """Compress lists in a DataFrame to strings.
+
+    Used to prepare a DataFrame with cells containing lists for writing to a TSV file.
+
+    Parameters
+    ----------
+    df : :obj:`pandas.DataFrame`
+        The DataFrame to compress.
+
+    Returns
+    -------
+    :obj:`pandas.DataFrame`
+        The compressed DataFrame.
+    """
     for col in df.columns:
         if isinstance(df[col].values[0], list):
             df[col] = df[col].apply(lambda x: "|&|".join(x))
@@ -45,7 +58,21 @@ def _compress_lists(df):
 
 
 def _expand_lists(df):
-    """Expand strings in a DataFrame to lists."""
+    """Expand strings in a DataFrame to lists.
+
+    Used to prepare a DataFrame with cells containing strings for querying after loading from a
+    TSV file.
+
+    Parameters
+    ----------
+    df : :obj:`pandas.DataFrame`
+        The DataFrame to expand.
+
+    Returns
+    -------
+    :obj:`pandas.DataFrame`
+        The expanded DataFrame.
+    """
     for col in df.columns:
         if isinstance(df[col].values[0], str):
             df[col] = df[col].apply(lambda x: x.split("|&|"))

From 6c3f3e1e976400d1975e69e4a0f13657084a5b60 Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Fri, 17 Jan 2025 15:08:06 -0500
Subject: [PATCH 6/6] Tidy compare_group_assignments docstring and add relabeling example.

---
 cubids/tests/test_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cubids/tests/test_utils.py b/cubids/tests/test_utils.py
index 184e1d250..a162053fc 100644
--- a/cubids/tests/test_utils.py
+++ b/cubids/tests/test_utils.py
@@ -142,8 +142,7 @@ def test_format_params():
 
 
 def compare_group_assignments(list1, list2):
-    """
-    Compare two lists for equality based on group assignments.
+    """Compare two lists for equality based on group assignments.
 
     This function checks if two lists can be considered equal based on their group assignments.
     The actual values in the lists do not matter, only the group assignments do. Each unique value
@@ -169,6 +168,11 @@ def compare_group_assignments(list1, list2):
     >>> compare_group_assignments(list1, list2)
     True
 
+    >>> list1 = [1, 2, 1, 3, 2]
+    >>> list2 = ['b', 'd', 'b', 'q', 'd']
+    >>> compare_group_assignments(list1, list2)
+    True
+
     >>> list1 = [1, 2, 1, 3, 2]
     >>> list2 = ['a', 'b', 'a', 'c', 'd']
     >>> compare_group_assignments(list1, list2)