Support array-type metadata fields in cubids group #407

Open
wants to merge 10 commits into main
69 changes: 55 additions & 14 deletions cubids/cubids.py
@@ -2019,23 +2019,64 @@ def format_params(param_group_df, config, modality):
             continue

         if "tolerance" in column_fmt and len(param_group_df) > 1:
-            array = param_group_df[column_name].to_numpy().reshape(-1, 1)
-
-            for i in range(len(array)):
-                if np.isnan(array[i, 0]):
-                    array[i, 0] = -999
+            column_data = param_group_df[column_name].to_numpy()
+
+            if any(isinstance(x, list) for x in column_data):
+                # For array/list data, we should first define "clusters" based on the number of
+                # elements, then apply the clustering within each set of lengths.
+                # For example, if there are four runs with five elements and 10 runs with three
+                # elements, we should cluster the five-element runs separately from the
+                # three-element runs, and account for that in the clustering labels.
+                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
+                unique_lengths = np.unique(lengths)
+                cluster_idx = 0
+                for unique_length in unique_lengths:
+                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
+                    array = np.array([np.array(x) for x in column_data[sel_rows]])
+
+                    if array.shape[0] > 1:
+                        # clustering requires at least two samples
+                        array[np.isnan(array)] = -999
+
+                        tolerance = to_format[column_name]["tolerance"]
+                        clustering = AgglomerativeClustering(
+                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
+                        ).fit(array)
+
+                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
+                            clustering.labels_ + cluster_idx
+                        )
+                        cluster_idx += max(clustering.labels_) + 1
+                    else:
+                        # single-file cluster
+                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
+                        cluster_idx += 1
+            else:
+                array = column_data.reshape(-1, 1)
+                array[np.isnan(array)] = -999
+
+                tolerance = to_format[column_name]["tolerance"]
+                clustering = AgglomerativeClustering(
+                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
+                ).fit(array)
-            tolerance = to_format[column_name]["tolerance"]
-            clustering = AgglomerativeClustering(
-                n_clusters=None, distance_threshold=tolerance, linkage="complete"
-            ).fit(array)
-
-            for i in range(len(array)):
-                if array[i, 0] == -999:
-                    array[i, 0] = np.nan
+
+                # now add clustering_labels as a column
+                param_group_df[f"Cluster_{column_name}"] = clustering.labels_
-
-            # now add clustering_labels as a column
-            param_group_df[f"Cluster_{column_name}"] = clustering.labels_
+        else:
+            # We can rely on string matching for string-type fields,
+            # but arrays of strings need to be handled differently.
+            column_data = param_group_df[column_name].tolist()
+
+            if any(isinstance(x, list) for x in column_data):
+                cluster_idx = 0
+
+                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
+                unique_vals = np.unique(column_data)
+                for val in unique_vals:
+                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
+                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
+                    cluster_idx += 1

     return param_group_df
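To see the numeric-array branch above in isolation: runs are first bucketed by array shape, then complete-linkage clustering is applied within each bucket, so every pair of runs in a cluster differs by less than the tolerance (measured as Euclidean distance between the arrays). Here is a minimal, self-contained sketch of that idea; the `cluster_array_column` helper and the toy values are illustrative only, not part of this PR.

import numpy as np
from sklearn.cluster import AgglomerativeClustering


def cluster_array_column(values, tolerance):
    """Toy version of the grouping above: bucket by length, then cluster."""
    labels = np.zeros(len(values), dtype=int)
    lengths = [len(v) for v in values]
    cluster_idx = 0
    for length in sorted(set(lengths)):
        rows = [i for i, n in enumerate(lengths) if n == length]
        array = np.array([values[i] for i in rows], dtype=float)
        if len(rows) > 1:
            # Complete linkage only merges clusters whose farthest pair is
            # within the threshold, mirroring the scalar-field behavior.
            model = AgglomerativeClustering(
                n_clusters=None, distance_threshold=tolerance, linkage="complete"
            ).fit(array)
            labels[rows] = model.labels_ + cluster_idx
            cluster_idx += model.labels_.max() + 1
        else:
            # A lone run of this length forms its own cluster.
            labels[rows] = cluster_idx
            cluster_idx += 1
    return labels


slice_timings = [[0.0, 1.0, 2.0], [0.0, 1.0, 2.0], [0.0, 1.0, 1.9], [0.0, 0.5, 1.0, 1.5, 2.0]]
print(cluster_array_column(slice_timings, tolerance=0.01))  # e.g., [0 0 1 2]

With tolerance=0.5 the first three runs would collapse into one cluster, which is exactly what the new test below checks.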

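The string branch, by contrast, uses no tolerance: each array is serialized with the `|&|` delimiter and runs are grouped by exact match. A quick illustration with invented values:

import numpy as np

image_types = [
    ["ORIGINAL", "NONE", "M"],
    ["ORIGINAL", "NONE", "P"],
    ["ORIGINAL", "NONE", "M"],
]
keys = ["|&|".join(cell) for cell in image_types]
# np.unique sorts the serialized strings, so cluster numbering follows
# lexicographic order; identical serializations share a cluster.
label_map = {key: idx for idx, key in enumerate(np.unique(keys))}
print([label_map[key] for key in keys])  # [0, 1, 0]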
194 changes: 194 additions & 0 deletions cubids/tests/test_utils.py
@@ -0,0 +1,194 @@
"""Tests for the utils module."""

import pandas as pd

from cubids.cubids import format_params


def test_format_params():
"""Test the format_params function.

We want to test that the function correctly clusters parameters based on the
configuration dictionary.
"""
config = {
"sidecar_params": {
"func": {
"RepetitionTime": {"tolerance": 0.01, "suggest_variant_rename": True},
"TaskName": {"suggest_variant_rename": True},
"SliceTiming": {"tolerance": 0.01, "suggest_variant_rename": True},
"ImageType": {"suggest_variant_rename": True},
},
},
"derived_params": {
"func": {},
},
}

# Mock up the input. The variants are explicitly prepared.
params = [
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
# TaskName variant
"TaskName": "rest eyes open",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
# RepetitionTime variant
"RepetitionTime": 1.9,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
# SliceTiming variant (length)
"SliceTiming": [0.0, 0.5, 1.0, 1.5, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
# SliceTiming variant (values)
"SliceTiming": [0.0, 1.0, 1.9],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
# ImageType variant (length)
"ImageType": ["ORIGINAL", "NONE", "M", "NORM"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
# ImageType variant (values)
"ImageType": ["ORIGINAL", "NONE", "P"],
},
]
param_group_df = pd.DataFrame(params)
modality = "func"

# Run the function
out_df = format_params(
param_group_df=param_group_df,
config=config,
modality=modality,
)
assert isinstance(out_df, pd.DataFrame)
assert "Cluster_RepetitionTime" in out_df.columns
assert "Cluster_SliceTiming" in out_df.columns
assert "Cluster_ImageType" in out_df.columns
# Non-list columns without tolerance don't get clustered
assert "Cluster_TaskName" not in out_df.columns

assert compare_group_assignments(
out_df["Cluster_RepetitionTime"].values.astype(int),
[0, 0, 0, 1, 0, 0, 0, 0],
)
assert compare_group_assignments(
out_df["Cluster_SliceTiming"].values.astype(int),
[0, 0, 0, 0, 2, 1, 0, 0],
)
assert compare_group_assignments(
out_df["Cluster_ImageType"].values.astype(int),
[0, 0, 0, 0, 0, 0, 1, 2],
)

# Change the tolerance for SliceTiming
config["sidecar_params"]["func"]["SliceTiming"]["tolerance"] = 0.5
out_df = format_params(
param_group_df=param_group_df,
config=config,
modality=modality,
)
assert isinstance(out_df, pd.DataFrame)
assert "Cluster_RepetitionTime" in out_df.columns
assert "Cluster_SliceTiming" in out_df.columns
assert "Cluster_ImageType" in out_df.columns
# Non-list columns without tolerance don't get clustered
assert "Cluster_TaskName" not in out_df.columns

assert compare_group_assignments(
out_df["Cluster_RepetitionTime"].values.astype(int),
[0, 0, 0, 1, 0, 0, 0, 0],
)
# Different lengths still produce different clusters,
# but the value-based variants are now the same
assert compare_group_assignments(
out_df["Cluster_SliceTiming"].values.astype(int),
[0, 0, 0, 0, 1, 0, 0, 0],
)
assert compare_group_assignments(
out_df["Cluster_ImageType"].values.astype(int),
[0, 0, 0, 0, 0, 0, 1, 2],
)


def compare_group_assignments(list1, list2):
"""Compare two lists for equality based on group assignments.

This function checks if two lists can be considered equal based on their group assignments.
The actual values in the lists do not matter, only the group assignments do. Each unique value
in the first list is mapped to a unique value in the second list, and the function checks if
this mapping is consistent throughout the lists.

Parameters
----------
list1 : list
The first list to compare.
list2 : list
The second list to compare.

Returns
-------
bool
True if the lists are equal based on group assignments, False otherwise.

Examples
--------
>>> list1 = [1, 2, 1, 3, 2]
>>> list2 = ['a', 'b', 'a', 'c', 'b']
>>> compare_group_assignments(list1, list2)
True

>>> list1 = [1, 2, 1, 3, 2]
>>> list2 = ['b', 'd', 'b', 'q', 'd']
>>> compare_group_assignments(list1, list2)
True

>>> list1 = [1, 2, 1, 3, 2]
>>> list2 = ['a', 'b', 'a', 'c', 'd']
>>> compare_group_assignments(list1, list2)
False
"""
if len(list1) != len(list2):
return False

mapping = {}
for a, b in zip(list1, list2):
if a in mapping:
if mapping[a] != b:
return False
else:
if b in mapping.values():
return False
mapping[a] = b

return True
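Because AgglomerativeClustering assigns cluster integers in no guaranteed order, the assertions above compare partitions rather than raw label values. Two hypothetical label vectors show the distinction:

# Same partition, different numbering (0<->2, 1->0, 2->1): still equal.
assert compare_group_assignments([0, 0, 1, 2], [2, 2, 0, 1])
# Different partition (the first two runs are split apart): not equal.
assert not compare_group_assignments([0, 0, 1, 2], [0, 1, 1, 2])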
79 changes: 79 additions & 0 deletions cubids/utils.py
@@ -0,0 +1,79 @@
"""Miscellaneous utility functions for CuBIDS.

This module provides various utility functions used throughout the CuBIDS package.
"""

import re
from pathlib import Path


def _get_container_type(image_name):
"""Get and return the container type.

Parameters
----------
image_name : :obj:`str`
The name of the container image.

Returns
-------
:obj:`str`
The container type, either "docker" or "singularity".

Raises
------
:obj:`Exception`
If the container type cannot be determined.
"""
# If it's a file on disk, it must be a singularity image
if Path(image_name).exists():
return "singularity"

# It needs to match a docker tag pattern to be docker
if re.match(r"(?:.+\/)?([^:]+)(?::.+)?", image_name):
return "docker"

raise Exception("Unable to determine the container type of " + image_name)


def _compress_lists(df):
"""Compress lists in a DataFrame to strings.

Used to prepare a DataFrame with cells containing lists for writing to a TSV file.

Parameters
----------
df : :obj:`pandas.DataFrame`
The DataFrame to compress.

Returns
-------
:obj:`pandas.DataFrame`
The compressed DataFrame.
"""
for col in df.columns:
if isinstance(df[col].values[0], list):
df[col] = df[col].apply(lambda x: "|&|".join(x))
return df


def _expand_lists(df):
"""Expand strings in a DataFrame to lists.

Used to prepare a DataFrame with cells containing strings for querying after loading from a
TSV file.

Parameters
----------
df : :obj:`pandas.DataFrame`
The DataFrame to expand.

Returns
-------
:obj:`pandas.DataFrame`
The expanded DataFrame.
"""
for col in df.columns:
if isinstance(df[col].values[0], str):
df[col] = df[col].apply(lambda x: x.split("|&|"))
return df
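A hypothetical usage sketch for the helpers above (paths and values are invented). One caveat worth noting: `_expand_lists` splits every string column, so scalar string columns also come back as one-element lists.

import pandas as pd

from cubids.utils import _compress_lists, _expand_lists, _get_container_type

# An existing file on disk is assumed to be a Singularity image; otherwise
# anything matching the docker-tag pattern is treated as docker.
# _get_container_type("/opt/images/cubids.sif")  # -> "singularity" (if the file exists)
# _get_container_type("pennlinc/cubids:latest")  # -> "docker"

# Round-trip a list-valued column; the join assumes the cells hold strings.
df = pd.DataFrame({"ImageType": [["ORIGINAL", "NONE", "M"], ["ORIGINAL", "NONE", "P"]]})
flat = _compress_lists(df.copy())  # copy() because the helper mutates in place
print(flat["ImageType"].tolist())      # ['ORIGINAL|&|NONE|&|M', 'ORIGINAL|&|NONE|&|P']
restored = _expand_lists(flat)
print(restored["ImageType"].tolist())  # [['ORIGINAL', 'NONE', 'M'], ['ORIGINAL', 'NONE', 'P']]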