Support array-type metadata fields in cubids group #407

Open
wants to merge 10 commits into main
69 changes: 55 additions & 14 deletions cubids/cubids.py
@@ -2019,23 +2019,64 @@ def format_params(param_group_df, config, modality):
             continue

         if "tolerance" in column_fmt and len(param_group_df) > 1:
-            array = param_group_df[column_name].to_numpy().reshape(-1, 1)
-
-            for i in range(len(array)):
-                if np.isnan(array[i, 0]):
-                    array[i, 0] = -999
+            column_data = param_group_df[column_name].to_numpy()
+
+            if any(isinstance(x, list) for x in column_data):
+                # For array/list data, we should first define "clusters" based on the number of
+                # elements, then apply the clustering within each set of lengths.
+                # For example, if there are four runs with five elements and 10 runs with three
+                # elements, we should cluster the five-element runs separately from the
+                # three-element runs, and account for that in the clustering labels.
+                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
+                unique_lengths = np.unique(lengths)
+                cluster_idx = 0
+                for unique_length in unique_lengths:
+                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
+                    array = np.array([np.array(x) for x in column_data[sel_rows]])
+
+                    if array.shape[0] > 1:
+                        # clustering requires at least two samples
+                        array[np.isnan(array)] = -999
+
+                        tolerance = to_format[column_name]["tolerance"]
+                        clustering = AgglomerativeClustering(
+                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
+                        ).fit(array)
+
+                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
+                            clustering.labels_ + cluster_idx
+                        )
+                        cluster_idx += max(clustering.labels_) + 1
+                    else:
+                        # single-file cluster
+                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
+                        cluster_idx += 1
+            else:
+                array = column_data.reshape(-1, 1)
+                array[np.isnan(array)] = -999
+
+                tolerance = to_format[column_name]["tolerance"]
+                clustering = AgglomerativeClustering(
+                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
+                ).fit(array)
-            tolerance = to_format[column_name]["tolerance"]
-            clustering = AgglomerativeClustering(
-                n_clusters=None, distance_threshold=tolerance, linkage="complete"
-            ).fit(array)
-
-            for i in range(len(array)):
-                if array[i, 0] == -999:
-                    array[i, 0] = np.nan
+
+                # now add clustering_labels as a column
+                param_group_df[f"Cluster_{column_name}"] = clustering.labels_
-
-            # now add clustering_labels as a column
-            param_group_df[f"Cluster_{column_name}"] = clustering.labels_
+        else:
+            # We can rely on string matching for string-type fields,
+            # but arrays of strings need to be handled differently.
+            column_data = param_group_df[column_name].tolist()
+
+            if any(isinstance(x, list) for x in column_data):
+                cluster_idx = 0
+
+                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
+                unique_vals = np.unique(column_data)
+                for val in unique_vals:
+                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
+                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
+                    cluster_idx += 1

     return param_group_df
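To see the numeric-array branch above in isolation: runs are first bucketed by array shape, then complete-linkage clustering is applied within each bucket, so every pair of runs in a cluster differs by less than the tolerance (measured as Euclidean distance between the arrays). Here is a minimal, self-contained sketch of that idea; the `cluster_array_column` helper and the toy values are illustrative only, not part of this PR.

import numpy as np
from sklearn.cluster import AgglomerativeClustering


def cluster_array_column(values, tolerance):
    """Toy version of the grouping above: bucket by length, then cluster."""
    labels = np.zeros(len(values), dtype=int)
    lengths = [len(v) for v in values]
    cluster_idx = 0
    for length in sorted(set(lengths)):
        rows = [i for i, n in enumerate(lengths) if n == length]
        array = np.array([values[i] for i in rows], dtype=float)
        if len(rows) > 1:
            # Complete linkage only merges clusters whose farthest pair is
            # within the threshold, mirroring the scalar-field behavior.
            model = AgglomerativeClustering(
                n_clusters=None, distance_threshold=tolerance, linkage="complete"
            ).fit(array)
            labels[rows] = model.labels_ + cluster_idx
            cluster_idx += model.labels_.max() + 1
        else:
            # A lone run of this length forms its own cluster.
            labels[rows] = cluster_idx
            cluster_idx += 1
    return labels


slice_timings = [[0.0, 1.0, 2.0], [0.0, 1.0, 2.0], [0.0, 1.0, 1.9], [0.0, 0.5, 1.0, 1.5, 2.0]]
print(cluster_array_column(slice_timings, tolerance=0.01))  # e.g., [0 0 1 2]

With tolerance=0.5 the first three runs would collapse into one cluster, which is exactly what the new test below checks.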

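The string branch, by contrast, uses no tolerance: each array is serialized with the `|&|` delimiter and runs are grouped by exact match. A quick illustration with invented values:

import numpy as np

image_types = [
    ["ORIGINAL", "NONE", "M"],
    ["ORIGINAL", "NONE", "P"],
    ["ORIGINAL", "NONE", "M"],
]
keys = ["|&|".join(cell) for cell in image_types]
# np.unique sorts the serialized strings, so cluster numbering follows
# lexicographic order; identical serializations share a cluster.
label_map = {key: idx for idx, key in enumerate(np.unique(keys))}
print([label_map[key] for key in keys])  # [0, 1, 0]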
194 changes: 194 additions & 0 deletions cubids/tests/test_utils.py
@@ -0,0 +1,194 @@
"""Tests for the utils module."""

import pandas as pd

from cubids.cubids import format_params


def test_format_params():
"""Test the format_params function.

We want to test that the function correctly clusters parameters based on the
configuration dictionary.
"""
config = {
"sidecar_params": {
"func": {
"RepetitionTime": {"tolerance": 0.01, "suggest_variant_rename": True},
"TaskName": {"suggest_variant_rename": True},
"SliceTiming": {"tolerance": 0.01, "suggest_variant_rename": True},
"ImageType": {"suggest_variant_rename": True},
},
},
"derived_params": {
"func": {},
},
}

# Mock up the input. The variants are explicitly prepared.
params = [
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
# TaskName variant
"TaskName": "rest eyes open",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
# RepetitionTime variant
"RepetitionTime": 1.9,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
# SliceTiming variant (length)
"SliceTiming": [0.0, 0.5, 1.0, 1.5, 2.0],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
# SliceTiming variant (values)
"SliceTiming": [0.0, 1.0, 1.9],
"ImageType": ["ORIGINAL", "NONE", "M"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
# ImageType variant (length)
"ImageType": ["ORIGINAL", "NONE", "M", "NORM"],
},
{
"RepetitionTime": 2.0,
"TaskName": "rest eyes closed",
"SliceTiming": [0.0, 1.0, 2.0],
# ImageType variant (values)
"ImageType": ["ORIGINAL", "NONE", "P"],
},
]
param_group_df = pd.DataFrame(params)
modality = "func"

# Run the function
out_df = format_params(
param_group_df=param_group_df,
config=config,
modality=modality,
)
assert isinstance(out_df, pd.DataFrame)
assert "Cluster_RepetitionTime" in out_df.columns
assert "Cluster_SliceTiming" in out_df.columns
assert "Cluster_ImageType" in out_df.columns
# Non-list columns without tolerance don't get clustered
assert "Cluster_TaskName" not in out_df.columns

assert compare_group_assignments(
out_df["Cluster_RepetitionTime"].values.astype(int),
[0, 0, 0, 1, 0, 0, 0, 0],
)
assert compare_group_assignments(
out_df["Cluster_SliceTiming"].values.astype(int),
[0, 0, 0, 0, 2, 1, 0, 0],
)
assert compare_group_assignments(
out_df["Cluster_ImageType"].values.astype(int),
[0, 0, 0, 0, 0, 0, 1, 2],
)

# Change the tolerance for SliceTiming
config["sidecar_params"]["func"]["SliceTiming"]["tolerance"] = 0.5
out_df = format_params(
param_group_df=param_group_df,
config=config,
modality=modality,
)
assert isinstance(out_df, pd.DataFrame)
assert "Cluster_RepetitionTime" in out_df.columns
assert "Cluster_SliceTiming" in out_df.columns
assert "Cluster_ImageType" in out_df.columns
# Non-list columns without tolerance don't get clustered
assert "Cluster_TaskName" not in out_df.columns

assert compare_group_assignments(
out_df["Cluster_RepetitionTime"].values.astype(int),
[0, 0, 0, 1, 0, 0, 0, 0],
)
# Different lengths still produce different clusters,
# but the value-based variants are now the same
assert compare_group_assignments(
out_df["Cluster_SliceTiming"].values.astype(int),
[0, 0, 0, 0, 1, 0, 0, 0],
)
assert compare_group_assignments(
out_df["Cluster_ImageType"].values.astype(int),
[0, 0, 0, 0, 0, 0, 1, 2],
)


def compare_group_assignments(list1, list2):
"""Compare two lists for equality based on group assignments.

This function checks if two lists can be considered equal based on their group assignments.
The actual values in the lists do not matter, only the group assignments do. Each unique value
in the first list is mapped to a unique value in the second list, and the function checks if
this mapping is consistent throughout the lists.

Parameters
----------
list1 : list
The first list to compare.
list2 : list
The second list to compare.

Returns
-------
bool
True if the lists are equal based on group assignments, False otherwise.

Examples
--------
>>> list1 = [1, 2, 1, 3, 2]
>>> list2 = ['a', 'b', 'a', 'c', 'b']
>>> compare_group_assignments(list1, list2)
True

>>> list1 = [1, 2, 1, 3, 2]
>>> list2 = ['b', 'd', 'b', 'q', 'd']
>>> compare_group_assignments(list1, list2)
True

>>> list1 = [1, 2, 1, 3, 2]
>>> list2 = ['a', 'b', 'a', 'c', 'd']
>>> compare_group_assignments(list1, list2)
False
"""
if len(list1) != len(list2):
return False

mapping = {}
for a, b in zip(list1, list2):
if a in mapping:
if mapping[a] != b:
return False
else:
if b in mapping.values():
return False
mapping[a] = b

return True
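Because AgglomerativeClustering assigns cluster integers in no guaranteed order, the assertions above compare partitions rather than raw label values. Two hypothetical label vectors show the distinction:

# Same partition, different numbering (0<->2, 1->0, 2->1): still equal.
assert compare_group_assignments([0, 0, 1, 2], [2, 2, 0, 1])
# Different partition (the first two runs are split apart): not equal.
assert not compare_group_assignments([0, 0, 1, 2], [0, 1, 1, 2])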
79 changes: 79 additions & 0 deletions cubids/utils.py
@@ -0,0 +1,79 @@
"""Miscellaneous utility functions for CuBIDS.

This module provides various utility functions used throughout the CuBIDS package.
"""

import re
from pathlib import Path


def _get_container_type(image_name):
"""Get and return the container type.

Parameters
----------
image_name : :obj:`str`
The name of the container image.

Returns
-------
:obj:`str`
The container type, either "docker" or "singularity".

Raises
------
:obj:`Exception`
If the container type cannot be determined.
"""
# If it's a file on disk, it must be a singularity image
if Path(image_name).exists():
return "singularity"

# It needs to match a docker tag pattern to be docker
if re.match(r"(?:.+\/)?([^:]+)(?::.+)?", image_name):
return "docker"

raise Exception("Unable to determine the container type of " + image_name)


def _compress_lists(df):
"""Compress lists in a DataFrame to strings.

Used to prepare a DataFrame with cells containing lists for writing to a TSV file.

Parameters
----------
df : :obj:`pandas.DataFrame`
The DataFrame to compress.

Returns
-------
:obj:`pandas.DataFrame`
The compressed DataFrame.
"""
for col in df.columns:
if isinstance(df[col].values[0], list):
df[col] = df[col].apply(lambda x: "|&|".join(x))
return df


def _expand_lists(df):
"""Expand strings in a DataFrame to lists.

Used to prepare a DataFrame with cells containing strings for querying after loading from a
TSV file.

Parameters
----------
df : :obj:`pandas.DataFrame`
The DataFrame to expand.

Returns
-------
:obj:`pandas.DataFrame`
The expanded DataFrame.
"""
for col in df.columns:
if isinstance(df[col].values[0], str):
df[col] = df[col].apply(lambda x: x.split("|&|"))
return df
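A hypothetical usage sketch for the helpers above (paths and values are invented). One caveat worth noting: `_expand_lists` splits every string column, so scalar string columns also come back as one-element lists.

import pandas as pd

from cubids.utils import _compress_lists, _expand_lists, _get_container_type

# An existing file on disk is assumed to be a Singularity image; otherwise
# anything matching the docker-tag pattern is treated as docker.
# _get_container_type("/opt/images/cubids.sif")  # -> "singularity" (if the file exists)
# _get_container_type("pennlinc/cubids:latest")  # -> "docker"

# Round-trip a list-valued column; the join assumes the cells hold strings.
df = pd.DataFrame({"ImageType": [["ORIGINAL", "NONE", "M"], ["ORIGINAL", "NONE", "P"]]})
flat = _compress_lists(df.copy())  # copy() because the helper mutates in place
print(flat["ImageType"].tolist())      # ['ORIGINAL|&|NONE|&|M', 'ORIGINAL|&|NONE|&|P']
restored = _expand_lists(flat)
print(restored["ImageType"].tolist())  # [['ORIGINAL', 'NONE', 'M'], ['ORIGINAL', 'NONE', 'P']]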