Skip to content

Commit

Permalink
feat: Add module for Build_Upset.py script
Browse files Browse the repository at this point in the history
  • Loading branch information
jvfe committed Jul 24, 2024
1 parent 5d1250e commit 1964e89
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 0 deletions.
137 changes: 137 additions & 0 deletions bin/Build_Upset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/env python

import argparse
import pandas as pd
from matplotlib import cm
from matplotlib import pyplot as plt
from upsetplot import UpSet, from_memberships
import warnings

# Suppress the UserWarning for tight_layout
warnings.simplefilter(action='ignore', category=FutureWarning)

def load_data(samplesheet_file, feature_profile_file):
"""
Load samplesheet and feature profile data from TSV files.
Parameters:
samplesheet_file (str): Path to the samplesheet file.
feature_profile_file (str): Path to the feature profile file.
Returns:
pd.DataFrame: Samplesheet dataframe.
pd.DataFrame: Feature profile dataframe.
"""
try:
samplesheet_df = pd.read_csv(samplesheet_file)
feature_df = pd.read_csv(feature_profile_file, sep='\t')
except FileNotFoundError as e:
print(f"Error: {e}")
raise

# Convert all columns in samplesheet_df to strings
samplesheet_df = samplesheet_df.astype(str)

return samplesheet_df, feature_df

def process_data(samplesheet_df, feature_df, column_name):
"""
Process the data to merge samplesheet and feature dataframes and create presence/absence matrix.
Parameters:
samplesheet_df (pd.DataFrame): Samplesheet dataframe.
feature_df (pd.DataFrame): Feature profile dataframe.
column_name (str): Column name for the feature.
Returns:
pd.DataFrame: Transposed presence/absence dataframe with groups column.
"""
merged_df = feature_df.merge(samplesheet_df, on='genome', how='left')

if column_name not in merged_df.columns:
print(f"Warning: Column '{column_name}' not found in samplesheet. Skipping.")
return None

if merged_df[column_name].nunique() <= 1:
print(f"Warning: Column '{column_name}' contains only a single unique value. Skipping.")
return None

selected_columns = ['genome', column_name] + list(feature_df.columns[1:])
feature_df = merged_df[selected_columns]
presence = feature_df.groupby(column_name).any().astype(int)
presence.drop('genome', axis=1, inplace=True, errors='ignore')
presence_transposed = presence.transpose()
presence_transposed.reset_index(inplace=True)
presence_transposed.rename(columns={'index': 'feature'}, inplace=True)
presence_transposed['category'] = presence_transposed['feature'].str.split('_', expand=True)[0]

presence_transposed['groups'] = presence_transposed.apply(
lambda row: ','.join([col for col in presence_transposed.columns[2:] if row[col] == 1]), axis=1)

presence_transposed = presence_transposed.loc[:, ['feature', 'category', 'groups']]

# print("Presence Transposed DataFrame with 'groups' column:\n", presence_transposed.head()) # Debug print to check the dataframe state

return presence_transposed

def create_upset_plot(samplesheet_file, feature_profile_file, column_names, output_file_prefix):
"""
Create an Upset plot from samplesheet and feature profile files.
Parameters:
samplesheet_file (str): Path to the samplesheet file.
feature_profile_file (str): Path to the feature profile file.
column_names (list of str): List of column names for features.
output_file_prefix (str): Prefix for the output Upset plot files.
"""
samplesheet_df, feature_df = load_data(samplesheet_file, feature_profile_file)

for column_name in column_names:
presence_transposed = process_data(samplesheet_df, feature_df, column_name)

if presence_transposed is None:
continue

if 'groups' not in presence_transposed.columns:
print(f"Error: 'groups' column was not created for {column_name}. Skipping.")
continue

# Ensure 'groups' column is processed correctly
memberships = presence_transposed['groups'].str.split(",").apply(lambda x: frozenset(x))
rows_by_group = from_memberships(memberships, data=presence_transposed.set_index('feature'))

# print("Rows by group:\n", rows_by_group.head()) # Debug print to check rows_by_group structure

# Create UpSet plot
fig, ax = plt.subplots(figsize=(24, 12)) # Increased width for better layout
upset = UpSet(rows_by_group, show_counts=True, intersection_plot_elements=0, min_subset_size=1)
upset.add_stacked_bars(by="category", colors=cm.Pastel1, title="category", elements=10)
upset.plot(fig=fig)

# Remove the black frame by turning off the top and right spines
ax.spines[['top','right','left','bottom']].set_visible(False)
ax.tick_params(left=False)
ax.tick_params(bottom=False)
ax.tick_params(labelleft=False)
ax.tick_params(labelbottom=False)

# Adjust layout to move legend outside the figure
plt.tight_layout(rect=[0.1, 0.1, 0.75, 1]) # Adjust rect to leave space on the right for legend
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

output_file = f"{output_file_prefix}_{column_name}.png"
# Overwrite the existing file if it exists
plt.savefig(output_file)
plt.close(fig)

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Create Upset plot from samplesheet and feature profile files.')
parser.add_argument('-s', '--samplesheet_file', type=str, required=True, help='Path to the samplesheet file in TSV format')
parser.add_argument('-f', '--feature_profile_file', type=str, required=True, help='Path to the feature profile file in TSV format')
parser.add_argument('-c', '--columns', type=str, required=True, help='Comma-separated list of column names for features')
parser.add_argument('-o', '--output_file_prefix', type=str, required=True, help='Prefix for the output Upset plot files (e.g., upset_plot)')
args = parser.parse_args()

column_names = args.columns.split(',')

create_upset_plot(args.samplesheet_file, args.feature_profile_file, column_names, args.output_file_prefix)
7 changes: 7 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,13 @@ process {
]
}

withName: BUILD_UPSET {
publishDir = [
path: { "${params.outdir}/annotation/upsets" },
mode: params.publish_dir_mode,
]
}

// Recombination
withName: VERTICALL_PAIRWISE {
ext.prefix = { "cluster_${cluster}" }
Expand Down
32 changes: 32 additions & 0 deletions modules/local/build_upset/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
process BUILD_UPSET {
label 'process_single'

conda "conda-forge::upsetplot=0.9.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'docker://docker.io/jvfe/arete_upset:v0.1.0':
'docker.io/jvfe/arete_upset:v0.1.0' }"

input:
path feature_profile
path samplesheet
val samplesheet_columns

output:
path "BuildUpset*png", emit: pngs

when:
task.ext.when == null || task.ext.when

script:
"""
Build_Upset.py \\
--output_file_prefix BuildUpset \\
--feature_profile_file $feature_profile \\
--samplesheet_file $samplesheet \\
--columns $samplesheet_columns
"""
stub:
"""
touch BuildUpset_dummy.png
"""
}

0 comments on commit 1964e89

Please sign in to comment.