diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 570f6c0dc..e5337fdc7 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -5,10 +5,12 @@ report_comment: > report_section_order: "nf-core-circrna-methods-description": order: -1000 - software_versions: + benchmarking: order: -1001 - "nf-core-circrna-summary": + software_versions: order: -1002 + "nf-core-circrna-summary": + order: -1003 export_plots: true diff --git a/assets/schema_input.json b/assets/schema_input.json index 7e9257a06..d79fa8076 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -27,6 +27,12 @@ "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, + "benchmarking": { + "type": "boolean", + "default": false, + "errorMessage": "Benchmarking must be a boolean value", + "meta": ["benchmarking"] + }, "strandedness": { "type": "string", "enum": ["unstranded", "forward", "reverse", "auto"], diff --git a/conf/modules.config b/conf/modules.config index b23ea157f..765827fcd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -813,6 +813,23 @@ process { ] } + withName: '.*:CIRCRNA_DISCOVERY_BENCHMARKING:ANNOTATION:REMOVE_SCORE_STRAND' { + ext.suffix = "_benchmarking.tidy.bed" + } + + withName: '.*:BENCHMARKING:SORT' { + ext.args = "-k 1,1 -k2,2n -k3,3n -u" + ext.suffix = "combined.bed" + } + + withName: '.*:BENCHMARKING:BEDTOOLS_MERGE' { + ext.args = "-s -c 6 -o distinct" + } + + withName: '.*:BENCHMARKING:BEDTOOLS_GENOMECOV' { + ext.args = "-dz" + } + withName: ADD_BACKSPLICE { ext.args = "-c fastx '{ if (\$name ~ /^circ_/) { \$seq = \$seq substr(\$seq, 1, 25) } print \">\" \$name; print \$seq }'" ext.suffix = "backspliced.fa" diff --git a/modules.json b/modules.json index c1bed4145..79e98dcc5 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + 
"bedtools/genomecov": { + "branch": "master", + "git_sha": "81b90194ce9911dbd55bba2c65c6919f6677abc4", + "installed_by": ["modules"] + }, "bedtools/getfasta": { "branch": "master", "git_sha": "cdcdd5e3d806f0ff3983c40c69e0b07bb44ec299", @@ -20,6 +25,16 @@ "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", "installed_by": ["modules"] }, + "bedtools/jaccard": { + "branch": "master", + "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", + "installed_by": ["modules"] + }, + "bedtools/merge": { + "branch": "master", + "git_sha": "a5377837fe9013bde89de8689829e83e84086536", + "installed_by": ["modules"] + }, "bedtools/sort": { "branch": "master", "git_sha": "571a5feac4c9ce0a8df0bc15b94230e7f3e8db47", @@ -201,6 +216,11 @@ "git_sha": "b1b959609bda44341120aed1766329909f54b8d0", "installed_by": ["modules"] }, + "subread/featurecounts": { + "branch": "master", + "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358", + "installed_by": ["modules"] + }, "trimgalore": { "branch": "master", "git_sha": "a98418419ae6c9df3cf6cf108d1e1aba71037d5a", diff --git a/modules/local/benchmarking/average_tsv/main.nf b/modules/local/benchmarking/average_tsv/main.nf new file mode 100644 index 000000000..79a0b5c13 --- /dev/null +++ b/modules/local/benchmarking/average_tsv/main.nf @@ -0,0 +1,19 @@ +process AVERAGE_TSV { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + + input: + path(tsv) + + output: + path("*.averaged.tsv"), emit: tsv + path("versions.yml"), emit: versions + + script: + template "average.py" +} diff --git a/modules/local/benchmarking/average_tsv/templates/average.py b/modules/local/benchmarking/average_tsv/templates/average.py new file mode 100644 index 000000000..54f27a37c --- /dev/null +++ b/modules/local/benchmarking/average_tsv/templates/average.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +import pandas as pd +import platform + +tsv = "$tsv" +data = pd.read_csv(tsv, sep='\\t') + +data['tool'] = data['tool'].str.replace('tool:', '') +average_values = data.groupby('tool')['pearson_corr'].mean().reset_index() + +output_file_path = tsv + ".averaged.tsv" +average_values.to_csv(output_file_path, sep='\\t', index=False) + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML.
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + diff --git a/modules/local/benchmarking/decompress_reads/main.nf b/modules/local/benchmarking/decompress_reads/main.nf new file mode 100644 index 000000000..6f88e3bf1 --- /dev/null +++ b/modules/local/benchmarking/decompress_reads/main.nf @@ -0,0 +1,20 @@ + + +process DECOMPRESS_READS { + input: + tuple val(meta), path(fastq_gz_1), path(fastq_gz_2) + + output: + tuple val(meta), path("*.fq") + + script: + """ + # Define output names by removing .gz and adding _decompressed.fq + output_name1=\$(basename ${fastq_gz_1} .gz)_decompressed.fq + output_name2=\$(basename ${fastq_gz_2} .gz)_decompressed.fq + + # Decompress the files + gunzip -c ${fastq_gz_1} > \$output_name1 + gunzip -c ${fastq_gz_2} > \$output_name2 + """ +} diff --git a/modules/local/benchmarking/location_plots/main.nf b/modules/local/benchmarking/location_plots/main.nf new file mode 100644 index 000000000..4b4747bf5 --- /dev/null +++ b/modules/local/benchmarking/location_plots/main.nf @@ -0,0 +1,17 @@ +process LOCATION_PLOT { + tag "$meta.id" + label "process_single" + + conda "bioconda::seaborn=0.11.2" + container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75' + + input: + tuple val(meta), path(bedfile1), path(bedfile2) + + output: + path("*_mqc.png"), emit: plots + path("versions.yml"), emit: versions + + script: + template "create_plots.py" +} diff --git a/modules/local/benchmarking/location_plots/templates/create_plots.py b/modules/local/benchmarking/location_plots/templates/create_plots.py new file mode 100755 index 000000000..1479bc844 --- 
/dev/null +++ b/modules/local/benchmarking/location_plots/templates/create_plots.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +from matplotlib.lines import Line2D +import platform + +input_bed_file_1 = '$bedfile1' +input_bed_file_2 = '$bedfile2' + +# Read input files +def read_bed_file(file_path, label): + data = {'chromosome': [], 'start': [], 'strand': [], 'file_label': []} + with open(file_path, 'r') as file: + reader = csv.reader(file, delimiter='\\t') + for row in reader: + data['chromosome'].append(row[0]) + data['start'].append(int(row[1])) + data['strand'].append(row[3]) + data['file_label'].append(label.lower()) + return data + +def combine_data(data1, data2): + for key in data1: + data1[key].extend(data2[key]) + return data1 + +data1 = read_bed_file(input_bed_file_1, 'total') +data2 = read_bed_file(input_bed_file_2, 'polya') + +# Combine the two datasets +combined_data = combine_data(data1, data2) + +# Create a DataFrame +df = pd.DataFrame({ + 'Chromosome': combined_data['chromosome'], + 'Start Location': combined_data['start'], + 'Strand': combined_data['strand'], + 'File Label': combined_data['file_label'] +}) + +# Sort DataFrame to ensure consistent plotting order +df.sort_values(by=['File Label', 'Strand'], inplace=True) + +# Plotting +fig, ax = plt.subplots(figsize=(12, 6)) + +palette = { + "total +": "red", + "total -": "lightcoral", + "polya +": "blue", + "polya -": "lightblue" +} + +# Draw violins +for file_label in df['File Label'].unique(): + sns.violinplot( + x="Chromosome", + y="Start Location", + hue="Strand", + data=df[df['File Label'] == file_label], + palette={"+" : palette[f"{file_label} +"], "-" : palette[f"{file_label} -"]}, + split=True, + ax=ax, + scale="count", + scale_hue=False, + saturation=0.75, + inner=None + ) + +# Set transparency for all violins +for violin in ax.collections: + violin.set_alpha(0.25) + +# Legend +custom_lines = [ + 
Line2D([0], [0], color=palette[f"{file} {strand}"], lw=4, alpha=0.25) + for file in df['File Label'].unique() + for strand in ["+", "-"] +] +ax.legend( + custom_lines, + [f"{file} : {strand}" for file in df['File Label'].unique() for strand in ["+", "-"]], + title="File : Strand" +) + +plt.title('Start Locations of circRNA by Chromosome and Strand') + +plot_file_name = f"{input_bed_file_1.replace('.bed','')}_{input_bed_file_2.replace('.bed','')}_mqc.png" + +# Save the plot +plt.savefig(plot_file_name, bbox_inches='tight') + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__, + "seaborn": sns.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/multiqc/main.nf b/modules/local/benchmarking/multiqc/main.nf new file mode 100644 index 000000000..a39675d30 --- /dev/null +++ b/modules/local/benchmarking/multiqc/main.nf @@ -0,0 +1,18 @@ +process BENCHMARKING_MULTIQC { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + path(jaccard) + + output: + path("*_mqc.json") , emit: report + path("versions.yml"), emit: versions + + script: + template "benchmarking.py" +} diff --git a/modules/local/benchmarking/multiqc/templates/benchmarking.py b/modules/local/benchmarking/multiqc/templates/benchmarking.py new file mode 100644 index 000000000..1515e8198 --- /dev/null +++ b/modules/local/benchmarking/multiqc/templates/benchmarking.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +import pandas as pd +import json +import platform + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +df = pd.read_csv("$jaccard", sep='\\t', index_col=0) + +for metric in df.columns: + data = { + "id": f"benchmarking_{metric}", + "parent_id": "benchmarking", + "parent_name": "Benchmarking", + "parent_description": "Benchmarking of the tools", + "section_name": metric.capitalize(), + "description": f"{metric.capitalize()} values of the tools", + "plot_type": "bargraph", + "data": df[[metric]].T.to_dict() + } + + with open(f"benchmarking_{metric}_mqc.json", "w") as f: + json.dump(data, f, indent=4) + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/overlap_plot/main.nf b/modules/local/benchmarking/overlap_plot/main.nf new file mode 100644 index 000000000..161b62036 --- /dev/null +++ 
b/modules/local/benchmarking/overlap_plot/main.nf @@ -0,0 +1,17 @@ +process OVERLAP_PLOT { + tag "$meta.id" + label "process_single" + + conda "bioconda::seaborn=0.11.2" + container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75' + + input: + tuple val(meta), path(bed) + + output: + path("*_mqc.png") , emit: plots + path("versions.yml"), emit: versions + + script: + template "create_plots.py" +} diff --git a/modules/local/benchmarking/overlap_plot/templates/create_plots.py b/modules/local/benchmarking/overlap_plot/templates/create_plots.py new file mode 100644 index 000000000..fcbae5eaf --- /dev/null +++ b/modules/local/benchmarking/overlap_plot/templates/create_plots.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import platform + +input_bed_file = '$bed' + +# Read input file +def read_bed_file(file_path): + data = {'chromosome': [], 'start': [], 'strand': []} + with open(file_path, 'r') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + data['chromosome'].append(row[0]) + data['start'].append(int(row[1])) + data['strand'].append(row[3]) + return data + +# Read the data +data = read_bed_file(input_bed_file) + +# Create a DataFrame +df = pd.DataFrame({ + 'Chromosome': data['chromosome'], + 'Start Location': data['start'], + 'Strand': data['strand'] +}) + +# Plotting +plt.figure(figsize=(12, 6)) +sns.violinplot(x='Chromosome', y='Start Location', hue='Strand', data=df, split=True, inner="quartile") +plt.title('Start Locations of circRNA by Chromosome and Strand') +plt.legend(title="Strand") +plot_file_name = f"{input_bed_file.replace('.bed','')}_overlaps_mqc.png" + +plt.savefig(plot_file_name, bbox_inches='tight') + + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. 
+ indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__, + "seaborn": sns.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/plot_polyatails/main.nf b/modules/local/benchmarking/plot_polyatails/main.nf new file mode 100644 index 000000000..26f173f62 --- /dev/null +++ b/modules/local/benchmarking/plot_polyatails/main.nf @@ -0,0 +1,14 @@ +process PLOT_POLYATAILS { + label "process_single" + + conda "bioconda::seaborn=0.11.2" + container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75' + + input: + path(real) + path(benchmarking) + output: + path("*.tsv") , emit: average_tails + script: + template "create_plots.py" +} \ No newline at end of file diff --git a/modules/local/benchmarking/plot_polyatails/templates/create_plots.py b/modules/local/benchmarking/plot_polyatails/templates/create_plots.py new file mode 100644 index 000000000..dcf2afc63 --- /dev/null +++ b/modules/local/benchmarking/plot_polyatails/templates/create_plots.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import platform + +# Function to calculate the average nA value for a given file +def calculate_average_na(file_path): + # Read the TSV file into a pandas DataFrame + df = pd.read_csv(file_path, sep='\t') # Use tab as the separator + + # Calculate the mean of the nA column + avg_na = df['nA'].mean() + + # Extract the sample name from the 'sample' column (since all rows have the same sample) + sample_name = 
df['sample'].iloc[0] + + return avg_na, sample_name + +def main(file_list, output_file): + # Initialize lists to store the averages and corresponding samples + averages = [] + samples = [] + + # Loop through each file in the list + for file_path in file_list: + avg_na, sample_name = calculate_average_na(file_path) + averages.append(avg_na) + samples.append(sample_name) + + # Write the results to a TSV file + with open(output_file, 'w', newline='') as tsvfile: + writer = csv.writer(tsvfile, delimiter='\t') # Tab-separated output + writer.writerow(['sample', 'avg']) # Write header + for sample, avg in zip(samples, averages): + writer.writerow([sample, avg]) + +if __name__ == "__main__": + file_list_r = '$real'.split(' ') + output_file_r = 'real.tsv' + main(file_list_r, output_file_r) + file_list_b = '$benchmarking'.split(' ') + output_file_b = 'benchmarking.tsv' + main(file_list_b, output_file_b) + + \ No newline at end of file diff --git a/modules/local/benchmarking/png_json/main.nf b/modules/local/benchmarking/png_json/main.nf new file mode 100644 index 000000000..e6205bfc1 --- /dev/null +++ b/modules/local/benchmarking/png_json/main.nf @@ -0,0 +1,20 @@ +process PNG_JSON { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + path(png) + val(title) + val(description) + + output: + path("*_mqc.json") , emit: report + path("versions.yml"), emit: versions + + script: + template "json.py" +} diff --git a/modules/local/benchmarking/png_json/templates/json.py b/modules/local/benchmarking/png_json/templates/json.py new file mode 100644 index 000000000..639cfab2f --- /dev/null +++ b/modules/local/benchmarking/png_json/templates/json.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +import json +import os +import base64 +import platform + +png = "$png" +title = "$title" +description = "$description" + +absolute_path = os.path.abspath(png) +filename = os.path.basename(absolute_path) +name = filename.removesuffix("_mqc.png") +id = title.lower().replace(" ", "_") + +# Convert the PNG file to a base64 string +with open(absolute_path, "rb") as image_file: + base64_string = base64.b64encode(image_file.read()).decode('utf-8') + +# Create the data structure for JSON +data = { + "id": f"benchmarking_{name}", + "parent_id": id, + "parent_name": title, + "parent_description": description, + "section_name": name, + "plot_type": "image", + "data": f'
' +} + +# Write the JSON data to a file +with open(f"{name}_mqc.json", "w") as f: + json.dump(data, f, indent=4) + +print(f"JSON file created for {absolute_path}") + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version() + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/polyatailor/main.nf b/modules/local/benchmarking/polyatailor/main.nf new file mode 100644 index 000000000..3427728d8 --- /dev/null +++ b/modules/local/benchmarking/polyatailor/main.nf @@ -0,0 +1,20 @@ +process POLYATAILOR { + tag "$meta.id" + label "process_high" + + container 'docker://faboehm/polyatailor-env' + + input: + tuple val(meta), + path(fastq) + path(bam) + + output: + path("*.tsv"), emit: tails + path("versions.yml"), emit: versions + + script: + template'polyatailor.R' + +} + diff --git a/modules/local/benchmarking/polyatailor/templates/polyatailor.R b/modules/local/benchmarking/polyatailor/templates/polyatailor.R new file mode 100644 index 000000000..fc8e3d6fb --- /dev/null +++ b/modules/local/benchmarking/polyatailor/templates/polyatailor.R @@ -0,0 +1,73 @@ +#!/usr/bin/env Rscript +library(PolyAtailor) +library(Biostrings) +library(yaml) # To write the YAML file +library(Rsamtools) # Ensure Rsamtools is loaded for scanBam +library(GenomicAlignments) +library(dplyr) +conflict_prefer("filter", "dplyr") +conflict_prefer("rename", "dplyr") +conflict_prefer("strsplit", "base") + 
+fastq_string <- '$fastq' +fastq_files <- unlist(strsplit(fastq_string, " ")) +sample_names <- gsub(".fq.gz\$", "", fastq_files) + +result_1 <- tailScan( + fastq = fastq_files[1], + mcans = 3, + findUmi = FALSE, + lumi = 0, + adapterSeq = "", + anchorSeq = "", + resultpath = "./", + samplename = sample_names[1], + tailAnchorLen = 8, + minTailLen = 5, + realTailLen = 15, + maxNtail = 2, + mapping = FALSE, + mapinfo = NULL, + findTailType = 'A' +) + +write.table(result_1, file = paste0(sample_names[1], ".tsv"), sep = "\t", row.names = FALSE, col.names = TRUE) + +result_2 <- tailScan( + fastq = fastq_files[2], + mcans = 3, + findUmi = FALSE, + lumi = 0, + adapterSeq = "", + anchorSeq = "", + resultpath = "./", + samplename = sample_names[2], + tailAnchorLen = 8, + minTailLen = 5, + realTailLen = 15, + maxNtail = 2, + mapping = FALSE, + mapinfo = NULL, + findTailType = 'A' +) +write.table(result_2, file = paste0(sample_names[2], ".tsv"), sep = "\t", row.names = FALSE, col.names = TRUE) + +result_tailMap <- tailMap( + bamfile = '$bam', + mcans = 3, + minTailLen = 5, + findUmi = FALSE, + maxNtail = 2, + mapping = TRUE, + longRead = FALSE +) +write.csv(result_tailMap, "map.csv", row.names = FALSE) + + + +version_info <- list( + PolyAtailor_version = as.character(packageVersion("PolyAtailor")), + Biostrings_version = as.character(packageVersion("Biostrings")), + run_time = Sys.time() +) +write_yaml(version_info, "versions.yml") diff --git a/modules/local/benchmarking/rRNA_correlation/main.nf b/modules/local/benchmarking/rRNA_correlation/main.nf new file mode 100644 index 000000000..221d779cb --- /dev/null +++ b/modules/local/benchmarking/rRNA_correlation/main.nf @@ -0,0 +1,17 @@ +process RRNA_CORRELATION { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + val bed_real_list // Collected list of tuples for real BEDs + val bed_bench_list // Collected list of tuples for benchmarking BEDs + val rRNA_real_list // Collected list of tuples for real rRNA summaries + val rRNA_bench_list // Collected list of tuples for benchmarking rRNA summaries + + script: + template "rRNA_corr.py" +} diff --git a/modules/local/benchmarking/rRNA_correlation/templates/rRNA_corr.py b/modules/local/benchmarking/rRNA_correlation/templates/rRNA_corr.py new file mode 100644 index 000000000..fc0c46bd2 --- /dev/null +++ b/modules/local/benchmarking/rRNA_correlation/templates/rRNA_corr.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +import numpy as np +from collections import defaultdict + +def process_meta_and_paths(input_string): + # 1. Strip the first and last character + stripped_input = input_string.replace("[[", "[").replace("]]", "]") + + # 2. Split by "[", resulting in a list + split_data = stripped_input.split("[") + + # 3. For every list member, strip all occurrences of "]" + cleaned_data = [item.replace("]", "").strip() for item in split_data if item.strip()] + + # 4. 
For every list member, split by "," + parsed_data = [item.split(",") for item in cleaned_data] + + parsed_data = [sublist[:-1] if sublist[-1] == '' else sublist for sublist in parsed_data] + + + return parsed_data + + +def calculate_favcount_summary_ratio(file_path): + with open(file_path.strip(), 'r') as file: + lines = file.readlines() + assigned = int(lines[1].split()[1]) + unassigned_multimapping = int(lines[9].split()[1]) + others_sum = sum(int(lines[i].split()[1]) for i in list(range(2, 9)) + list(range(10, 15))) + ratio = (assigned + unassigned_multimapping) + #others_sum if others_sum != 0 else 1 + return ratio + + + +def getpairs (beds, favcounts): + pairs = [] + for bed in beds: + fav = None + for f in favcounts: + if f[0] == bed[0]: + fav = f + break + pairs.append([bed[0],bed[4],bed[-1],fav[-1]]) + return pairs + +def get_totals(pairs): + for i in range(len(pairs)): + bed = pairs[i][2].strip(" ") + fav = pairs[i][3] + with open(bed, 'r') as file: + bed_value = sum(1 for line in file) + pairs[i][2] = bed_value + pairs[i][3] = calculate_favcount_summary_ratio(fav) + return pairs + + +def sort_tools(value_pairs): + grouped_data = defaultdict(list) + for entry in value_pairs: + grouped_data[entry[1]].append(entry) + return dict(grouped_data) + + +def compute_correlation(value_pairs): + results = defaultdict(float) + for tool in value_pairs.keys(): + bed_list, fav_list = zip(*[sublist[2:] for sublist in value_pairs[tool]]) + correlation = np.corrcoef(bed_list, fav_list)[0,1] + results[tool] = correlation + return dict(results) + + + + +# Input data from Nextflow variables +real_bed = "$bed_real_list" +bench_bed = "$bed_bench_list" +real_rRNA = "$rRNA_real_list" +bench_rRNA = "$rRNA_bench_list" + +real_bed = process_meta_and_paths(real_bed) +bench_bed = process_meta_and_paths(bench_bed) +real_rRNA = process_meta_and_paths(real_rRNA) +bench_rRNA = process_meta_and_paths(bench_rRNA) + +real_pairs = getpairs(real_bed,real_rRNA) +bench_pairs = 
getpairs(bench_bed,bench_rRNA) + + +real_value_pairs = get_totals(real_pairs) +bench_value_pairs = get_totals(bench_pairs) + +real_value_pairs = sort_tools(real_value_pairs) +bench_value_pairs = sort_tools(bench_value_pairs) + +real_corr = compute_correlation(real_value_pairs) +bench_corr = compute_correlation(bench_value_pairs) + +with open("real_corr.txt", "w") as file: + for key, value in real_corr.items(): + # Remove leading space in the key if required + line = f"{key.strip()}:\\t{value}\\n" + file.write(line) + +with open("bench_corr.txt", "w") as file: + for key, value in bench_corr.items(): + # Remove leading space in the key if required + line = f"{key.strip()}:\\t{value}\\n" + file.write(line) + +#TODO: both relative or both total. both? +#TODO: both correllations the same can't be!!! + diff --git a/modules/local/benchmarking/seq_depth_plot/main.nf b/modules/local/benchmarking/seq_depth_plot/main.nf new file mode 100644 index 000000000..cdb5c1ed6 --- /dev/null +++ b/modules/local/benchmarking/seq_depth_plot/main.nf @@ -0,0 +1,17 @@ +process SEQ_DEPTH_CORRELLATION { + tag "$meta.id" + label "process_single" + + conda "bioconda::matplotlib=3.5.1 bioconda::seaborn=0.11.2" + container 'https://depot.galaxyproject.org/singularity/bionumpy:0.2.12--pyha8f3691_0' + + input: + tuple val(meta), + path(bed) + path(depth) + output: + path("*.tsv") , emit: report + path("versions.yml"), emit: versions + script: + template "correlation.py" +} diff --git a/modules/local/benchmarking/seq_depth_plot/templates/correlation.py b/modules/local/benchmarking/seq_depth_plot/templates/correlation.py new file mode 100644 index 000000000..78daf86d2 --- /dev/null +++ b/modules/local/benchmarking/seq_depth_plot/templates/correlation.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +import os +import re +import platform +import numpy as np + +meta = "$meta" +depth = "$depth" +bed = "$bed" + +def calculate_correlation(bed_file_path, depth_file_path): + # Read the BED file and store circRNA 
regions + circRNA_regions = [] + with open(bed_file_path, 'r') as bed_file: + for line in bed_file: + if line.startswith('#'): continue + parts = line.strip().split() + chr = parts[0] + start = int(parts[1]) + end = int(parts[2]) + circRNA_regions.append((chr, start, end)) + + # Read the sequence depth file + circRNA_presence = [] + sequencing_depth = [] + + with open(depth_file_path, 'r') as depth_file: + bed_idx = 0 + bed_len = len(circRNA_regions) + + for line in depth_file: + parts = line.strip().split() + chr = parts[0] + pos = int(parts[1]) + depth = int(parts[2]) + + # Move bed_idx to the correct region + while bed_idx < bed_len and (circRNA_regions[bed_idx][0] < chr or (circRNA_regions[bed_idx][0] == chr and circRNA_regions[bed_idx][2] < pos)): + bed_idx += 1 + + # Check if the current position is within a circRNA region + if bed_idx < bed_len and circRNA_regions[bed_idx][0] == chr and circRNA_regions[bed_idx][1] <= pos <= circRNA_regions[bed_idx][2]: + circRNA_presence.append(1) + else: + circRNA_presence.append(0) + + sequencing_depth.append(depth) + + # Calculate the correlation coefficient + correlation = np.corrcoef(sequencing_depth, circRNA_presence)[0, 1] + + return correlation + +id = re.search(r'id:(\\w+)', meta).group(1) + +with open(depth, 'r') as paths: + for path in paths: + path = path.strip() + basename = os.path.splitext(os.path.basename(path))[0] + + if basename == id: + # Calculate correlation + corr = calculate_correlation(bed, path) + # Write the correlation to stats.txt + with open('corr_mqc.tsv', 'w') as outfile: + header = "tool\\tpearson_corr\\n" + outfile.write(header + meta.split(",")[4][:-1] + '\\t' + str(corr)) + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "numpy": np.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/nf-core/bedtools/genomecov/environment.yml b/modules/nf-core/bedtools/genomecov/environment.yml new file mode 100644 index 000000000..8fbe20c31 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_genomecov +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/genomecov/main.nf b/modules/nf-core/bedtools/genomecov/main.nf new file mode 100644 index 000000000..954e8913d --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/main.nf @@ -0,0 +1,74 @@ +process BEDTOOLS_GENOMECOV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(intervals), val(scale) + path sizes + val extension + val sort + + output: + tuple val(meta), path("*.${extension}"), emit: genomecov + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + args += (scale > 0 && scale != 1) ? " -scale $scale" : "" + if (!args_list.contains('-bg') && (scale > 0 && scale != 1)) { + args += " -bg" + } + def sort_cmd = sort ? 
'| bedtools sort' : '' + + def prefix = task.ext.prefix ?: "${meta.id}" + if (intervals.name =~ /\.bam/) { + """ + bedtools \\ + genomecov \\ + -ibam $intervals \\ + $args \\ + $sort_cmd \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + } else { + """ + bedtools \\ + genomecov \\ + -i $intervals \\ + -g $sizes \\ + $args \\ + $sort_cmd \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/genomecov/meta.yml b/modules/nf-core/bedtools/genomecov/meta.yml new file mode 100644 index 000000000..2b2385e3e --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/meta.yml @@ -0,0 +1,59 @@ +name: bedtools_genomecov +description: Computes histograms (default), per-base reports (-d) and BEDGRAPH (-bg) summaries of feature coverage (e.g., aligned sequences) for a given genome. +keywords: + - bed + - bam + - genomecov + - bedtools + - histogram +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - scale: + type: integer + description: Number containing the scale factor for the output. Set to 1 to disable. 
Setting to a value other than 1 will also get the -bg bedgraph output format as this is required for this command switch + - sizes: + type: file + description: Tab-delimited table of chromosome names in the first column and chromosome sizes in the second column + - extension: + type: string + description: Extension of the output file (e. g., ".bg", ".bedgraph", ".txt", ".tab", etc.) It is set arbitrarily by the user and corresponds to the file format which depends on arguments. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - genomecov: + type: file + description: Computed genome coverage file + pattern: "*.${extension}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" + - "@chris-cheshire" +maintainers: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" + - "@chris-cheshire" diff --git a/modules/nf-core/bedtools/genomecov/tests/main.nf.test b/modules/nf-core/bedtools/genomecov/tests/main.nf.test new file mode 100644 index 000000000..8213cff95 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/main.nf.test @@ -0,0 +1,122 @@ +nextflow_process { + name "Test Process BEDTOOLS_GENOMECOV" + script "../main.nf" + process "BEDTOOLS_GENOMECOV" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "bedtools" + tag "bedtools/genomecov" + + test("sarscov2 - no scale") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true), + 1 + ] + // sizes + input[1] = [] + // extension + input[2] = "txt" + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("no_scale") } + ) + } + + } + + test("sarscov2 - dummy 
sizes") { + when { + process { + """ + input[0] = [ + [ id:'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true), + 0.5 + ] + // sizes + input[1] = file('dummy_chromosome_sizes') + // extension + input[2] = 'txt' + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("dummy_sizes") } + ) + } + + } + + test("sarscov2 - scale") { + when { + process { + """ + input[0] = [ + [ id:'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/bed/baits.bed", checkIfExists: true), + 0.5 + ] + // sizes + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.sizes", checkIfExists: true) + // extension + input[2] = 'txt' + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("scale") } + ) + } + + } + + test("stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true), + 1 + ] + // sizes + input[1] = [] + // extension + input[2] = 'txt' + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.genomecov[0][1]).name).match("stub") } + ) + } + + } + +} diff --git a/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap b/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap new file mode 100644 index 000000000..8f9191e4c --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "dummy_sizes": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,01291b6e1beab72e046653e709eb0e10" + ] + ], + "1": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ], + "genomecov": [ + [ + { + "id": "test" + }, + 
"test.coverage.txt:md5,01291b6e1beab72e046653e709eb0e10" + ] + ], + "versions": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ] + } + ], + "timestamp": "2023-12-05T17:35:58.35232" + }, + "no_scale": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,66083198daca6c001d328ba9616e9b53" + ] + ], + "1": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ], + "genomecov": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,66083198daca6c001d328ba9616e9b53" + ] + ], + "versions": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ] + } + ], + "timestamp": "2023-12-05T17:35:51.142496" + }, + "stub": { + "content": [ + "test.coverage.txt" + ], + "timestamp": "2023-12-05T17:36:13.084709" + }, + "scale": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,de3c59c0ea123bcdbbad27bc0a0a601e" + ] + ], + "1": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ], + "genomecov": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,de3c59c0ea123bcdbbad27bc0a0a601e" + ] + ], + "versions": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ] + } + ], + "timestamp": "2023-12-05T17:36:05.962006" + } +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/genomecov/tests/nextflow.config b/modules/nf-core/bedtools/genomecov/tests/nextflow.config new file mode 100644 index 000000000..bdb74ae5a --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: BEDTOOLS_GENOMECOV { + ext.prefix = { "${meta.id}.coverage" } + } + +} diff --git a/modules/nf-core/bedtools/genomecov/tests/tags.yml b/modules/nf-core/bedtools/genomecov/tests/tags.yml new file mode 100644 index 000000000..55fce4780 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/tags.yml @@ -0,0 +1,2 @@ +bedtools/genomecov: + - "modules/nf-core/bedtools/genomecov/**" diff --git a/modules/nf-core/bedtools/jaccard/environment.yml 
b/modules/nf-core/bedtools/jaccard/environment.yml new file mode 100644 index 000000000..f0f2a38fc --- /dev/null +++ b/modules/nf-core/bedtools/jaccard/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_jaccard +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/jaccard/main.nf b/modules/nf-core/bedtools/jaccard/main.nf new file mode 100644 index 000000000..32a41210d --- /dev/null +++ b/modules/nf-core/bedtools/jaccard/main.nf @@ -0,0 +1,49 @@ +process BEDTOOLS_JACCARD { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(input_a), path(input_b) + tuple val(meta2), path(genome_file) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def genome = genome_file ? 
"-g ${genome_file}" : "" + """ + bedtools jaccard \\ + -a ${input_a} \\ + -b ${input_b} \\ + ${genome} \\ + ${args} \\ + > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/jaccard/meta.yml b/modules/nf-core/bedtools/jaccard/meta.yml new file mode 100644 index 000000000..2b5422841 --- /dev/null +++ b/modules/nf-core/bedtools/jaccard/meta.yml @@ -0,0 +1,57 @@ +name: "bedtools_jaccard" +description: Calculate Jaccard statistic b/w two feature files. +keywords: + - vcf + - gff + - bed + - jaccard + - intersection + - union + - statistics +tools: + - "bedtools": + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_a: + type: file + description: VCF,GFF or BED file to use with the `-a` option + pattern: "*.{vcf,vcf.gz,bed,bed.gz,gff}" + - input_b: + type: file + description: VCF,GFF or BED file to use with the `-b` option + pattern: "*.{vcf,vcf.gz,bed,bed.gz,gff}" + - meta2: + type: map + description: | + Groovy Map containing genome file information + e.g. [ id:'test', single_end:false ] + - genome_file: + type: file + description: A file containing all the contigs of the genome used to create the input files + pattern: "*.{txt,sizes,fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tsv: + type: file + description: TSV file containing the results + pattern: "*.tsv" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/bedtools/merge/environment.yml b/modules/nf-core/bedtools/merge/environment.yml new file mode 100644 index 000000000..99707878b --- /dev/null +++ b/modules/nf-core/bedtools/merge/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/merge/main.nf b/modules/nf-core/bedtools/merge/main.nf new file mode 100644 index 000000000..5310647de --- /dev/null +++ b/modules/nf-core/bedtools/merge/main.nf @@ -0,0 +1,47 @@ +process BEDTOOLS_MERGE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(bed) + + output: + tuple val(meta), path('*.bed'), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bed" == "${prefix}.bed") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + bedtools \\ + merge \\ + -i $bed \\ + $args \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/merge/meta.yml b/modules/nf-core/bedtools/merge/meta.yml new file mode 100644 index 000000000..d7463e3db --- /dev/null +++ b/modules/nf-core/bedtools/merge/meta.yml @@ -0,0 +1,45 @@ +name: bedtools_merge +description: combines overlapping or “book-ended” features in an interval file into a single feature which spans all of the combined features. +keywords: + - bed + - merge + - bedtools + - overlapped bed +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/merge.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Input BED file + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bed: + type: file + description: Overlapped bed file with combined features + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" +maintainers: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" diff --git a/modules/nf-core/bedtools/merge/tests/main.nf.test b/modules/nf-core/bedtools/merge/tests/main.nf.test new file mode 100644 index 000000000..95dba8e54 --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/main.nf.test @@ -0,0 +1,34 @@ +nextflow_process { + + name "Test Process BEDTOOLS_MERGE" + script "../main.nf" + config "./nextflow.config" + process "BEDTOOLS_MERGE" + + tag "modules" + tag "modules_nfcore" + tag "bedtools" + tag "bedtools/merge" + + test("test_bedtools_merge") { + + when { + process { + """ + input[0] = [ [ id:'test'], + file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/merge/tests/main.nf.test.snap b/modules/nf-core/bedtools/merge/tests/main.nf.test.snap new file mode 100644 index 000000000..ee6c4e634 --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "test_bedtools_merge": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_out.bed:md5,0cf6ed2b6f470cd44a247da74ca4fe4e" + ] + ], + "1": [ + "versions.yml:md5,2d134badb4cd1e4e903696c7967f28d6" + ], + "bed": [ + [ + { + "id": "test" + }, + "test_out.bed:md5,0cf6ed2b6f470cd44a247da74ca4fe4e" + ] + ], + "versions": [ + "versions.yml:md5,2d134badb4cd1e4e903696c7967f28d6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T17:07:09.721153" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/bedtools/merge/tests/nextflow.config b/modules/nf-core/bedtools/merge/tests/nextflow.config new file mode 100644 index 000000000..16444e982 --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: BEDTOOLS_MERGE { + ext.prefix = { "${meta.id}_out" } + } + +} diff --git a/modules/nf-core/bedtools/merge/tests/tags.yml b/modules/nf-core/bedtools/merge/tests/tags.yml new file mode 100644 index 000000000..60c8cad1c --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/tags.yml @@ -0,0 +1,2 @@ +bedtools/merge: + - "modules/nf-core/bedtools/merge/**" diff --git a/modules/nf-core/subread/featurecounts/environment.yml b/modules/nf-core/subread/featurecounts/environment.yml new file mode 100644 index 000000000..fdb575020 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::subread=2.0.6 diff --git a/modules/nf-core/subread/featurecounts/main.nf b/modules/nf-core/subread/featurecounts/main.nf new file mode 100644 index 000000000..c0c9de4d9 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/main.nf @@ -0,0 +1,57 @@ +process SUBREAD_FEATURECOUNTS { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container 'oras://community.wave.seqera.io/library/subread:2.0.6--2dd2dd526de026fd' + + input: + tuple val(meta), path(bams), path(annotation) + + output: + tuple val(meta), path("*featureCounts.txt") , emit: counts + tuple val(meta), path("*featureCounts.txt.summary"), emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired_end = meta.single_end ? 
'' : '-p' + + def strandedness = 0 + if (meta.strandedness == 'forward') { + strandedness = 1 + } else if (meta.strandedness == 'reverse') { + strandedness = 2 + } + """ + featureCounts \\ + $args \\ + $paired_end \\ + -T $task.cpus \\ + -a $annotation \\ + -s $strandedness \\ + -o ${prefix}.featureCounts.txt \\ + ${bams.join(' ')} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + subread: \$( echo \$(featureCounts -v 2>&1) | sed -e "s/featureCounts v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.featureCounts.txt + touch ${prefix}.featureCounts.txt.summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + subread: \$( echo \$(featureCounts -v 2>&1) | sed -e "s/featureCounts v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/subread/featurecounts/meta.yml b/modules/nf-core/subread/featurecounts/meta.yml new file mode 100644 index 000000000..a9b045b7d --- /dev/null +++ b/modules/nf-core/subread/featurecounts/meta.yml @@ -0,0 +1,62 @@ +name: subread_featurecounts +description: Count reads that map to genomic features +keywords: + - counts + - fasta + - genome + - reference +tools: + - featurecounts: + description: featureCounts is a highly efficient general-purpose read summarization + program that counts mapped reads for genomic features such as genes, exons, + promoter, gene bodies, genomic bins and chromosomal locations. It can be used + to count both RNA-seq and genomic DNA-seq reads. + homepage: http://bioinf.wehi.edu.au/featureCounts/ + documentation: http://bioinf.wehi.edu.au/subread-package/SubreadUsersGuide.pdf + doi: "10.1093/bioinformatics/btt656" + licence: ["GPL v3"] + identifier: biotools:subread +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bams: + type: file + description: BAM files containing mapped reads + pattern: "*.bam" + - annotation: + type: file + description: Genomic features annotation in GTF or SAF + pattern: "*.{gtf,saf}" +output: + - counts: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*featureCounts.txt": + type: file + description: Counts of reads mapping to features + pattern: "*featureCounts.txt" + - summary: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*featureCounts.txt.summary": + type: file + description: Summary log file + pattern: "*.featureCounts.txt.summary" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ntoda03" +maintainers: + - "@ntoda03" diff --git a/modules/nf-core/subread/featurecounts/tests/main.nf.test b/modules/nf-core/subread/featurecounts/tests/main.nf.test new file mode 100644 index 000000000..3b95da33f --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/main.nf.test @@ -0,0 +1,155 @@ +nextflow_process { + + name "Test Process SUBREAD_FEATURECOUNTS" + script "../main.nf" + process "SUBREAD_FEATURECOUNTS" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "subread" + tag "subread/featurecounts" + + test("sarscov2 [bam] - forward") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'forward' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts).match("forward_counts") }, + { assert 
snapshot(process.out.summary).match("forward_summary") }, + { assert snapshot(process.out.versions).match("forward_versions") } + ) + } + } + + test("sarscov2 [bam] - forward - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'forward' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 [bam] - reverse") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'reverse' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts).match("reverse_counts") }, + { assert snapshot(process.out.summary).match("reverse_summary") }, + { assert snapshot(process.out.versions).match("reverse_versions") } + ) + } + } + + test("sarscov2 [bam] - reverse - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'reverse' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 [bam] - unstranded") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, 
strandedness:'unstranded' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts).match("unstranded_counts") }, + { assert snapshot(process.out.summary).match("unstranded_summary") }, + { assert snapshot(process.out.versions).match("unstranded_versions") } + ) + } + } + + test("sarscov2 [bam] - unstranded - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'unstranded' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap b/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap new file mode 100644 index 000000000..5a743f0ce --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap @@ -0,0 +1,323 @@ +{ + "forward_counts": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt:md5,4cf89f0e702ba9abef3fa571e68fe8f0" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:19:47.695388295" + }, + "unstranded_counts": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt:md5,c4ef2c2a80547fbb3074331bc0a1bda3" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": 
"2024-10-18T10:21:03.208334705" + }, + "reverse_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt.summary:md5,7cfa30ad678b9bc1bc63afbb0281547b" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2023-11-23T15:50:25.168206514" + }, + "sarscov2 [bam] - forward - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ], + "counts": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "summary": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:20:10.040191252" + }, + "sarscov2 [bam] - reverse - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ], + "counts": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "summary": [ + [ + { + 
"id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:20:42.708101743" + }, + "reverse_counts": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt:md5,a7d8843ebc12d855c2e68d3e2e137582" + ] + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:20:24.490755916" + }, + "sarscov2 [bam] - unstranded - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ], + "counts": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "summary": [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:21:19.244837771" + }, + "forward_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt.summary:md5,8f602ff9a8ef467af43294e80b367cdf" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2023-11-23T15:50:10.699024934" + }, + 
"forward_versions": { + "content": [ + [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:19:48.001158764" + }, + "unstranded_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt.summary:md5,23164b79f9f23f11c82820db61a35560" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2023-11-23T15:50:38.68776235" + }, + "reverse_versions": { + "content": [ + [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:20:24.551053149" + }, + "unstranded_versions": { + "content": [ + [ + "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:21:03.25895568" + } +} \ No newline at end of file diff --git a/modules/nf-core/subread/featurecounts/tests/nextflow.config b/modules/nf-core/subread/featurecounts/tests/nextflow.config new file mode 100644 index 000000000..d9fd4fd57 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/nextflow.config @@ -0,0 +1,9 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: SUBREAD_FEATURECOUNTS { + ext.args = '-t CDS' + } + +} diff --git a/modules/nf-core/subread/featurecounts/tests/tags.yml b/modules/nf-core/subread/featurecounts/tests/tags.yml new file mode 100644 index 000000000..6d2534bf4 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/tags.yml @@ -0,0 +1,2 @@ +subread/featurecounts: + - modules/nf-core/subread/featurecounts/** diff --git a/nextflow.config b/nextflow.config index ff6b0bbcb..1f3b85a27 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,7 @@ params { tool_filter = 1 exon_boundary = 0 
save_intermediates = false + benchmarking = false // References genome = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ee8ea89c8..268428f74 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -139,6 +139,13 @@ "help_text": "During annotation, if one of the start or end position of a circular candidate imperfectly overlaps an exon boundary, the script will consider positions within 'exon_boundary' (default 0bp) to be an exonic circRNA. If they fall outside of this range, the candidate is assumed to be an exonic-intronic circRNA, and the entire underlying sequence is taken for miRNA analysis, as opposed to just the exonic sequences for canonical exonic circRNAs. ", "default": 0 }, + "benchmarking": { + "type": "boolean", + "description": "Run the pipeline in benchmarking mode.", + "default": false, + "fa_icon": "fas fa-chart-line", + "help_text": "This enables parallel analysis of polyA-enriched data with the same circRNA quantification tools to compare results." 
+ }, "mirna_tools": { "type": "string", "fa_icon": "fas fa-wrench", diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index dea819a65..e14290c55 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -1,6 +1,5 @@ include { BEDTOOLS_INTERSECT as INTERSECT_GTF } from '../../modules/nf-core/bedtools/intersect' include { GAWK as INGEST_DATABASE_NAMES } from '../../modules/nf-core/gawk' -include { GNU_SORT as COMBINE_DATABASES } from '../../modules/nf-core/gnu/sort' include { BEDTOOLS_INTERSECT as INTERSECT_DATABASE } from '../../modules/nf-core/bedtools/intersect' include { ANNOTATION as ANNOTATE } from '../../modules/local/annotation' diff --git a/subworkflows/local/benchmarking.nf b/subworkflows/local/benchmarking.nf new file mode 100644 index 000000000..7df2181fa --- /dev/null +++ b/subworkflows/local/benchmarking.nf @@ -0,0 +1,202 @@ +include { GNU_SORT as SORT } from '../../modules/nf-core/gnu/sort' +include { BEDTOOLS_MERGE } from '../../modules/nf-core/bedtools/merge' +include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect' +include { BEDTOOLS_JACCARD } from '../../modules/nf-core/bedtools/jaccard' +include { BEDTOOLS_GENOMECOV } from '../../modules/nf-core/bedtools/genomecov' +include { BENCHMARKING_MULTIQC as JACCARD_MULTIQC } from '../../modules/local/benchmarking/multiqc' +include { BENCHMARKING_MULTIQC as CORRELATION_MULTIQC } from '../../modules/local/benchmarking/multiqc' +include { PNG_JSON as LOCATION_JSON } from '../../modules/local/benchmarking/png_json' +include { PNG_JSON as OVERLAP_JSON } from '../../modules/local/benchmarking/png_json' +include { LOCATION_PLOT } from '../../modules/local/benchmarking/location_plots' +include { OVERLAP_PLOT } from '../../modules/local/benchmarking/overlap_plot' +include { SEQ_DEPTH_CORRELLATION } from '../../modules/local/benchmarking/seq_depth_plot' +include { AVERAGE_TSV } from '../../modules/local/benchmarking/average_tsv' 
+include { POLYATAILOR as POLYATAILOR_REAL } from '../../modules/local/benchmarking/polyatailor' +include { POLYATAILOR as POLYATAILOR_BENCHMARK} from '../../modules/local/benchmarking/polyatailor' +include { PLOT_POLYATAILS } from '../../modules/local/benchmarking/plot_polyatails' +include { SUBREAD_FEATURECOUNTS as FEATURECOUNTS_TOTAL } from '../../modules/nf-core/subread/featurecounts' +include { SUBREAD_FEATURECOUNTS as FEATURECOUNTS_BENCHMARKING} from '../../modules/nf-core/subread/featurecounts' +include { STAR2PASS as STAR2PASS_REAL } from './detection_tools/star2pass' +include { STAR2PASS as STAR2PASS_BENCHMARKING } from './detection_tools/star2pass' +include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' +include { DECOMPRESS_READS as DECOMPRESS_REAL } from '../../modules/local/benchmarking/decompress_reads' +include { DECOMPRESS_READS as DECOMPRESS_BENCHMARKING} from '../../modules/local/benchmarking/decompress_reads' +include { RRNA_CORRELATION } from '../../modules/local/benchmarking/rRNA_correlation' + + + +workflow BENCHMARKING { + + take: + ch_reads_real + ch_reads_benchmarking + ch_real_bed + ch_benchmarking_bed + ch_real_bam + ch_benchmarking_bam + ch_trim_report + ch_fasta + bsj_reads + + main: + + //quality score estimation + /*ch_reads_benchmarking.view {"rb: $it"} + ch_reads_real.view {"rr: $it"} + ch_polya_real = POLYATAILOR_REAL(ch_reads_real, ch_real_bam.map { it[1] }) + ch_polya_benchmarking = POLYATAILOR_BENCHMARK(ch_reads_benchmarking, ch_benchmarking_bam.map { it[1] }) + + ch_polya_real.tails.collect().view {"pr: $it"} + ch_polya_benchmarking.tails.collect().view {"pb: $it"} + + ch_polya_plots = PLOT_POLYATAILS(ch_polya_real.tails.collect(), ch_polya_benchmarking.tails.collect()) + ch_polya_plots.average_tails.view {"$it"}*/ + + + // Use only the paths in each tuple of ch_reads_real and ch_reads_benchmarking, assuming the metadata is already present + // Define parameters + ch_benchmark_gtf = 
"/nfs/data3/CIRCEST/runs/test_benchmarking/gencode.v47.primary_assembly.annotation.gtf" + ch_benchmarking_fasta = "/nfs/data3/CIRCEST/runs/test_benchmarking/gencode.v47.rRNARNA_transcripts.fa" + ch_filter_gtf = "/nfs/data3/CIRCEST/runs/test_benchmarking/ensembl_rRNA.gtf" // FIXME(review): hard-coded absolute paths — these references should be exposed as pipeline parameters, not site-specific /nfs paths + + STAR_GENOMEGENERATE(ch_fasta, tuple([id: "benchmarking_gtf"], file(ch_benchmark_gtf))) + star_index = params.star ? Channel.value([[id: "star"], file(params.star, checkIfExists: true)]) : STAR_GENOMEGENERATE.out.index.collect() + star_ignore_sjdbgtf = true + seq_center = params.seq_center ?: '' + seq_platform = '' + + + ch_reads_real_restructured = ch_reads_real.map { meta, fastq_gz_list -> + tuple(meta, fastq_gz_list[0], fastq_gz_list[1]) + } + ch_reads_benchmarking_restructured = ch_reads_benchmarking.map { meta, fastq_gz_list -> + tuple(meta, fastq_gz_list[0], fastq_gz_list[1]) + } + ch_uncompressed_reads_real = DECOMPRESS_REAL(ch_reads_real_restructured) + ch_uncompressed_reads_benchmarking = DECOMPRESS_BENCHMARKING(ch_reads_benchmarking_restructured) + + ch_rRNA_real_bam = STAR2PASS_REAL(ch_uncompressed_reads_real, star_index, tuple([id: "Benchmarking_gtf"], file(ch_benchmark_gtf)), bsj_reads, star_ignore_sjdbgtf, seq_center, seq_platform).bam + + ch_rRNA_benchmarking_bam = STAR2PASS_BENCHMARKING(ch_uncompressed_reads_benchmarking, star_index, tuple([id: "Benchmarking_gtf"], file(ch_benchmark_gtf)), bsj_reads, star_ignore_sjdbgtf, seq_center, seq_platform).bam + + + + ch_rRNA_real_input = ch_real_bam.map { meta, path -> + tuple(meta, path, file(ch_filter_gtf)) + } + ch_rRNA_benchmarking_input = ch_benchmarking_bam.map { meta, path -> + tuple(meta, path, file(ch_filter_gtf)) + } + + ch_rRNA_real = FEATURECOUNTS_TOTAL(ch_rRNA_real_input).summary + ch_rRNA_benchmarking = FEATURECOUNTS_BENCHMARKING(ch_rRNA_benchmarking_input).summary + + + ch_collected_real_bed = ch_real_bed.collect() + ch_collected_benchmarking_bed = ch_benchmarking_bed.collect() + ch_collected_rRNA_real =
ch_rRNA_real.collect() + ch_collected_rRNA_benchmarking = ch_rRNA_benchmarking.collect() + + RRNA_CORRELATION( + ch_collected_real_bed, + ch_collected_benchmarking_bed, + ch_collected_rRNA_real, + ch_collected_rRNA_benchmarking + ) + + + //ch_rRNA_benchmarking.view { "bench: $it" } + + + //data preparation + ch_versions = Channel.empty() + + ch_all = ch_real_bed.mix(ch_benchmarking_bed) + .map{ meta, bed -> [[id: meta.tool + "_" + (meta.benchmarking ? "benchmarking" : "real"), + tool: meta.tool, + benchmarking: meta.benchmarking], bed]} + .groupTuple() + + SORT(ch_all) + ch_versions = ch_versions.mix(SORT.out.versions) + + BEDTOOLS_MERGE(SORT.out.sorted).bed.branch{ meta, bed -> + real: !meta.benchmarking + benchmarking: meta.benchmarking + }.set { ch_merged } + ch_versions = ch_versions.mix(BEDTOOLS_MERGE.out.versions) + + ch_joined = ch_merged.real.map{ meta, bed -> [[id: meta.tool], bed]} + .join(ch_merged.benchmarking.map{ meta, bed -> [[id: meta.tool], bed]}) + + //Overlap plot + ch_intersect = BEDTOOLS_INTERSECT(ch_joined,[[], []]) + OVERLAP_PLOT(ch_intersect.intersect) + OVERLAP_JSON(OVERLAP_PLOT.out.plots, "Overlap plots", "Plot the overlap circRNAs found in total and polyA data for the tools") + ch_versions = ch_versions.mix(BEDTOOLS_INTERSECT.out.versions) + ch_versions = ch_versions.mix(OVERLAP_PLOT.out.versions) + ch_versions = ch_versions.mix(OVERLAP_JSON.out.versions) + + //Location plot workflow + LOCATION_PLOT(ch_joined) + LOCATION_JSON(LOCATION_PLOT.out.plots, "Location plots", "Plots the location of the circRNAs found" ) + ch_versions = ch_versions.mix(LOCATION_PLOT.out.versions) + ch_versions = ch_versions.mix(LOCATION_JSON.out.versions) + + //Pearson correlation workflow + ch_meta = ch_real_bam.map { it[0] } + ch_path = ch_real_bam.map { it[1] } + ch_scale = Channel.value(1) + ch_genomecov_inputs = ch_meta.combine(ch_path).combine(ch_scale) + .map { meta, path, scale -> + tuple(meta, path, scale) + } + + ch_genomecov =
BEDTOOLS_GENOMECOV(ch_genomecov_inputs, [], "bg",false) + ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV.out.versions) + + ch_seqdepths = ch_genomecov.genomecov + .map { genomecov_result -> genomecov_result[1].toString() } + .collectFile(name: 'genomecov_paths.txt', + newLine: true) + + ch_corr = SEQ_DEPTH_CORRELLATION(ch_real_bed, ch_seqdepths.collect()).report + ch_versions = ch_versions.mix(SEQ_DEPTH_CORRELLATION.out.versions) + + ch_pearson = ch_corr.splitCsv(header: true, sep: "\t") + .map{ values -> [values.tool, values.pearson_corr]} + .collectFile( newLine: true, + storeDir: params.outdir, + seed: "tool\tpearson_corr") { + row -> ["pearson.tsv", row.join("\t")] + } + + AVERAGE_TSV(ch_pearson) + CORRELATION_MULTIQC(AVERAGE_TSV.out.tsv) + ch_versions = ch_versions.mix(AVERAGE_TSV.out.versions) + ch_versions = ch_versions.mix(CORRELATION_MULTIQC.out.versions) + + //Jaccard Workflow + ch_jaccard = BEDTOOLS_JACCARD(ch_joined, [[], []]).tsv + ch_versions = ch_versions.mix(BEDTOOLS_JACCARD.out.versions) + + ch_stats = ch_jaccard.splitCsv(header: true, sep: "\t") + .map{ meta, values -> [meta.id, values.intersection, values.union, values.jaccard, values.n_intersections]} + .collectFile( newLine: true, + storeDir: params.outdir, + seed: "tool\tintersection\tunion\tjaccard\tn_intersections") { + row -> ["jaccard.tsv", row.join("\t")] + } + + JACCARD_MULTIQC(ch_stats) + ch_versions = ch_versions.mix(JACCARD_MULTIQC.out.versions) + + //combine results + ch_reports = JACCARD_MULTIQC.out.report.mix(LOCATION_JSON.out.report) + ch_reports = ch_reports.mix(OVERLAP_JSON.out.report) + ch_reports = ch_reports.mix(CORRELATION_MULTIQC.out.report) + + emit: + reports = ch_reports + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bsj_detection.nf b/subworkflows/local/bsj_detection.nf index 89ddec4ed..d85bf3304 100644 --- a/subworkflows/local/bsj_detection.nf +++ b/subworkflows/local/bsj_detection.nf @@ -229,11 +229,13 @@ workflow 
BSJ_DETECTION { ) emit: - bed = ch_bsj_bed_combined - bed12 = ch_bsj_bed12_combined - gtf = ch_bsj_gtf_combined - fasta = ch_bsj_fasta_combined - - multiqc_files = ch_multiqc_files - versions = ch_versions + bed = ch_bsj_bed_combined + bed12 = ch_bsj_bed12_combined + gtf = ch_bsj_gtf_combined + fasta = ch_bsj_fasta_combined + star_bam = STAR2PASS.out.bam + bed_per_sample_tool = ch_bsj_bed_per_sample_tool + + multiqc_files = ch_multiqc_files + versions = ch_versions } diff --git a/subworkflows/local/detection_tools/star2pass.nf b/subworkflows/local/detection_tools/star2pass.nf index 4e216545b..4545c909b 100644 --- a/subworkflows/local/detection_tools/star2pass.nf +++ b/subworkflows/local/detection_tools/star2pass.nf @@ -29,6 +29,7 @@ workflow STAR2PASS { junction = PASS_2.out.junction sam = PASS_2.out.sam tab = PASS_2.out.tab + bam = PASS_2.out.bam versions = ch_versions } diff --git a/subworkflows/local/mirna_prediction.nf b/subworkflows/local/mirna_prediction.nf index b3fc46cc0..1b6b046bb 100644 --- a/subworkflows/local/mirna_prediction.nf +++ b/subworkflows/local/mirna_prediction.nf @@ -1,8 +1,7 @@ // MODULES -include { BIOAWK as ADD_BACKSPLICE } from '../../modules/nf-core/bioawk' -include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization' -include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering' -include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations' +include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization' +include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering' +include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations' // SUBWORKFLOWS include { MIRNA_BINDINGSITES } from './mirna/mirna_bindingsites' diff --git a/variables_output.txt b/variables_output.txt new file mode 100644 index 000000000..e69de29bb diff --git a/workflows/circrna/main.nf b/workflows/circrna/main.nf index 048b4da34..3713e38e9 100644 --- a/workflows/circrna/main.nf +++ 
b/workflows/circrna/main.nf @@ -22,17 +22,19 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil */ // SUBWORKFLOWS: -include { paramsSummaryMap } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' -include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline' - -include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' -include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome' -include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection' -include { ANNOTATION } from '../../subworkflows/local/annotation' -include { QUANTIFICATION } from '../../subworkflows/local/quantification' -include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction' -include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests' +include { paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline' + +include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome' +include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection' +include { BSJ_DETECTION as BSJ_DETECTION_BENCHMARKING } from '../../subworkflows/local/bsj_detection' +include { BENCHMARKING } from '../../subworkflows/local/benchmarking' +include { ANNOTATION } from '../../subworkflows/local/annotation' +include { QUANTIFICATION } from '../../subworkflows/local/quantification' +include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction' +include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
@@ -136,8 +138,13 @@ workflow CIRCRNA { // 2. BSJ Discovery // + FASTQC_TRIMGALORE.out.reads.branch{ + real: !it[0].benchmarking + benchmarking: it[0].benchmarking + }.set{ ch_reads } + BSJ_DETECTION( - FASTQC_TRIMGALORE.out.reads, + ch_reads.real, ch_fasta, ch_gtf, ch_annotation, @@ -155,13 +162,49 @@ workflow CIRCRNA { ch_versions = ch_versions.mix(BSJ_DETECTION.out.versions) // - // 3. circRNA quantification + // 3. Benchmarking + // + + if (params.benchmarking) { + BSJ_DETECTION_BENCHMARKING( + ch_reads.benchmarking, + ch_fasta, + ch_gtf, + ch_annotation, + bowtie_index, + bowtie2_index, + bwa_index, + chromosomes, + hisat2_index, + star_index, + params.bsj_reads, + params.exon_boundary + ) + + BENCHMARKING( + ch_reads.real, + ch_reads.benchmarking, + BSJ_DETECTION.out.bed_per_sample_tool, + BSJ_DETECTION_BENCHMARKING.out.bed_per_sample_tool, + BSJ_DETECTION.out.star_bam, + BSJ_DETECTION_BENCHMARKING.out.star_bam, + FASTQC_TRIMGALORE.out.trim_log, + ch_fasta, + params.bsj_reads + ) + + ch_multiqc_files = ch_multiqc_files.mix(BENCHMARKING.out.reports) + ch_versions = ch_versions.mix(BSJ_DETECTION_BENCHMARKING.out.versions) + } + + // + // 4. Quantification // QUANTIFICATION( ch_gtf, ch_fasta, - FASTQC_TRIMGALORE.out.reads, + ch_reads.real, BSJ_DETECTION.out.bed12, BSJ_DETECTION.out.gtf, params.bootstrap_samples, @@ -172,7 +215,7 @@ workflow CIRCRNA { ch_versions = ch_versions.mix(QUANTIFICATION.out.versions) // - // 4. miRNA prediction + // 5. miRNA prediction // if (params.mature) {