diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 570f6c0dc..e5337fdc7 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -5,10 +5,12 @@ report_comment: >
report_section_order:
"nf-core-circrna-methods-description":
order: -1000
- software_versions:
+ benchmarking:
order: -1001
- "nf-core-circrna-summary":
+ software_versions:
order: -1002
+ "nf-core-circrna-summary":
+ order: -1003
export_plots: true
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 7e9257a06..d79fa8076 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -27,6 +27,12 @@
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
},
+ "benchmarking": {
+ "type": "boolean",
+ "default": false,
+ "errorMessage": "Benchmarking must be a boolean value",
+ "meta": ["benchmarking"]
+ },
"strandedness": {
"type": "string",
"enum": ["unstranded", "forward", "reverse", "auto"],
diff --git a/conf/modules.config b/conf/modules.config
index b23ea157f..765827fcd 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -813,6 +813,23 @@ process {
]
}
+ withName: '.*:CIRCRNA_DISCOVERY_BENCHMARKING:ANNOTATION:REMOVE_SCORE_STRAND' {
+ ext.suffix = "_benchmarking.tidy.bed"
+ }
+
+ withName: '.*:BENCHMARKING:SORT' {
+ ext.args = "-k 1,1 -k2,2n -k3,3n -u"
+ ext.suffix = "combined.bed"
+ }
+
+ withName: '.*:BENCHMARKING:BEDTOOLS_MERGE' {
+ ext.args = "-s -c 6 -o distinct"
+ }
+
+ withName: '.*:BENCHMARKING:BEDTOOLS_GENOMECOV' {
+ ext.args = "-dz"
+ }
+
withName: ADD_BACKSPLICE {
ext.args = "-c fastx '{ if (\$name ~ /^circ_/) { \$seq = \$seq substr(\$seq, 1, 25) } print \">\" \$name; print \$seq }'"
ext.suffix = "backspliced.fa"
diff --git a/modules.json b/modules.json
index c1bed4145..79e98dcc5 100644
--- a/modules.json
+++ b/modules.json
@@ -5,6 +5,11 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
+ "bedtools/genomecov": {
+ "branch": "master",
+ "git_sha": "81b90194ce9911dbd55bba2c65c6919f6677abc4",
+ "installed_by": ["modules"]
+ },
"bedtools/getfasta": {
"branch": "master",
"git_sha": "cdcdd5e3d806f0ff3983c40c69e0b07bb44ec299",
@@ -20,6 +25,16 @@
"git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83",
"installed_by": ["modules"]
},
+ "bedtools/jaccard": {
+ "branch": "master",
+ "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668",
+ "installed_by": ["modules"]
+ },
+ "bedtools/merge": {
+ "branch": "master",
+ "git_sha": "a5377837fe9013bde89de8689829e83e84086536",
+ "installed_by": ["modules"]
+ },
"bedtools/sort": {
"branch": "master",
"git_sha": "571a5feac4c9ce0a8df0bc15b94230e7f3e8db47",
@@ -201,6 +216,11 @@
"git_sha": "b1b959609bda44341120aed1766329909f54b8d0",
"installed_by": ["modules"]
},
+ "subread/featurecounts": {
+ "branch": "master",
+ "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358",
+ "installed_by": ["modules"]
+ },
"trimgalore": {
"branch": "master",
"git_sha": "a98418419ae6c9df3cf6cf108d1e1aba71037d5a",
diff --git a/modules/local/benchmarking/average_tsv/main.nf b/modules/local/benchmarking/average_tsv/main.nf
new file mode 100644
index 000000000..79a0b5c13
--- /dev/null
+++ b/modules/local/benchmarking/average_tsv/main.nf
@@ -0,0 +1,19 @@
+process AVERAGE_TSV {
+ label "process_single"
+
+ conda "bioconda::pandas=1.5.2"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+ 'biocontainers/pandas:1.5.2' }"
+
+
+ input:
+ path(tsv)
+
+ output:
+ path(tsv), emit: tsv
+ path("versions.yml"), emit: versions
+
+ script:
+ template "average.py"
+}
diff --git a/modules/local/benchmarking/average_tsv/templates/average.py b/modules/local/benchmarking/average_tsv/templates/average.py
new file mode 100644
index 000000000..54f27a37c
--- /dev/null
+++ b/modules/local/benchmarking/average_tsv/templates/average.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import platform
+
+tsv = "$tsv"
+data = pd.read_csv(tsv, sep='\\t')
+
+data['tool'] = data['tool'].str.replace('tool:', '')
+average_values = data.groupby('tool')['pearson_corr'].mean().reset_index()
+
+output_file_path = tsv  # NOTE(review): writes back to the staged input file name — overwriting a Nextflow input symlink is an anti-pattern; confirm, or write to a new name and adjust the module output glob
+average_values.to_csv(output_file_path, sep='\\t', index=False)
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+ """Formats a dictionary to a YAML-like string.
+
+ Args:
+ data (dict): The dictionary to format.
+ indent (int): The current indentation level.
+
+ Returns:
+ str: A string formatted as YAML.
+ """
+ yaml_str = ""
+ for key, value in data.items():
+ spaces = " " * indent
+ if isinstance(value, dict):
+ yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+ else:
+ yaml_str += f"{spaces}{key}: {value}\\n"
+ return yaml_str
+
+versions = {
+ "${task.process}" : {
+ "python": platform.python_version(),
+ "pandas": pd.__version__
+ }
+}
+with open("versions.yml", "w") as f:
+ f.write(format_yaml_like(versions))
+
diff --git a/modules/local/benchmarking/decompress_reads/main.nf b/modules/local/benchmarking/decompress_reads/main.nf
new file mode 100644
index 000000000..6f88e3bf1
--- /dev/null
+++ b/modules/local/benchmarking/decompress_reads/main.nf
@@ -0,0 +1,20 @@
+
+
+process DECOMPRESS_READS {
+ input:
+ tuple val(meta), path(fastq_gz_1), path(fastq_gz_2)
+
+ output:
+ tuple val(meta), path("*.fq")
+
+ script:
+ """
+ # Define output names by removing .gz and adding _decompressed.fq
+ output_name1=\$(basename ${fastq_gz_1} .gz)_decompressed.fq
+ output_name2=\$(basename ${fastq_gz_2} .gz)_decompressed.fq
+
+ # Decompress the files
+ gunzip -c ${fastq_gz_1} > \$output_name1
+ gunzip -c ${fastq_gz_2} > \$output_name2
+ """
+}
diff --git a/modules/local/benchmarking/location_plots/main.nf b/modules/local/benchmarking/location_plots/main.nf
new file mode 100644
index 000000000..4b4747bf5
--- /dev/null
+++ b/modules/local/benchmarking/location_plots/main.nf
@@ -0,0 +1,17 @@
+process LOCATION_PLOT {
+ tag "$meta.id"
+ label "process_single"
+
+    conda "bioconda::seaborn=0.11.2" // NOTE(review): conda pin (0.11.2) disagrees with the container (seaborn 0.13.2) — align the two
+ container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75'
+
+ input:
+ tuple val(meta), path(bedfile1), path(bedfile2)
+
+ output:
+ path("*_mqc.png"), emit: plots
+ path("versions.yml"), emit: versions
+
+ script:
+ template "create_plots.py"
+}
diff --git a/modules/local/benchmarking/location_plots/templates/create_plots.py b/modules/local/benchmarking/location_plots/templates/create_plots.py
new file mode 100755
index 000000000..1479bc844
--- /dev/null
+++ b/modules/local/benchmarking/location_plots/templates/create_plots.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+import csv
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from matplotlib.lines import Line2D
+import platform
+
+input_bed_file_1 = '$bedfile1'
+input_bed_file_2 = '$bedfile2'
+
+# Read input files
+def read_bed_file(file_path, label):
+ data = {'chromosome': [], 'start': [], 'strand': [], 'file_label': []}
+ with open(file_path, 'r') as file:
+ reader = csv.reader(file, delimiter='\\t')
+ for row in reader:
+ data['chromosome'].append(row[0])
+ data['start'].append(int(row[1]))
+ data['strand'].append(row[3])
+ data['file_label'].append(label.lower())
+ return data
+
+def combine_data(data1, data2):
+ for key in data1:
+ data1[key].extend(data2[key])
+ return data1
+
+data1 = read_bed_file(input_bed_file_1, 'total')
+data2 = read_bed_file(input_bed_file_2, 'polya')
+
+# Combine the two datasets
+combined_data = combine_data(data1, data2)
+
+# Create a DataFrame
+df = pd.DataFrame({
+ 'Chromosome': combined_data['chromosome'],
+ 'Start Location': combined_data['start'],
+ 'Strand': combined_data['strand'],
+ 'File Label': combined_data['file_label']
+})
+
+# Sort DataFrame to ensure consistent plotting order
+df.sort_values(by=['File Label', 'Strand'], inplace=True)
+
+# Plotting
+fig, ax = plt.subplots(figsize=(12, 6))
+
+palette = {
+ "total +": "red",
+ "total -": "lightcoral",
+ "polya +": "blue",
+ "polya -": "lightblue"
+}
+
+# Draw violins
+for file_label in df['File Label'].unique():
+ sns.violinplot(
+ x="Chromosome",
+ y="Start Location",
+ hue="Strand",
+ data=df[df['File Label'] == file_label],
+ palette={"+" : palette[f"{file_label} +"], "-" : palette[f"{file_label} -"]},
+ split=True,
+ ax=ax,
+        scale="count",  # NOTE(review): 'scale'/'scale_hue' are deprecated in seaborn 0.13 (the pinned container) — prefer density_norm="count" / common_norm=False
+ scale_hue=False,
+ saturation=0.75,
+ inner=None
+ )
+
+# Set transparency for all violins
+for violin in ax.collections:
+ violin.set_alpha(0.25)
+
+# Legend
+custom_lines = [
+ Line2D([0], [0], color=palette[f"{file} {strand}"], lw=4, alpha=0.25)
+ for file in df['File Label'].unique()
+ for strand in ["+", "-"]
+]
+ax.legend(
+ custom_lines,
+ [f"{file} : {strand}" for file in df['File Label'].unique() for strand in ["+", "-"]],
+ title="File : Strand"
+)
+
+plt.title('Start Locations of circRNA by Chromosome and Strand')
+
+plot_file_name = f"{input_bed_file_1.replace('.bed','')}_{input_bed_file_2.replace('.bed','')}_mqc.png"
+
+# Save the plot
+plt.savefig(plot_file_name, bbox_inches='tight')
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+ """Formats a dictionary to a YAML-like string.
+
+ Args:
+ data (dict): The dictionary to format.
+ indent (int): The current indentation level.
+
+ Returns:
+ str: A string formatted as YAML.
+ """
+ yaml_str = ""
+ for key, value in data.items():
+ spaces = " " * indent
+ if isinstance(value, dict):
+ yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+ else:
+ yaml_str += f"{spaces}{key}: {value}\\n"
+ return yaml_str
+
+versions = {
+ "${task.process}" : {
+ "python": platform.python_version(),
+ "pandas": pd.__version__,
+ "seaborn": sns.__version__
+ }
+}
+with open("versions.yml", "w") as f:
+ f.write(format_yaml_like(versions))
diff --git a/modules/local/benchmarking/multiqc/main.nf b/modules/local/benchmarking/multiqc/main.nf
new file mode 100644
index 000000000..a39675d30
--- /dev/null
+++ b/modules/local/benchmarking/multiqc/main.nf
@@ -0,0 +1,18 @@
+process BENCHMARKING_MULTIQC {
+ label "process_single"
+
+ conda "bioconda::pandas=1.5.2"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+ 'biocontainers/pandas:1.5.2' }"
+
+ input:
+ path(jaccard)
+
+ output:
+ path("*_mqc.json") , emit: report
+ path("versions.yml"), emit: versions
+
+ script:
+ template "benchmarking.py"
+}
diff --git a/modules/local/benchmarking/multiqc/templates/benchmarking.py b/modules/local/benchmarking/multiqc/templates/benchmarking.py
new file mode 100644
index 000000000..1515e8198
--- /dev/null
+++ b/modules/local/benchmarking/multiqc/templates/benchmarking.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import json
+import platform
+
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+ """Formats a dictionary to a YAML-like string.
+
+ Args:
+ data (dict): The dictionary to format.
+ indent (int): The current indentation level.
+
+ Returns:
+ str: A string formatted as YAML.
+ """
+ yaml_str = ""
+ for key, value in data.items():
+ spaces = " " * indent
+ if isinstance(value, dict):
+ yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+ else:
+ yaml_str += f"{spaces}{key}: {value}\\n"
+ return yaml_str
+
+df = pd.read_csv("$jaccard", sep='\\t', index_col=0)
+
+for metric in df.columns:
+ data = {
+ "id": f"benchmarking_{metric}",
+ "parent_id": "benchmarking",
+ "parent_name": "Benchmarking",
+ "parent_description": "Benchmarking of the tools",
+ "section_name": metric.capitalize(),
+ "description": f"{metric.capitalize()} values of the tools",
+ "plot_type": "bargraph",
+ "data": df[[metric]].T.to_dict()
+ }
+
+ with open(f"benchmarking_{metric}_mqc.json", "w") as f:
+ json.dump(data, f, indent=4)
+
+versions = {
+ "${task.process}" : {
+ "python": platform.python_version(),
+ "pandas": pd.__version__
+ }
+}
+with open("versions.yml", "w") as f:
+ f.write(format_yaml_like(versions))
diff --git a/modules/local/benchmarking/overlap_plot/main.nf b/modules/local/benchmarking/overlap_plot/main.nf
new file mode 100644
index 000000000..161b62036
--- /dev/null
+++ b/modules/local/benchmarking/overlap_plot/main.nf
@@ -0,0 +1,17 @@
+process OVERLAP_PLOT {
+ tag "$meta.id"
+ label "process_single"
+
+ conda "bioconda::seaborn=0.11.2"
+ container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75'
+
+ input:
+ tuple val(meta), path(bed)
+
+ output:
+ path("*_mqc.png") , emit: plots
+ path("versions.yml"), emit: versions
+
+ script:
+ template "create_plots.py"
+}
diff --git a/modules/local/benchmarking/overlap_plot/templates/create_plots.py b/modules/local/benchmarking/overlap_plot/templates/create_plots.py
new file mode 100644
index 000000000..fcbae5eaf
--- /dev/null
+++ b/modules/local/benchmarking/overlap_plot/templates/create_plots.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+import csv
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import platform
+
+input_bed_file = '$bed'
+
+# Read input file
+def read_bed_file(file_path):
+ data = {'chromosome': [], 'start': [], 'strand': []}
+ with open(file_path, 'r') as file:
+ reader = csv.reader(file, delimiter='\t')
+ for row in reader:
+ data['chromosome'].append(row[0])
+ data['start'].append(int(row[1]))
+ data['strand'].append(row[3])
+ return data
+
+# Read the data
+data = read_bed_file(input_bed_file)
+
+# Create a DataFrame
+df = pd.DataFrame({
+ 'Chromosome': data['chromosome'],
+ 'Start Location': data['start'],
+ 'Strand': data['strand']
+})
+
+# Plotting
+plt.figure(figsize=(12, 6))
+sns.violinplot(x='Chromosome', y='Start Location', hue='Strand', data=df, split=True, inner="quartile")
+plt.title('Start Locations of circRNA by Chromosome and Strand')
+plt.legend(title="Strand")
+plot_file_name = f"{input_bed_file.replace('.bed','')}_overlaps_mqc.png"
+
+plt.savefig(plot_file_name, bbox_inches='tight')
+
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+ """Formats a dictionary to a YAML-like string.
+
+ Args:
+ data (dict): The dictionary to format.
+ indent (int): The current indentation level.
+
+ Returns:
+ str: A string formatted as YAML.
+ """
+ yaml_str = ""
+ for key, value in data.items():
+ spaces = " " * indent
+ if isinstance(value, dict):
+ yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+ else:
+ yaml_str += f"{spaces}{key}: {value}\\n"
+ return yaml_str
+
+versions = {
+ "${task.process}" : {
+ "python": platform.python_version(),
+ "pandas": pd.__version__,
+ "seaborn": sns.__version__
+ }
+}
+with open("versions.yml", "w") as f:
+ f.write(format_yaml_like(versions))
diff --git a/modules/local/benchmarking/plot_polyatails/main.nf b/modules/local/benchmarking/plot_polyatails/main.nf
new file mode 100644
index 000000000..26f173f62
--- /dev/null
+++ b/modules/local/benchmarking/plot_polyatails/main.nf
@@ -0,0 +1,14 @@
+process PLOT_POLYATAILS {
+ label "process_single"
+
+ conda "bioconda::seaborn=0.11.2"
+ container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75'
+
+ input:
+ path(real)
+ path(benchmarking)
+ output:
+ path("*.tsv") , emit: average_tails
+ script:
+ template "create_plots.py"
+}
\ No newline at end of file
diff --git a/modules/local/benchmarking/plot_polyatails/templates/create_plots.py b/modules/local/benchmarking/plot_polyatails/templates/create_plots.py
new file mode 100644
index 000000000..dcf2afc63
--- /dev/null
+++ b/modules/local/benchmarking/plot_polyatails/templates/create_plots.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+import csv
+import matplotlib.pyplot as plt  # NOTE(review): matplotlib and seaborn are imported but never used in this template — remove or add the intended plotting
+import seaborn as sns
+import pandas as pd
+import platform
+
+# Function to calculate the average nA value for a given file
+def calculate_average_na(file_path):
+ # Read the TSV file into a pandas DataFrame
+ df = pd.read_csv(file_path, sep='\t') # Use tab as the separator
+
+ # Calculate the mean of the nA column
+ avg_na = df['nA'].mean()
+
+ # Extract the sample name from the 'sample' column (since all rows have the same sample)
+ sample_name = df['sample'].iloc[0]
+
+ return avg_na, sample_name
+
+def main(file_list, output_file):
+ # Initialize lists to store the averages and corresponding samples
+ averages = []
+ samples = []
+
+ # Loop through each file in the list
+ for file_path in file_list:
+ avg_na, sample_name = calculate_average_na(file_path)
+ averages.append(avg_na)
+ samples.append(sample_name)
+
+ # Write the results to a TSV file
+ with open(output_file, 'w', newline='') as tsvfile:
+ writer = csv.writer(tsvfile, delimiter='\t') # Tab-separated output
+ writer.writerow(['sample', 'avg']) # Write header
+ for sample, avg in zip(samples, averages):
+ writer.writerow([sample, avg])
+
+if __name__ == "__main__":
+ file_list_r = '$real'.split(' ')
+ output_file_r = 'real.tsv'
+ main(file_list_r, output_file_r)
+ file_list_b = '$benchmarking'.split(' ')
+ output_file_b = 'benchmarking.tsv'
+ main(file_list_b, output_file_b)
+
+
\ No newline at end of file
diff --git a/modules/local/benchmarking/png_json/main.nf b/modules/local/benchmarking/png_json/main.nf
new file mode 100644
index 000000000..e6205bfc1
--- /dev/null
+++ b/modules/local/benchmarking/png_json/main.nf
@@ -0,0 +1,20 @@
+process PNG_JSON {
+ label "process_single"
+
+ conda "bioconda::pandas=1.5.2"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+ 'biocontainers/pandas:1.5.2' }"
+
+ input:
+ path(png)
+ val(title)
+ val(description)
+
+ output:
+ path("*_mqc.json") , emit: report
+ path("versions.yml"), emit: versions
+
+ script:
+ template "json.py"
+}
diff --git a/modules/local/benchmarking/png_json/templates/json.py b/modules/local/benchmarking/png_json/templates/json.py
new file mode 100644
index 000000000..639cfab2f
--- /dev/null
+++ b/modules/local/benchmarking/png_json/templates/json.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+import json
+import os
+import base64
+import platform
+
+png = "$png"
+title = "$title"
+description = "$description"
+
+absolute_path = os.path.abspath(png)
+filename = os.path.basename(absolute_path)
+name = filename[:-len("_mqc.png")] if filename.endswith("_mqc.png") else filename
+id = title.lower().replace(" ", "_")
+
+# Convert the PNG file to a base64 string
+with open(absolute_path, "rb") as image_file:
+ base64_string = base64.b64encode(image_file.read()).decode('utf-8')
+
+# Create the data structure for JSON
+data = {
+ "id": f"benchmarking_{name}",
+ "parent_id": id,
+ "parent_name": title,
+ "parent_description": description,
+ "section_name": name,
+ "plot_type": "image",
+    "data": (
+        f'![](data:image/png;base64,{base64_string})'
+    )
+}
+
+# Write the JSON data to a file
+with open(f"{name}_mqc.json", "w") as f:
+ json.dump(data, f, indent=4)
+
+print(f"JSON file created for {absolute_path}")
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+ """Formats a dictionary to a YAML-like string.
+
+ Args:
+ data (dict): The dictionary to format.
+ indent (int): The current indentation level.
+
+ Returns:
+ str: A string formatted as YAML.
+ """
+ yaml_str = ""
+ for key, value in data.items():
+ spaces = " " * indent
+ if isinstance(value, dict):
+ yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+ else:
+ yaml_str += f"{spaces}{key}: {value}\\n"
+ return yaml_str
+
+versions = {
+ "${task.process}" : {
+ "python": platform.python_version()
+ }
+}
+with open("versions.yml", "w") as f:
+ f.write(format_yaml_like(versions))
diff --git a/modules/local/benchmarking/polyatailor/main.nf b/modules/local/benchmarking/polyatailor/main.nf
new file mode 100644
index 000000000..3427728d8
--- /dev/null
+++ b/modules/local/benchmarking/polyatailor/main.nf
@@ -0,0 +1,20 @@
+process POLYATAILOR {
+ tag "$meta.id"
+ label "process_high"
+
+    container 'docker://faboehm/polyatailor-env' // NOTE(review): the 'docker://' URI scheme only resolves under singularity/apptainer — confirm this also runs with the docker engine
+
+ input:
+ tuple val(meta),
+ path(fastq)
+ path(bam)
+
+ output:
+ path("*.tsv"), emit: tails
+ path("versions.yml"), emit: versions
+
+ script:
+    template 'polyatailor.R'
+
+}
+
diff --git a/modules/local/benchmarking/polyatailor/templates/polyatailor.R b/modules/local/benchmarking/polyatailor/templates/polyatailor.R
new file mode 100644
index 000000000..fc8e3d6fb
--- /dev/null
+++ b/modules/local/benchmarking/polyatailor/templates/polyatailor.R
@@ -0,0 +1,73 @@
+#!/usr/bin/env Rscript
+library(PolyAtailor)
+library(Biostrings)
+library(yaml) # To write the YAML file
+library(Rsamtools) # Ensure Rsamtools is loaded for scanBam
+library(GenomicAlignments)
+library(dplyr)
+library(conflicted); conflict_prefer("filter", "dplyr")
+conflict_prefer("rename", "dplyr")
+conflict_prefer("strsplit", "base")
+
+fastq_string <- '$fastq'
+fastq_files <- unlist(strsplit(fastq_string, " "))
+sample_names <- gsub(".fq.gz\$", "", fastq_files)
+
+result_1 <- tailScan(
+ fastq = fastq_files[1],
+ mcans = 3,
+ findUmi = FALSE,
+ lumi = 0,
+ adapterSeq = "",
+ anchorSeq = "",
+ resultpath = "./",
+ samplename = sample_names[1],
+ tailAnchorLen = 8,
+ minTailLen = 5,
+ realTailLen = 15,
+ maxNtail = 2,
+ mapping = FALSE,
+ mapinfo = NULL,
+ findTailType = 'A'
+)
+
+write.table(result_1, file = paste0(sample_names[1], ".tsv"), sep = "\t", row.names = FALSE, col.names = TRUE)
+
+result_2 <- tailScan(
+ fastq = fastq_files[2],
+ mcans = 3,
+ findUmi = FALSE,
+ lumi = 0,
+ adapterSeq = "",
+ anchorSeq = "",
+ resultpath = "./",
+ samplename = sample_names[2],
+ tailAnchorLen = 8,
+ minTailLen = 5,
+ realTailLen = 15,
+ maxNtail = 2,
+ mapping = FALSE,
+ mapinfo = NULL,
+ findTailType = 'A'
+)
+write.table(result_2, file = paste0(sample_names[2], ".tsv"), sep = "\t", row.names = FALSE, col.names = TRUE)
+
+result_tailMap <- tailMap(
+ bamfile = '$bam',
+ mcans = 3,
+ minTailLen = 5,
+ findUmi = FALSE,
+ maxNtail = 2,
+ mapping = TRUE,
+ longRead = FALSE
+)
+write.csv(result_tailMap, "map.csv", row.names = FALSE) # NOTE(review): map.csv is not matched by the module's "*.tsv" output glob — verify whether it should be captured
+
+
+
+version_info <- list(
+ PolyAtailor_version = as.character(packageVersion("PolyAtailor")),
+ Biostrings_version = as.character(packageVersion("Biostrings")),
+ run_time = Sys.time()
+)
+write_yaml(version_info, "versions.yml")
diff --git a/modules/local/benchmarking/rRNA_correlation/main.nf b/modules/local/benchmarking/rRNA_correlation/main.nf
new file mode 100644
index 000000000..221d779cb
--- /dev/null
+++ b/modules/local/benchmarking/rRNA_correlation/main.nf
@@ -0,0 +1,17 @@
+process RRNA_CORRELATION {
+ label "process_single"
+
+ conda "bioconda::pandas=1.5.2"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+ 'biocontainers/pandas:1.5.2' }"
+
+ input:
+ val bed_real_list // Collected list of tuples for real BEDs
+ val bed_bench_list // Collected list of tuples for benchmarking BEDs
+ val rRNA_real_list // Collected list of tuples for real rRNA summaries
+ val rRNA_bench_list // Collected list of tuples for benchmarking rRNA summaries
+    // NOTE(review): this process declares no output block — real_corr.txt / bench_corr.txt written by the template are never captured or published; confirm this is intended
+ script:
+ template "rRNA_corr.py"
+}
diff --git a/modules/local/benchmarking/rRNA_correlation/templates/rRNA_corr.py b/modules/local/benchmarking/rRNA_correlation/templates/rRNA_corr.py
new file mode 100644
index 000000000..fc0c46bd2
--- /dev/null
+++ b/modules/local/benchmarking/rRNA_correlation/templates/rRNA_corr.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+import numpy as np
+from collections import defaultdict
+
+def process_meta_and_paths(input_string):
+ # 1. Strip the first and last character
+ stripped_input = input_string.replace("[[", "[").replace("]]", "]")
+
+ # 2. Split by "[", resulting in a list
+ split_data = stripped_input.split("[")
+
+ # 3. For every list member, strip all occurrences of "]"
+ cleaned_data = [item.replace("]", "").strip() for item in split_data if item.strip()]
+
+ # 4. For every list member, split by ","
+ parsed_data = [item.split(",") for item in cleaned_data]
+
+ parsed_data = [sublist[:-1] if sublist[-1] == '' else sublist for sublist in parsed_data]
+
+
+ return parsed_data
+
+
+def calculate_favcount_summary_ratio(file_path):
+ with open(file_path.strip(), 'r') as file:
+ lines = file.readlines()
+ assigned = int(lines[1].split()[1])
+ unassigned_multimapping = int(lines[9].split()[1])
+ others_sum = sum(int(lines[i].split()[1]) for i in list(range(2, 9)) + list(range(10, 15)))
+ ratio = (assigned + unassigned_multimapping)
+ #others_sum if others_sum != 0 else 1
+ return ratio
+
+
+
+def getpairs (beds, favcounts):
+ pairs = []
+ for bed in beds:
+ fav = None
+ for f in favcounts:
+ if f[0] == bed[0]:
+ fav = f
+ break
+ pairs.append([bed[0],bed[4],bed[-1],fav[-1]])
+ return pairs
+
+def get_totals(pairs):
+ for i in range(len(pairs)):
+ bed = pairs[i][2].strip(" ")
+ fav = pairs[i][3]
+ with open(bed, 'r') as file:
+ bed_value = sum(1 for line in file)
+ pairs[i][2] = bed_value
+ pairs[i][3] = calculate_favcount_summary_ratio(fav)
+ return pairs
+
+
+def sort_tools(value_pairs):
+ grouped_data = defaultdict(list)
+ for entry in value_pairs:
+ grouped_data[entry[1]].append(entry)
+ return dict(grouped_data)
+
+
+def compute_correlation(value_pairs):
+ results = defaultdict(float)
+ for tool in value_pairs.keys():
+ bed_list, fav_list = zip(*[sublist[2:] for sublist in value_pairs[tool]])
+ correlation = np.corrcoef(bed_list, fav_list)[0,1]
+ results[tool] = correlation
+ return dict(results)
+
+
+
+
+# Input data from Nextflow variables
+real_bed = "$bed_real_list"
+bench_bed = "$bed_bench_list"
+real_rRNA = "$rRNA_real_list"
+bench_rRNA = "$rRNA_bench_list"
+
+real_bed = process_meta_and_paths(real_bed)
+bench_bed = process_meta_and_paths(bench_bed)
+real_rRNA = process_meta_and_paths(real_rRNA)
+bench_rRNA = process_meta_and_paths(bench_rRNA)
+
+real_pairs = getpairs(real_bed,real_rRNA)
+bench_pairs = getpairs(bench_bed,bench_rRNA)
+
+
+real_value_pairs = get_totals(real_pairs)
+bench_value_pairs = get_totals(bench_pairs)
+
+real_value_pairs = sort_tools(real_value_pairs)
+bench_value_pairs = sort_tools(bench_value_pairs)
+
+real_corr = compute_correlation(real_value_pairs)
+bench_corr = compute_correlation(bench_value_pairs)
+
+with open("real_corr.txt", "w") as file:
+ for key, value in real_corr.items():
+ # Remove leading space in the key if required
+ line = f"{key.strip()}:\\t{value}\\n"
+ file.write(line)
+
+with open("bench_corr.txt", "w") as file:
+ for key, value in bench_corr.items():
+ # Remove leading space in the key if required
+ line = f"{key.strip()}:\\t{value}\\n"
+ file.write(line)
+
+#TODO: both relative or both total. both?
+#TODO: both correllations the same can't be!!!
+
diff --git a/modules/local/benchmarking/seq_depth_plot/main.nf b/modules/local/benchmarking/seq_depth_plot/main.nf
new file mode 100644
index 000000000..cdb5c1ed6
--- /dev/null
+++ b/modules/local/benchmarking/seq_depth_plot/main.nf
@@ -0,0 +1,17 @@
+process SEQ_DEPTH_CORRELLATION {
+ tag "$meta.id"
+ label "process_single"
+
+    conda "bioconda::matplotlib=3.5.1 bioconda::seaborn=0.11.2" // NOTE(review): the template imports only numpy, yet the conda env pins matplotlib/seaborn and the container is bionumpy — align all three
+ container 'https://depot.galaxyproject.org/singularity/bionumpy:0.2.12--pyha8f3691_0'
+
+ input:
+ tuple val(meta),
+ path(bed)
+ path(depth)
+ output:
+ path("*.tsv") , emit: report
+ path("versions.yml"), emit: versions
+ script:
+ template "correlation.py"
+}
diff --git a/modules/local/benchmarking/seq_depth_plot/templates/correlation.py b/modules/local/benchmarking/seq_depth_plot/templates/correlation.py
new file mode 100644
index 000000000..78daf86d2
--- /dev/null
+++ b/modules/local/benchmarking/seq_depth_plot/templates/correlation.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+import os
+import re
+import platform
+import numpy as np
+
+meta = "$meta"
+depth = "$depth"
+bed = "$bed"
+
+def calculate_correlation(bed_file_path, depth_file_path):
+ # Read the BED file and store circRNA regions
+ circRNA_regions = []
+ with open(bed_file_path, 'r') as bed_file:
+ for line in bed_file:
+ if line.startswith('#'): continue
+ parts = line.strip().split()
+ chr = parts[0]
+ start = int(parts[1])
+ end = int(parts[2])
+ circRNA_regions.append((chr, start, end))
+
+ # Read the sequence depth file
+ circRNA_presence = []
+ sequencing_depth = []
+
+ with open(depth_file_path, 'r') as depth_file:
+ bed_idx = 0
+ bed_len = len(circRNA_regions)
+
+ for line in depth_file:
+ parts = line.strip().split()
+ chr = parts[0]
+ pos = int(parts[1])
+ depth = int(parts[2])
+
+ # Move bed_idx to the correct region
+ while bed_idx < bed_len and (circRNA_regions[bed_idx][0] < chr or (circRNA_regions[bed_idx][0] == chr and circRNA_regions[bed_idx][2] < pos)):
+ bed_idx += 1
+
+ # Check if the current position is within a circRNA region
+ if bed_idx < bed_len and circRNA_regions[bed_idx][0] == chr and circRNA_regions[bed_idx][1] <= pos <= circRNA_regions[bed_idx][2]:
+ circRNA_presence.append(1)
+ else:
+ circRNA_presence.append(0)
+
+ sequencing_depth.append(depth)
+
+ # Calculate the correlation coefficient
+ correlation = np.corrcoef(sequencing_depth, circRNA_presence)[0, 1]
+
+ return correlation
+
+id = re.search(r'id:(\\w+)', meta).group(1)
+
+with open(depth, 'r') as paths:
+ for path in paths:
+ path = path.strip()
+ basename = os.path.splitext(os.path.basename(path))[0]
+
+ if basename == id:
+ # Calculate correlation
+ corr = calculate_correlation(bed, path)
+ # Write the correlation to stats.txt
+ with open('corr_mqc.tsv', 'w') as outfile:
+ header = "tool\\tpearson_corr\\n"
+ outfile.write(header + meta.split(",")[4][:-1] + '\\t' + str(corr))
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+ """Formats a dictionary to a YAML-like string.
+
+ Args:
+ data (dict): The dictionary to format.
+ indent (int): The current indentation level.
+
+ Returns:
+ str: A string formatted as YAML.
+ """
+ yaml_str = ""
+ for key, value in data.items():
+ spaces = " " * indent
+ if isinstance(value, dict):
+ yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+ else:
+ yaml_str += f"{spaces}{key}: {value}\\n"
+ return yaml_str
+
+versions = {
+ "${task.process}" : {
+ "python": platform.python_version(),
+ "numpy": np.__version__
+ }
+}
+with open("versions.yml", "w") as f:
+ f.write(format_yaml_like(versions))
diff --git a/modules/nf-core/bedtools/genomecov/environment.yml b/modules/nf-core/bedtools/genomecov/environment.yml
new file mode 100644
index 000000000..8fbe20c31
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/environment.yml
@@ -0,0 +1,7 @@
+name: bedtools_genomecov
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - bioconda::bedtools=2.31.1
diff --git a/modules/nf-core/bedtools/genomecov/main.nf b/modules/nf-core/bedtools/genomecov/main.nf
new file mode 100644
index 000000000..954e8913d
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/main.nf
@@ -0,0 +1,74 @@
+process BEDTOOLS_GENOMECOV {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' :
+ 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }"
+
+ input:
+ tuple val(meta), path(intervals), val(scale)
+ path sizes
+ val extension
+ val sort
+
+ output:
+ tuple val(meta), path("*.${extension}"), emit: genomecov
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def args_list = args.tokenize()
+ args += (scale > 0 && scale != 1) ? " -scale $scale" : ""
+ if (!args_list.contains('-bg') && (scale > 0 && scale != 1)) {
+ args += " -bg"
+ }
+ def sort_cmd = sort ? '| bedtools sort' : ''
+
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ if (intervals.name =~ /\.bam/) {
+ """
+ bedtools \\
+ genomecov \\
+ -ibam $intervals \\
+ $args \\
+ $sort_cmd \\
+ > ${prefix}.${extension}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+ } else {
+ """
+ bedtools \\
+ genomecov \\
+ -i $intervals \\
+ -g $sizes \\
+ $args \\
+ $sort_cmd \\
+ > ${prefix}.${extension}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+ }
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.${extension}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/bedtools/genomecov/meta.yml b/modules/nf-core/bedtools/genomecov/meta.yml
new file mode 100644
index 000000000..2b2385e3e
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/meta.yml
@@ -0,0 +1,59 @@
+name: bedtools_genomecov
+description: Computes histograms (default), per-base reports (-d) and BEDGRAPH (-bg) summaries of feature coverage (e.g., aligned sequences) for a given genome.
+keywords:
+ - bed
+ - bam
+ - genomecov
+ - bedtools
+ - histogram
+tools:
+ - bedtools:
+ description: |
+ A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types.
+ documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
+ licence: ["MIT"]
+input:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - intervals:
+ type: file
+ description: BAM/BED/GFF/VCF
+      pattern: "*.{bam,bed,gff,vcf}"
+ - scale:
+      type: float
+      description: Scale factor applied to the coverage values (a float, e.g. 0.5). Set to 1 to disable scaling. Any value other than 1 also enables the -bg bedgraph output format, as this is required for the -scale switch.
+ - sizes:
+ type: file
+ description: Tab-delimited table of chromosome names in the first column and chromosome sizes in the second column
+ - extension:
+ type: string
+      description: Extension of the output file (e.g. ".bg", ".bedgraph", ".txt", ".tab"). It is set arbitrarily by the user and corresponds to the file format, which depends on the arguments used.
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - genomecov:
+ type: file
+ description: Computed genome coverage file
+ pattern: "*.${extension}"
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@edmundmiller"
+ - "@sruthipsuresh"
+ - "@drpatelh"
+ - "@sidorov-si"
+ - "@chris-cheshire"
+maintainers:
+ - "@edmundmiller"
+ - "@sruthipsuresh"
+ - "@drpatelh"
+ - "@sidorov-si"
+ - "@chris-cheshire"
diff --git a/modules/nf-core/bedtools/genomecov/tests/main.nf.test b/modules/nf-core/bedtools/genomecov/tests/main.nf.test
new file mode 100644
index 000000000..8213cff95
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/tests/main.nf.test
@@ -0,0 +1,122 @@
+nextflow_process {
+ name "Test Process BEDTOOLS_GENOMECOV"
+ script "../main.nf"
+ process "BEDTOOLS_GENOMECOV"
+ config "./nextflow.config"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "bedtools"
+ tag "bedtools/genomecov"
+
+ test("sarscov2 - no scale") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true),
+ 1
+ ]
+ // sizes
+ input[1] = []
+ // extension
+ input[2] = "txt"
+ input[3] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("no_scale") }
+ )
+ }
+
+ }
+
+ test("sarscov2 - dummy sizes") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test'],
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true),
+ 0.5
+ ]
+ // sizes
+ input[1] = file('dummy_chromosome_sizes')
+ // extension
+ input[2] = 'txt'
+ input[3] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("dummy_sizes") }
+ )
+ }
+
+ }
+
+ test("sarscov2 - scale") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test'],
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/bed/baits.bed", checkIfExists: true),
+ 0.5
+ ]
+ // sizes
+ input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.sizes", checkIfExists: true)
+ // extension
+ input[2] = 'txt'
+ input[3] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match("scale") }
+ )
+ }
+
+ }
+
+ test("stub") {
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true),
+ 1
+ ]
+ // sizes
+ input[1] = []
+ // extension
+ input[2] = 'txt'
+ input[3] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(file(process.out.genomecov[0][1]).name).match("stub") }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap b/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap
new file mode 100644
index 000000000..8f9191e4c
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap
@@ -0,0 +1,95 @@
+{
+ "dummy_sizes": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.coverage.txt:md5,01291b6e1beab72e046653e709eb0e10"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2"
+ ],
+ "genomecov": [
+ [
+ {
+ "id": "test"
+ },
+ "test.coverage.txt:md5,01291b6e1beab72e046653e709eb0e10"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2"
+ ]
+ }
+ ],
+ "timestamp": "2023-12-05T17:35:58.35232"
+ },
+ "no_scale": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.coverage.txt:md5,66083198daca6c001d328ba9616e9b53"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2"
+ ],
+ "genomecov": [
+ [
+ {
+ "id": "test"
+ },
+ "test.coverage.txt:md5,66083198daca6c001d328ba9616e9b53"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2"
+ ]
+ }
+ ],
+ "timestamp": "2023-12-05T17:35:51.142496"
+ },
+ "stub": {
+ "content": [
+ "test.coverage.txt"
+ ],
+ "timestamp": "2023-12-05T17:36:13.084709"
+ },
+ "scale": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.coverage.txt:md5,de3c59c0ea123bcdbbad27bc0a0a601e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2"
+ ],
+ "genomecov": [
+ [
+ {
+ "id": "test"
+ },
+ "test.coverage.txt:md5,de3c59c0ea123bcdbbad27bc0a0a601e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2"
+ ]
+ }
+ ],
+ "timestamp": "2023-12-05T17:36:05.962006"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/bedtools/genomecov/tests/nextflow.config b/modules/nf-core/bedtools/genomecov/tests/nextflow.config
new file mode 100644
index 000000000..bdb74ae5a
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/tests/nextflow.config
@@ -0,0 +1,7 @@
+process {
+
+ withName: BEDTOOLS_GENOMECOV {
+ ext.prefix = { "${meta.id}.coverage" }
+ }
+
+}
diff --git a/modules/nf-core/bedtools/genomecov/tests/tags.yml b/modules/nf-core/bedtools/genomecov/tests/tags.yml
new file mode 100644
index 000000000..55fce4780
--- /dev/null
+++ b/modules/nf-core/bedtools/genomecov/tests/tags.yml
@@ -0,0 +1,2 @@
+bedtools/genomecov:
+ - "modules/nf-core/bedtools/genomecov/**"
diff --git a/modules/nf-core/bedtools/jaccard/environment.yml b/modules/nf-core/bedtools/jaccard/environment.yml
new file mode 100644
index 000000000..f0f2a38fc
--- /dev/null
+++ b/modules/nf-core/bedtools/jaccard/environment.yml
@@ -0,0 +1,7 @@
+name: bedtools_jaccard
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - bioconda::bedtools=2.31.1
diff --git a/modules/nf-core/bedtools/jaccard/main.nf b/modules/nf-core/bedtools/jaccard/main.nf
new file mode 100644
index 000000000..32a41210d
--- /dev/null
+++ b/modules/nf-core/bedtools/jaccard/main.nf
@@ -0,0 +1,49 @@
+process BEDTOOLS_JACCARD {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' :
+ 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }"
+
+ input:
+ tuple val(meta), path(input_a), path(input_b)
+ tuple val(meta2), path(genome_file)
+
+ output:
+ tuple val(meta), path("*.tsv"), emit: tsv
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def genome = genome_file ? "-g ${genome_file}" : ""
+ """
+ bedtools jaccard \\
+ -a ${input_a} \\
+ -b ${input_b} \\
+ ${genome} \\
+ ${args} \\
+ > ${prefix}.tsv
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.tsv
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/bedtools/jaccard/meta.yml b/modules/nf-core/bedtools/jaccard/meta.yml
new file mode 100644
index 000000000..2b5422841
--- /dev/null
+++ b/modules/nf-core/bedtools/jaccard/meta.yml
@@ -0,0 +1,57 @@
+name: "bedtools_jaccard"
+description: Calculate the Jaccard statistic between two feature files.
+keywords:
+ - vcf
+ - gff
+ - bed
+ - jaccard
+ - intersection
+ - union
+ - statistics
+tools:
+ - "bedtools":
+ description: |
+ A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types.
+      documentation: https://bedtools.readthedocs.io/en/latest/content/tools/jaccard.html
+ licence: ["MIT"]
+input:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - input_a:
+ type: file
+ description: VCF,GFF or BED file to use with the `-a` option
+ pattern: "*.{vcf,vcf.gz,bed,bed.gz,gff}"
+ - input_b:
+ type: file
+ description: VCF,GFF or BED file to use with the `-b` option
+ pattern: "*.{vcf,vcf.gz,bed,bed.gz,gff}"
+ - meta2:
+ type: map
+ description: |
+ Groovy Map containing genome file information
+ e.g. [ id:'test', single_end:false ]
+ - genome_file:
+ type: file
+ description: A file containing all the contigs of the genome used to create the input files
+ pattern: "*.{txt,sizes,fai}"
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+ - tsv:
+ type: file
+ description: TSV file containing the results
+ pattern: "*.tsv"
+authors:
+ - "@nvnieuwk"
+maintainers:
+ - "@nvnieuwk"
diff --git a/modules/nf-core/bedtools/merge/environment.yml b/modules/nf-core/bedtools/merge/environment.yml
new file mode 100644
index 000000000..99707878b
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/environment.yml
@@ -0,0 +1,7 @@
+name: bedtools_merge
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - bioconda::bedtools=2.31.1
diff --git a/modules/nf-core/bedtools/merge/main.nf b/modules/nf-core/bedtools/merge/main.nf
new file mode 100644
index 000000000..5310647de
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/main.nf
@@ -0,0 +1,47 @@
+process BEDTOOLS_MERGE {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' :
+ 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }"
+
+ input:
+ tuple val(meta), path(bed)
+
+ output:
+ tuple val(meta), path('*.bed'), emit: bed
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ if ("$bed" == "${prefix}.bed") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+ """
+ bedtools \\
+ merge \\
+ -i $bed \\
+ $args \\
+ > ${prefix}.bed
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.bed
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/bedtools/merge/meta.yml b/modules/nf-core/bedtools/merge/meta.yml
new file mode 100644
index 000000000..d7463e3db
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/meta.yml
@@ -0,0 +1,45 @@
+name: bedtools_merge
+description: Combines overlapping or "book-ended" features in an interval file into a single feature which spans all of the combined features.
+keywords:
+ - bed
+ - merge
+ - bedtools
+ - overlapped bed
+tools:
+ - bedtools:
+ description: |
+ A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types.
+ documentation: https://bedtools.readthedocs.io/en/latest/content/tools/merge.html
+ licence: ["MIT"]
+input:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - bed:
+ type: file
+ description: Input BED file
+ pattern: "*.{bed}"
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - bed:
+ type: file
+ description: Overlapped bed file with combined features
+ pattern: "*.{bed}"
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@edmundmiller"
+ - "@sruthipsuresh"
+ - "@drpatelh"
+maintainers:
+ - "@edmundmiller"
+ - "@sruthipsuresh"
+ - "@drpatelh"
diff --git a/modules/nf-core/bedtools/merge/tests/main.nf.test b/modules/nf-core/bedtools/merge/tests/main.nf.test
new file mode 100644
index 000000000..95dba8e54
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/tests/main.nf.test
@@ -0,0 +1,34 @@
+nextflow_process {
+
+ name "Test Process BEDTOOLS_MERGE"
+ script "../main.nf"
+ config "./nextflow.config"
+ process "BEDTOOLS_MERGE"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "bedtools"
+ tag "bedtools/merge"
+
+ test("test_bedtools_merge") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test'],
+ file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/modules/nf-core/bedtools/merge/tests/main.nf.test.snap b/modules/nf-core/bedtools/merge/tests/main.nf.test.snap
new file mode 100644
index 000000000..ee6c4e634
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/tests/main.nf.test.snap
@@ -0,0 +1,35 @@
+{
+ "test_bedtools_merge": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test_out.bed:md5,0cf6ed2b6f470cd44a247da74ca4fe4e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,2d134badb4cd1e4e903696c7967f28d6"
+ ],
+ "bed": [
+ [
+ {
+ "id": "test"
+ },
+ "test_out.bed:md5,0cf6ed2b6f470cd44a247da74ca4fe4e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,2d134badb4cd1e4e903696c7967f28d6"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-03-18T17:07:09.721153"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/bedtools/merge/tests/nextflow.config b/modules/nf-core/bedtools/merge/tests/nextflow.config
new file mode 100644
index 000000000..16444e982
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/tests/nextflow.config
@@ -0,0 +1,7 @@
+process {
+
+ withName: BEDTOOLS_MERGE {
+ ext.prefix = { "${meta.id}_out" }
+ }
+
+}
diff --git a/modules/nf-core/bedtools/merge/tests/tags.yml b/modules/nf-core/bedtools/merge/tests/tags.yml
new file mode 100644
index 000000000..60c8cad1c
--- /dev/null
+++ b/modules/nf-core/bedtools/merge/tests/tags.yml
@@ -0,0 +1,2 @@
+bedtools/merge:
+ - "modules/nf-core/bedtools/merge/**"
diff --git a/modules/nf-core/subread/featurecounts/environment.yml b/modules/nf-core/subread/featurecounts/environment.yml
new file mode 100644
index 000000000..fdb575020
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/environment.yml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - bioconda::subread=2.0.6
diff --git a/modules/nf-core/subread/featurecounts/main.nf b/modules/nf-core/subread/featurecounts/main.nf
new file mode 100644
index 000000000..c0c9de4d9
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/main.nf
@@ -0,0 +1,57 @@
+process SUBREAD_FEATURECOUNTS {
+ tag "$meta.id"
+ label 'process_medium'
+
+ conda "${moduleDir}/environment.yml"
+ container 'oras://community.wave.seqera.io/library/subread:2.0.6--2dd2dd526de026fd'
+
+ input:
+ tuple val(meta), path(bams), path(annotation)
+
+ output:
+ tuple val(meta), path("*featureCounts.txt") , emit: counts
+ tuple val(meta), path("*featureCounts.txt.summary"), emit: summary
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def paired_end = meta.single_end ? '' : '-p'
+
+ def strandedness = 0
+ if (meta.strandedness == 'forward') {
+ strandedness = 1
+ } else if (meta.strandedness == 'reverse') {
+ strandedness = 2
+ }
+ """
+ featureCounts \\
+ $args \\
+ $paired_end \\
+ -T $task.cpus \\
+ -a $annotation \\
+ -s $strandedness \\
+ -o ${prefix}.featureCounts.txt \\
+ ${bams.join(' ')}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ subread: \$( echo \$(featureCounts -v 2>&1) | sed -e "s/featureCounts v//g")
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.featureCounts.txt
+ touch ${prefix}.featureCounts.txt.summary
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ subread: \$( echo \$(featureCounts -v 2>&1) | sed -e "s/featureCounts v//g")
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/subread/featurecounts/meta.yml b/modules/nf-core/subread/featurecounts/meta.yml
new file mode 100644
index 000000000..a9b045b7d
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/meta.yml
@@ -0,0 +1,62 @@
+name: subread_featurecounts
+description: Count reads that map to genomic features
+keywords:
+ - counts
+ - fasta
+ - genome
+ - reference
+tools:
+ - featurecounts:
+ description: featureCounts is a highly efficient general-purpose read summarization
+ program that counts mapped reads for genomic features such as genes, exons,
+ promoter, gene bodies, genomic bins and chromosomal locations. It can be used
+ to count both RNA-seq and genomic DNA-seq reads.
+ homepage: http://bioinf.wehi.edu.au/featureCounts/
+ documentation: http://bioinf.wehi.edu.au/subread-package/SubreadUsersGuide.pdf
+ doi: "10.1093/bioinformatics/btt656"
+ licence: ["GPL v3"]
+ identifier: biotools:subread
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - bams:
+ type: file
+ description: BAM files containing mapped reads
+ pattern: "*.bam"
+ - annotation:
+ type: file
+ description: Genomic features annotation in GTF or SAF
+ pattern: "*.{gtf,saf}"
+output:
+ - counts:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*featureCounts.txt":
+ type: file
+ description: Counts of reads mapping to features
+ pattern: "*featureCounts.txt"
+ - summary:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*featureCounts.txt.summary":
+ type: file
+ description: Summary log file
+ pattern: "*.featureCounts.txt.summary"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@ntoda03"
+maintainers:
+ - "@ntoda03"
diff --git a/modules/nf-core/subread/featurecounts/tests/main.nf.test b/modules/nf-core/subread/featurecounts/tests/main.nf.test
new file mode 100644
index 000000000..3b95da33f
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/tests/main.nf.test
@@ -0,0 +1,155 @@
+nextflow_process {
+
+ name "Test Process SUBREAD_FEATURECOUNTS"
+ script "../main.nf"
+ process "SUBREAD_FEATURECOUNTS"
+ config "./nextflow.config"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "subread"
+ tag "subread/featurecounts"
+
+ test("sarscov2 [bam] - forward") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true, strandedness:'forward' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true),
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.counts).match("forward_counts") },
+ { assert snapshot(process.out.summary).match("forward_summary") },
+ { assert snapshot(process.out.versions).match("forward_versions") }
+ )
+ }
+ }
+
+ test("sarscov2 [bam] - forward - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true, strandedness:'forward' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true),
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("sarscov2 [bam] - reverse") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true, strandedness:'reverse' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true),
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.counts).match("reverse_counts") },
+ { assert snapshot(process.out.summary).match("reverse_summary") },
+ { assert snapshot(process.out.versions).match("reverse_versions") }
+ )
+ }
+ }
+
+ test("sarscov2 [bam] - reverse - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true, strandedness:'reverse' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true),
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+
+ test("sarscov2 [bam] - unstranded") {
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true, strandedness:'unstranded' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true),
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.counts).match("unstranded_counts") },
+ { assert snapshot(process.out.summary).match("unstranded_summary") },
+ { assert snapshot(process.out.versions).match("unstranded_versions") }
+ )
+ }
+ }
+
+ test("sarscov2 [bam] - unstranded - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true, strandedness:'unstranded' ], // meta map
+ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true),
+ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap b/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap
new file mode 100644
index 000000000..5a743f0ce
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap
@@ -0,0 +1,323 @@
+{
+ "forward_counts": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "forward"
+ },
+ "test.featureCounts.txt:md5,4cf89f0e702ba9abef3fa571e68fe8f0"
+ ]
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:19:47.695388295"
+ },
+ "unstranded_counts": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "unstranded"
+ },
+ "test.featureCounts.txt:md5,c4ef2c2a80547fbb3074331bc0a1bda3"
+ ]
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:21:03.208334705"
+ },
+ "reverse_summary": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "reverse"
+ },
+ "test.featureCounts.txt.summary:md5,7cfa30ad678b9bc1bc63afbb0281547b"
+ ]
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.2"
+ },
+ "timestamp": "2023-11-23T15:50:25.168206514"
+ },
+ "sarscov2 [bam] - forward - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "forward"
+ },
+ "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "forward"
+ },
+ "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ],
+ "counts": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "forward"
+ },
+ "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "summary": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "forward"
+ },
+ "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:20:10.040191252"
+ },
+ "sarscov2 [bam] - reverse - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "reverse"
+ },
+ "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "reverse"
+ },
+ "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ],
+ "counts": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "reverse"
+ },
+ "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "summary": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "reverse"
+ },
+ "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:20:42.708101743"
+ },
+ "reverse_counts": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "reverse"
+ },
+ "test.featureCounts.txt:md5,a7d8843ebc12d855c2e68d3e2e137582"
+ ]
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:20:24.490755916"
+ },
+ "sarscov2 [bam] - unstranded - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "unstranded"
+ },
+ "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "unstranded"
+ },
+ "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ],
+ "counts": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "unstranded"
+ },
+ "test.featureCounts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "summary": [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "unstranded"
+ },
+ "test.featureCounts.txt.summary:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:21:19.244837771"
+ },
+ "forward_summary": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "forward"
+ },
+ "test.featureCounts.txt.summary:md5,8f602ff9a8ef467af43294e80b367cdf"
+ ]
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.2"
+ },
+ "timestamp": "2023-11-23T15:50:10.699024934"
+ },
+ "forward_versions": {
+ "content": [
+ [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:19:48.001158764"
+ },
+ "unstranded_summary": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true,
+ "strandedness": "unstranded"
+ },
+ "test.featureCounts.txt.summary:md5,23164b79f9f23f11c82820db61a35560"
+ ]
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "24.04.2"
+ },
+ "timestamp": "2023-11-23T15:50:38.68776235"
+ },
+ "reverse_versions": {
+ "content": [
+ [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:20:24.551053149"
+ },
+ "unstranded_versions": {
+ "content": [
+ [
+ "versions.yml:md5,956ebb9da6a1747fc47d393fb5931fc2"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-10-18T10:21:03.25895568"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/subread/featurecounts/tests/nextflow.config b/modules/nf-core/subread/featurecounts/tests/nextflow.config
new file mode 100644
index 000000000..d9fd4fd57
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/tests/nextflow.config
@@ -0,0 +1,9 @@
+process {
+
+ publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+ withName: SUBREAD_FEATURECOUNTS {
+ ext.args = '-t CDS'
+ }
+
+}
diff --git a/modules/nf-core/subread/featurecounts/tests/tags.yml b/modules/nf-core/subread/featurecounts/tests/tags.yml
new file mode 100644
index 000000000..6d2534bf4
--- /dev/null
+++ b/modules/nf-core/subread/featurecounts/tests/tags.yml
@@ -0,0 +1,2 @@
+subread/featurecounts:
+ - modules/nf-core/subread/featurecounts/**
diff --git a/nextflow.config b/nextflow.config
index ff6b0bbcb..1f3b85a27 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -22,6 +22,7 @@ params {
tool_filter = 1
exon_boundary = 0
save_intermediates = false
+ benchmarking = false
// References
genome = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ee8ea89c8..268428f74 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -139,6 +139,13 @@
"help_text": "During annotation, if one of the start or end position of a circular candidate imperfectly overlaps an exon boundary, the script will consider positions within 'exon_boundary' (default 0bp) to be an exonic circRNA. If they fall outside of this range, the candidate is assumed to be an exonic-intronic circRNA, and the entire underlying sequence is taken for miRNA analysis, as opposed to just the exonic sequences for canonical exonic circRNAs. ",
"default": 0
},
+ "benchmarking": {
+ "type": "boolean",
+ "description": "Run the pipeline in benchmarking mode.",
+ "default": false,
+ "fa_icon": "fas fa-chart-line",
+ "help_text": "This enables parallel analysis of polyA-enriched data with the same circRNA quantification tools to compare results."
+ },
"mirna_tools": {
"type": "string",
"fa_icon": "fas fa-wrench",
diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf
index dea819a65..e14290c55 100644
--- a/subworkflows/local/annotation.nf
+++ b/subworkflows/local/annotation.nf
@@ -1,6 +1,5 @@
include { BEDTOOLS_INTERSECT as INTERSECT_GTF } from '../../modules/nf-core/bedtools/intersect'
include { GAWK as INGEST_DATABASE_NAMES } from '../../modules/nf-core/gawk'
-include { GNU_SORT as COMBINE_DATABASES } from '../../modules/nf-core/gnu/sort'
include { BEDTOOLS_INTERSECT as INTERSECT_DATABASE } from '../../modules/nf-core/bedtools/intersect'
include { ANNOTATION as ANNOTATE } from '../../modules/local/annotation'
diff --git a/subworkflows/local/benchmarking.nf b/subworkflows/local/benchmarking.nf
new file mode 100644
index 000000000..7df2181fa
--- /dev/null
+++ b/subworkflows/local/benchmarking.nf
@@ -0,0 +1,185 @@
+include { GNU_SORT as SORT } from '../../modules/nf-core/gnu/sort'
+include { BEDTOOLS_MERGE } from '../../modules/nf-core/bedtools/merge'
+include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect'
+include { BEDTOOLS_JACCARD } from '../../modules/nf-core/bedtools/jaccard'
+include { BEDTOOLS_GENOMECOV } from '../../modules/nf-core/bedtools/genomecov'
+include { BENCHMARKING_MULTIQC as JACCARD_MULTIQC } from '../../modules/local/benchmarking/multiqc'
+include { BENCHMARKING_MULTIQC as CORRELATION_MULTIQC } from '../../modules/local/benchmarking/multiqc'
+include { PNG_JSON as LOCATION_JSON } from '../../modules/local/benchmarking/png_json'
+include { PNG_JSON as OVERLAP_JSON } from '../../modules/local/benchmarking/png_json'
+include { LOCATION_PLOT } from '../../modules/local/benchmarking/location_plots'
+include { OVERLAP_PLOT } from '../../modules/local/benchmarking/overlap_plot'
+include { SEQ_DEPTH_CORRELLATION } from '../../modules/local/benchmarking/seq_depth_plot'
+include { AVERAGE_TSV } from '../../modules/local/benchmarking/average_tsv'
+include { SUBREAD_FEATURECOUNTS as FEATURECOUNTS_TOTAL } from '../../modules/nf-core/subread/featurecounts'
+include { SUBREAD_FEATURECOUNTS as FEATURECOUNTS_BENCHMARKING } from '../../modules/nf-core/subread/featurecounts'
+include { STAR2PASS as STAR2PASS_REAL } from './detection_tools/star2pass'
+include { STAR2PASS as STAR2PASS_BENCHMARKING } from './detection_tools/star2pass'
+include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate'
+include { DECOMPRESS_READS as DECOMPRESS_REAL } from '../../modules/local/benchmarking/decompress_reads'
+include { DECOMPRESS_READS as DECOMPRESS_BENCHMARKING } from '../../modules/local/benchmarking/decompress_reads'
+include { RRNA_CORRELATION } from '../../modules/local/benchmarking/rRNA_correlation'
+
+//
+// Compare circRNA detection between total-RNA ("real") and polyA-enriched
+// ("benchmarking") libraries: per-tool overlap, genomic location, jaccard
+// similarity, sequencing-depth correlation and rRNA-content correlation.
+//
+workflow BENCHMARKING {
+
+    take:
+    ch_reads_real          // channel: [ meta, [ reads ] ] - total RNA fastq files
+    ch_reads_benchmarking  // channel: [ meta, [ reads ] ] - polyA fastq files
+    ch_real_bed            // channel: [ meta, bed ] - BSJ calls per sample/tool (total RNA)
+    ch_benchmarking_bed    // channel: [ meta, bed ] - BSJ calls per sample/tool (polyA)
+    ch_real_bam            // channel: [ meta, bam ] - STAR 2-pass alignments (total RNA)
+    ch_benchmarking_bam    // channel: [ meta, bam ] - STAR 2-pass alignments (polyA)
+    ch_trim_report         // channel: trimming reports (currently unused here)
+    ch_fasta               // channel: [ meta, fasta ] - reference genome
+    bsj_reads              // val: minimum BSJ-spanning reads (forwarded to STAR2PASS)
+
+    main:
+    ch_versions = Channel.empty()
+
+    // TODO(review): server-local reference paths are hard-coded; promote these
+    // to pipeline parameters before running anywhere else.
+    ch_benchmark_gtf = "/nfs/data3/CIRCEST/runs/test_benchmarking/gencode.v47.primary_assembly.annotation.gtf"
+    ch_benchmarking_fasta = "/nfs/data3/CIRCEST/runs/test_benchmarking/gencode.v47.rRNARNA_transcripts.fa"
+    ch_filter_gtf = "/nfs/data3/CIRCEST/runs/test_benchmarking/ensembl_rRNA.gtf"
+
+    //
+    // rRNA content estimation: re-align both read sets and count rRNA features
+    //
+    STAR_GENOMEGENERATE(ch_fasta, tuple([id: "benchmarking_gtf"], file(ch_benchmark_gtf)))
+    ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
+    star_index = params.star ? Channel.value([[id: "star"], file(params.star, checkIfExists: true)]) : STAR_GENOMEGENERATE.out.index.collect()
+    star_ignore_sjdbgtf = true
+    seq_center = params.seq_center ?: ''
+    seq_platform = ''
+
+    // Reads arrive as [ meta, [ r1, r2 ] ]; flatten to [ meta, r1, r2 ] and
+    // decompress (NOTE(review): presumably a downstream tool needs plain
+    // fastq - confirm against the DECOMPRESS_READS module).
+    ch_reads_real_restructured = ch_reads_real.map { meta, fastq_gz_list ->
+        tuple(meta, fastq_gz_list[0], fastq_gz_list[1])
+    }
+    ch_reads_benchmarking_restructured = ch_reads_benchmarking.map { meta, fastq_gz_list ->
+        tuple(meta, fastq_gz_list[0], fastq_gz_list[1])
+    }
+    ch_uncompressed_reads_real = DECOMPRESS_REAL(ch_reads_real_restructured)
+    ch_uncompressed_reads_benchmarking = DECOMPRESS_BENCHMARKING(ch_reads_benchmarking_restructured)
+
+    ch_rRNA_real_bam = STAR2PASS_REAL(ch_uncompressed_reads_real, star_index, tuple([id: "Benchmarking_gtf"], file(ch_benchmark_gtf)), bsj_reads, star_ignore_sjdbgtf, seq_center, seq_platform).bam
+    ch_rRNA_benchmarking_bam = STAR2PASS_BENCHMARKING(ch_uncompressed_reads_benchmarking, star_index, tuple([id: "Benchmarking_gtf"], file(ch_benchmark_gtf)), bsj_reads, star_ignore_sjdbgtf, seq_center, seq_platform).bam
+    ch_versions = ch_versions.mix(STAR2PASS_REAL.out.versions)
+
+    // Count rRNA features on the BSJ-detection alignments
+    ch_rRNA_real_input = ch_real_bam.map { meta, path ->
+        tuple(meta, path, file(ch_filter_gtf))
+    }
+    ch_rRNA_benchmarking_input = ch_benchmarking_bam.map { meta, path ->
+        tuple(meta, path, file(ch_filter_gtf))
+    }
+
+    ch_rRNA_real = FEATURECOUNTS_TOTAL(ch_rRNA_real_input).summary
+    ch_rRNA_benchmarking = FEATURECOUNTS_BENCHMARKING(ch_rRNA_benchmarking_input).summary
+    ch_versions = ch_versions.mix(FEATURECOUNTS_TOTAL.out.versions)
+
+    RRNA_CORRELATION(
+        ch_real_bed.collect(),
+        ch_benchmarking_bed.collect(),
+        ch_rRNA_real.collect(),
+        ch_rRNA_benchmarking.collect()
+    )
+
+    //
+    // Data preparation: pool per-sample calls per tool, then sort and merge
+    //
+    ch_all = ch_real_bed.mix(ch_benchmarking_bed)
+        .map{ meta, bed -> [[id: meta.tool + "_" + (meta.benchmarking ? "benchmarking" : "real"),
+                            tool: meta.tool,
+                            benchmarking: meta.benchmarking], bed]}
+        .groupTuple()
+
+    SORT(ch_all)
+    ch_versions = ch_versions.mix(SORT.out.versions)
+
+    BEDTOOLS_MERGE(SORT.out.sorted).bed.branch{ meta, bed ->
+        real: !meta.benchmarking
+        benchmarking: meta.benchmarking
+    }.set { ch_merged }
+    ch_versions = ch_versions.mix(BEDTOOLS_MERGE.out.versions)
+
+    // Pair each tool's merged real and benchmarking BED files
+    ch_joined = ch_merged.real.map{ meta, bed -> [[id: meta.tool], bed]}
+        .join(ch_merged.benchmarking.map{ meta, bed -> [[id: meta.tool], bed]})
+
+    // Overlap plot
+    ch_intersect = BEDTOOLS_INTERSECT(ch_joined, [[], []])
+    OVERLAP_PLOT(ch_intersect.intersect)
+    OVERLAP_JSON(OVERLAP_PLOT.out.plots, "Overlap plots", "Plot the overlap circRNAs found in total and polyA data for the tools")
+    ch_versions = ch_versions.mix(BEDTOOLS_INTERSECT.out.versions)
+    ch_versions = ch_versions.mix(OVERLAP_PLOT.out.versions)
+    ch_versions = ch_versions.mix(OVERLAP_JSON.out.versions)
+
+    // Location plot workflow
+    LOCATION_PLOT(ch_joined)
+    LOCATION_JSON(LOCATION_PLOT.out.plots, "Location plots", "Plots the location of the circRNAs found")
+    ch_versions = ch_versions.mix(LOCATION_PLOT.out.versions)
+    ch_versions = ch_versions.mix(LOCATION_JSON.out.versions)
+
+    // Pearson correlation workflow
+    // BUGFIX(review): the previous meta/path recombination crossed every meta
+    // with every BAM path; keep each [ meta, bam ] pair intact instead.
+    ch_scale = Channel.value(1)
+    ch_genomecov_inputs = ch_real_bam.combine(ch_scale)
+
+    ch_genomecov = BEDTOOLS_GENOMECOV(ch_genomecov_inputs, [], "bg", false)
+    ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV.out.versions)
+
+    ch_seqdepths = ch_genomecov.genomecov
+        .map { genomecov_result -> genomecov_result[1].toString() }
+        .collectFile(name: 'genomecov_paths.txt',
+                     newLine: true)
+
+    ch_corr = SEQ_DEPTH_CORRELLATION(ch_real_bed, ch_seqdepths.collect()).report
+    ch_versions = ch_versions.mix(SEQ_DEPTH_CORRELLATION.out.versions)
+
+    ch_pearson = ch_corr.splitCsv(header: true, sep: "\t")
+        .map{ values -> [values.tool, values.pearson_corr]}
+        .collectFile( newLine: true,
+                      storeDir: params.outdir,
+                      seed: "tool\tpearson_corr") {
+                          row -> ["pearson.tsv", row.join("\t")]
+                      }
+
+    AVERAGE_TSV(ch_pearson)
+    CORRELATION_MULTIQC(AVERAGE_TSV.out.tsv)
+    ch_versions = ch_versions.mix(AVERAGE_TSV.out.versions)
+    ch_versions = ch_versions.mix(CORRELATION_MULTIQC.out.versions)
+
+    // Jaccard workflow
+    ch_jaccard = BEDTOOLS_JACCARD(ch_joined, [[], []]).tsv
+    ch_versions = ch_versions.mix(BEDTOOLS_JACCARD.out.versions)
+
+    ch_stats = ch_jaccard.splitCsv(header: true, sep: "\t")
+        .map{ meta, values -> [meta.id, values.intersection, values.union, values.jaccard, values.n_intersections]}
+        .collectFile( newLine: true,
+                      storeDir: params.outdir,
+                      seed: "tool\tintersection\tunion\tjaccard\tn_intersections") {
+                          row -> ["jaccard.tsv", row.join("\t")]
+                      }
+
+    JACCARD_MULTIQC(ch_stats)
+    ch_versions = ch_versions.mix(JACCARD_MULTIQC.out.versions)
+
+    // Combine MultiQC report fragments from all benchmarking branches
+    ch_reports = JACCARD_MULTIQC.out.report
+        .mix(LOCATION_JSON.out.report)
+        .mix(OVERLAP_JSON.out.report)
+        .mix(CORRELATION_MULTIQC.out.report)
+
+    emit:
+    reports  = ch_reports
+    versions = ch_versions // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/bsj_detection.nf b/subworkflows/local/bsj_detection.nf
index 89ddec4ed..d85bf3304 100644
--- a/subworkflows/local/bsj_detection.nf
+++ b/subworkflows/local/bsj_detection.nf
@@ -229,11 +229,13 @@ workflow BSJ_DETECTION {
)
emit:
- bed = ch_bsj_bed_combined
- bed12 = ch_bsj_bed12_combined
- gtf = ch_bsj_gtf_combined
- fasta = ch_bsj_fasta_combined
-
- multiqc_files = ch_multiqc_files
- versions = ch_versions
+ bed = ch_bsj_bed_combined
+ bed12 = ch_bsj_bed12_combined
+ gtf = ch_bsj_gtf_combined
+ fasta = ch_bsj_fasta_combined
+ star_bam = STAR2PASS.out.bam
+ bed_per_sample_tool = ch_bsj_bed_per_sample_tool
+
+ multiqc_files = ch_multiqc_files
+ versions = ch_versions
}
diff --git a/subworkflows/local/detection_tools/star2pass.nf b/subworkflows/local/detection_tools/star2pass.nf
index 4e216545b..4545c909b 100644
--- a/subworkflows/local/detection_tools/star2pass.nf
+++ b/subworkflows/local/detection_tools/star2pass.nf
@@ -29,6 +29,7 @@ workflow STAR2PASS {
junction = PASS_2.out.junction
sam = PASS_2.out.sam
tab = PASS_2.out.tab
+ bam = PASS_2.out.bam
versions = ch_versions
}
diff --git a/subworkflows/local/mirna_prediction.nf b/subworkflows/local/mirna_prediction.nf
index b3fc46cc0..1b6b046bb 100644
--- a/subworkflows/local/mirna_prediction.nf
+++ b/subworkflows/local/mirna_prediction.nf
@@ -1,8 +1,7 @@
// MODULES
-include { BIOAWK as ADD_BACKSPLICE } from '../../modules/nf-core/bioawk'
-include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization'
-include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering'
-include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations'
+include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization'
+include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering'
+include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations'
// SUBWORKFLOWS
include { MIRNA_BINDINGSITES } from './mirna/mirna_bindingsites'
diff --git a/variables_output.txt b/variables_output.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/workflows/circrna/main.nf b/workflows/circrna/main.nf
index 048b4da34..3713e38e9 100644
--- a/workflows/circrna/main.nf
+++ b/workflows/circrna/main.nf
@@ -22,17 +22,19 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil
*/
// SUBWORKFLOWS:
-include { paramsSummaryMap } from 'plugin/nf-validation'
-include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
-include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline'
-
-include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
-include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome'
-include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection'
-include { ANNOTATION } from '../../subworkflows/local/annotation'
-include { QUANTIFICATION } from '../../subworkflows/local/quantification'
-include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction'
-include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests'
+include { paramsSummaryMap } from 'plugin/nf-validation'
+include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
+include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline'
+
+include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
+include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome'
+include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection'
+include { BSJ_DETECTION as BSJ_DETECTION_BENCHMARKING } from '../../subworkflows/local/bsj_detection'
+include { BENCHMARKING } from '../../subworkflows/local/benchmarking'
+include { ANNOTATION } from '../../subworkflows/local/annotation'
+include { QUANTIFICATION } from '../../subworkflows/local/quantification'
+include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction'
+include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -136,8 +138,13 @@ workflow CIRCRNA {
// 2. BSJ Discovery
//
+ FASTQC_TRIMGALORE.out.reads.branch{
+ real: !it[0].benchmarking
+ benchmarking: it[0].benchmarking
+ }.set{ ch_reads }
+
BSJ_DETECTION(
- FASTQC_TRIMGALORE.out.reads,
+ ch_reads.real,
ch_fasta,
ch_gtf,
ch_annotation,
@@ -155,11 +162,50 @@
     ch_versions = ch_versions.mix(BSJ_DETECTION.out.versions)
     //
-    // 3. circRNA quantification
+    // 3. Benchmarking
+    //
+
+    if (params.benchmarking) {
+        BSJ_DETECTION_BENCHMARKING(
+            ch_reads.benchmarking,
+            ch_fasta,
+            ch_gtf,
+            ch_annotation,
+            bowtie_index,
+            bowtie2_index,
+            bwa_index,
+            chromosomes,
+            hisat2_index,
+            star_index,
+            params.bsj_reads,
+            params.exon_boundary
+        )
+
+        BENCHMARKING(
+            ch_reads.real,
+            ch_reads.benchmarking,
+            BSJ_DETECTION.out.bed_per_sample_tool,
+            BSJ_DETECTION_BENCHMARKING.out.bed_per_sample_tool,
+            BSJ_DETECTION.out.star_bam,
+            BSJ_DETECTION_BENCHMARKING.out.star_bam,
+            FASTQC_TRIMGALORE.out.trim_log,
+            ch_fasta,
+            params.bsj_reads
+        )
+
+        ch_multiqc_files = ch_multiqc_files.mix(BENCHMARKING.out.reports)
+        ch_versions = ch_versions.mix(BSJ_DETECTION_BENCHMARKING.out.versions)
+        // BUGFIX(review): versions emitted by the BENCHMARKING subworkflow
+        // were previously dropped from the software-versions report.
+        ch_versions = ch_versions.mix(BENCHMARKING.out.versions)
+    }
+
+    //
+    // 4. Quantification
     //
     QUANTIFICATION(
         ch_gtf,
         ch_fasta,
-        FASTQC_TRIMGALORE.out.reads,
+        ch_reads.real,
         BSJ_DETECTION.out.bed12,
         BSJ_DETECTION.out.gtf,
         params.bootstrap_samples,
@@ -172,7 +215,7 @@ workflow CIRCRNA {
ch_versions = ch_versions.mix(QUANTIFICATION.out.versions)
//
- // 4. miRNA prediction
+ // 5. miRNA prediction
//
if (params.mature) {