diff --git a/bin/extract_sample.py b/bin/extract_sample.py new file mode 100644 index 00000000..0fb2c2bd --- /dev/null +++ b/bin/extract_sample.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import errno +import os +import sys +from pathlib import Path +import pandas as pd + + +def parse_args(args=None): + Description = "Extract sample information from an experiment design file" + Epilog = "Example usage: python extract_sample.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("EXP", help="Expdesign file to be extracted") + return parser.parse_args(args) + + +def extract_sample(expdesign): + data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) + fTable = data.dropna() + + # two table format + with open(expdesign, "r") as f: + lines = f.readlines() + empty_row = lines.index("\n") + s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1:]][1:] + s_header = lines[empty_row + 1].replace("\n", "").split("\t") + s_DataFrame = pd.DataFrame(s_table, columns=s_header) + + sample_dt = pd.DataFrame() + if "MSstats_Mixture" not in s_DataFrame.columns: + fTable = fTable[["Spectra_Filepath", "Sample"]] + fTable.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) + else: + fTable.drop_duplicates(subset=["Spectra_Filepath"], inplace=True) + for _, row in fTable.iterrows(): + mixture_id = s_DataFrame[s_DataFrame["Sample"] == row["Sample"]]["MSstats_Mixture"] + sample_dt = sample_dt.append({"Spectra_Filepath": row["Spectra_Filepath"], "Sample": mixture_id}, + ignore_index=True) + sample_dt.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) + + +def main(args=None): + args = parse_args(args) + extract_sample(args.EXP) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py new file mode 100644 index 00000000..7c607bc4 --- /dev/null +++ b/bin/ms2rescore_cli.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python +# Written by Jonas Scheid under the MIT license + +import sys +import click +import importlib.resources +import json +import logging +from typing import List + +import pandas as pd + +from ms2rescore import rescore, package_data +from psm_utils.io.idxml import IdXMLReader, IdXMLWriter +from psm_utils import PSMList +import pyopenms as oms + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + + +def parse_cli_arguments_to_config(**kwargs): + """Update default MS²Rescore config with CLI arguments""" + config = json.load(importlib.resources.open_text(package_data, "config_default.json")) + + for key, value in kwargs.items(): + # Skip these arguments since they need to set in a nested dict of feature_generators + if key in ["ms2pip_model", "ms2_tolerance", "rng", "calibration_set_size"]: + continue + + elif key == "feature_generators": + feature_generators = value.split(",") + # Reset feature generator dict since there might be default generators we don't want + config["ms2rescore"]["feature_generators"] = {} + if "basic" in feature_generators: + config["ms2rescore"]["feature_generators"]["basic"] = {} + if "ms2pip" in feature_generators: + config["ms2rescore"]["feature_generators"]["ms2pip"] = { + "model": kwargs["ms2pip_model"], + "ms2_tolerance": kwargs["ms2_tolerance"], + } + if "deeplc" in feature_generators: + config["ms2rescore"]["feature_generators"]["deeplc"] = { + "deeplc_retrain": False, + "calibration_set_size": kwargs["calibration_set_size"], + } + if "maxquant" in feature_generators: + 
config["ms2rescore"]["feature_generators"]["maxquant"] = {} + if "ionmob" in feature_generators: + config["ms2rescore"]["feature_generators"]["ionmob"] = {} + + elif key == "rescoring_engine": + # Reset rescoring engine dict we want to allow only computing features + config["ms2rescore"]["rescoring_engine"] = {} + if value == "mokapot": + config["ms2rescore"]["rescoring_engine"]["mokapot"] = { + "write_weights": True, + "write_txt": False, + "write_flashlfq": False, + "rng": kwargs["rng"], + "test_fdr": kwargs["test_fdr"], + "max_workers": kwargs["processes"], + } + if value == "percolator": + logging.info( + "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step." + ) + continue + + else: + config["ms2rescore"][key] = value + + return config + + +def rescore_idxml(input_file, output_file, config) -> None: + """Rescore PSMs in an idXML file and keep other information unchanged.""" + # Read PSMs + reader = IdXMLReader(input_file) + psm_list = reader.read_file() + + # Rescore + rescore(config, psm_list) + + # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators + peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids) + + # Write + writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered) + writer.write_file(psm_list) + + +def filter_out_artifact_psms( + psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] +) -> List[oms.PeptideIdentification]: + """Filter out PeptideHits that could not be processed by all feature generators""" + num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list]) + new_psm_list = PSMList(psm_list=[psm for psm in psm_list if len(psm.rescoring_features) == num_mandatory_features]) + + # get differing peptidoforms of both psm lists + psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in psm_list]) + new_psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list]) + not_supported_peptides = psm_list_peptides - new_psm_list_peptides + + # no need to filter if all peptides are supported + if len(not_supported_peptides) == 0: + return peptide_ids + # Create new peptide ids and filter out not supported peptides + new_peptide_ids = [] + for peptide_id in peptide_ids: + new_hits = [] + for hit in peptide_id.getHits(): + if hit.getSequence().toString() in not_supported_peptides: + continue + new_hits.append(hit) + if len(new_hits) == 0: + continue + peptide_id.setHits(new_hits) + new_peptide_ids.append(peptide_id) + logging.info( + f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. 
Peptides not supported: {not_supported_peptides}" + ) + return new_peptide_ids + + +@click.command() +@click.option( + "-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True +) +@click.option( + "-s", + "--spectrum_path", + help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", + required=True, +) +@click.option( + "-o", "--output_path", help="Path and stem for output file names (default: derive from identification file)" +) +@click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info") +@click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=16) +@click.option("-f", "--fasta_file", help="Path to FASTA file") +@click.option("-t", "--test_fdr", help="The false-discovery rate threshold at which to evaluate the learned models. (default: 0.05)", default=0.05) +@click.option( + "-fg", + "--feature_generators", + help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", + default="", +) +@click.option("-pipm", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", type=str, default="Immuno-HCD") +@click.option( + "-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=float, default=0.02 +) +@click.option( + "-cs", + "--calibration_set_size", + help="Percentage of number of calibration set for DeepLC (default: `0.15`)", + default=0.15, +) +@click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot") +@click.option( + "-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711 +) +@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `DECOY_`)", default="^DECOY_") +@click.option( + "-lsb", + "--lower_score_is_better", + help="Interpretation of primary search engine score (default: True)", + default=True, +) +def main(**kwargs): + config = parse_cli_arguments_to_config(**kwargs) + logging.info("MS²Rescore config:") + logging.info(config) + rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/psm_conversion.py b/bin/psm_conversion.py index c122a24f..767859a0 100644 --- a/bin/psm_conversion.py +++ b/bin/psm_conversion.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + import numpy as np import pyopenms as oms import pandas as pd @@ -63,8 +64,8 @@ def convert_psm(idxml, spectra_file, export_decoy_psm): if isinstance(spectra_df, pd.DataFrame): spectra = spectra_df[spectra_df["scan"] == scan_number] - mz_array = spectra["mz"].values[0] - intensity_array = spectra["intensity"].values[0] + mz_array = spectra["mz"].values + intensity_array = spectra["intensity"].values num_peaks = len(mz_array) for hit in peptide_id.getHits(): @@ -84,6 +85,9 @@ def convert_psm(idxml, spectra_file, export_decoy_psm): elif search_engines == "Sage": id_scores = ["Sage:hyperscore: " + str(hit.getScore())] + if hit.metaValueExists("MS:1001491"): + global_qvalue = hit.getMetaValue("MS:1001491") + charge = hit.getCharge() peptidoform = hit.getSequence().toString() modifications = mods_position(peptidoform) diff --git a/conf/modules.config b/conf/modules.config index 0b84114e..89d3bc4b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,7 +68,8 @@ if (params.add_decoys) { if 
(params.posterior_probabilities == "percolator") { process { // EXTRACTPSMFEATURE - withName: '.*:ID:PSMRESCORING:EXTRACTPSMFEATURES' { + withName: '.*:EXTRACTPSMFEATURES' { + ext.args = "-debug $params.extractpsmfeature_debug" publishDir = [ path: { "${params.outdir}/extractpsmfeature" }, mode: params.publish_dir_mode, @@ -78,14 +79,17 @@ if (params.posterior_probabilities == "percolator") { } //PERCOLATOR - withName: '.*:ID:PSMRESCORING:PERCOLATOR' { - ext.args = "-debug $params.percolator_debug" + withName: '.*:PERCOLATOR' { + ext.args = [ + "-debug $params.percolator_debug", + (params.fdr_level != 'psm_level_fdrs') ? "-" + params.fdr_level : "" + ].join(' ').trim() } } } else { process { // IDPEP - withName: '.*:ID:PSMRESCORING:IDPEP' { + withName: '.*:IDPEP' { ext.args = "-debug $params.idpep_debug" } } @@ -270,4 +274,47 @@ process { ] } + // ID ONLY + withName: 'MS2RESCORE' { + ext.args = [ + "--ms2pip_model ${params.ms2pip_model}", + "--rescoring_engine ${params.posterior_probabilities}", + "--calibration_set_size ${params.calibration_set_size}", + "--test_fdr ${params.test_FDR}", + params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : '' + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:DDA_ID:IDSCORESWITCHER' { + ext.args = [ + "-new_score_orientation lower_better", + "-new_score_type \"Posterior Error Probability\"", + "-debug $params.idscoreswitcher_debug" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/idscoreswitcherforluciphor" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:DDA_ID:PSMFDRCONTROL:IDSCORESWITCHER' { + ext.args = [ + "-new_score_orientation lower_better", + "-old_score \"Posterior Error Probability\"", + "-new_score_type q-value", + "-debug $params.idscoreswitcher_debug" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/idscoreswitcher" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } diff --git a/conf/test_full_lfq.config b/conf/test_full_lfq.config index fe17eb44..4e4684f1 100644 --- a/conf/test_full_lfq.config +++ b/conf/test_full_lfq.config @@ -29,5 +29,5 @@ params { add_decoys = true add_triqler_output = true protein_level_fdr_cutoff = 0.01 - psm_pep_fdr_cutoff = 0.01 + psm_level_fdr_cutoff = 0.01 } diff --git a/conf/test_full_tmt.config b/conf/test_full_tmt.config index d4b8469f..11eea647 100644 --- a/conf/test_full_tmt.config +++ b/conf/test_full_tmt.config @@ -27,7 +27,7 @@ params { posterior_probabilities = "percolator" search_engines = "comet,msgf" protein_level_fdr_cutoff = 0.01 - psm_pep_fdr_cutoff = 0.01 + psm_level_fdr_cutoff = 0.01 add_decoys = true protocol = 'TMT' } diff --git a/conf/test_localize.config b/conf/test_localize.config index 3ed7a152..ef32bddd 100644 --- a/conf/test_localize.config +++ b/conf/test_localize.config @@ -30,4 +30,5 @@ params { psm_level_fdr_cutoff = 0.50 skip_post_msstats = true quantify_decoys = true + fdr_level = "psm_level_fdrs" } diff --git a/modules.json b/modules.json index beb0405b..8bd8a9c6 100644 --- a/modules.json +++ b/modules.json @@ -26,7 +26,7 @@ }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { diff --git a/modules/local/extract_sample/main.nf b/modules/local/extract_sample/main.nf new file mode 100644 index 00000000..6a7210f4 --- /dev/null +++ b/modules/local/extract_sample/main.nf @@ -0,0 +1,31 @@ +process GETSAMPLE { + tag "$design.Name" + label 'process_low' + + conda "bioconda::sdrf-pipelines=0.0.25" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.25--pyhdfd78af_0" + } else { + container "biocontainers/sdrf-pipelines:0.0.25--pyhdfd78af_0" + } + + input: + path design + + output: + path "*_sample.csv", emit: ch_expdesign_sample + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + extract_sample.py "${design}" 2>&1 | tee extract_sample.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sdrf-pipelines: \$(parse_sdrf --version 2>&1 | awk -F ' ' '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/extract_sample/meta.yml b/modules/local/extract_sample/meta.yml new file mode 100644 index 00000000..548484ae --- /dev/null +++ b/modules/local/extract_sample/meta.yml @@ -0,0 +1,27 @@ +name: GETSAMPLE +description: A module to extract sample information from experimental design file +keywords: + - sample + - conversion +tools: + - custom: + description: | + A custom module for sample extraction. 
+      homepage: https://github.com/bigbio/quantms
+      documentation: https://github.com/bigbio/quantms/tree/readthedocs
+input:
+  - design:
+      type: file
+      description: experimental design file
+      pattern: "*.csv"
+output:
+  - ch_expdesign_sample:
+      type: file
+      description: sample csv file
+      pattern: "*_sample.csv"
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "versions.yml"
+authors:
+  - "@daichengxin"
diff --git a/modules/local/ms2rescore/main.nf b/modules/local/ms2rescore/main.nf
new file mode 100644
index 00000000..6667337a
--- /dev/null
+++ b/modules/local/ms2rescore/main.nf
@@ -0,0 +1,67 @@
+process MS2RESCORE {
+    tag "$meta.mzml_id"
+    label 'process_high'
+
+    conda "bioconda::ms2rescore=3.0.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.2--pyhdfd78af_0':
+        'biocontainers/ms2rescore:3.0.2--pyhdfd78af_0' }"
+
+    // userEmulation settings when docker is specified
+    containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : ''
+
+    input:
+    tuple val(meta), path(idxml), path(mzml)
+
+    output:
+    tuple val(meta), path("*ms2rescore.idXML") , emit: idxml
+    tuple val(meta), path("*feature_names.tsv"), emit: feature_names
+    tuple val(meta), path("*.html" )           , optional:true, emit: html
+    path "versions.yml"                        , emit: versions
+    path "*.log"                               , emit: log
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.mzml_id}_ms2rescore"
+
+
+    // ms2rescore only supports the Da unit. https://ms2rescore.readthedocs.io/en/v3.0.2/userguide/configuration/
+    if (meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('da')) {
+        ms2_tolerance = meta['fragmentmasstolerance']
+    } else {
+        log.info "Warning: MS2Rescore only supports the Da unit. Falling back to the default MS2 tolerance of 0.02 Da."
+        ms2_tolerance = 0.02
+    }
+
+    """
+    ms2rescore_cli.py \\
+        --psm_file $idxml \\
+        --spectrum_path . \\
+        --ms2_tolerance $ms2_tolerance \\
+        --output_path ${idxml.baseName}_ms2rescore.idXML \\
+        --processes $task.cpus \\
+        $args \\
+        2>&1 | tee ${meta.mzml_id}_ms2rescore.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.mzml_id}_ms2rescore"
+
+    """
+    touch ${prefix}.idXML
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/ms2rescore/meta.yml b/modules/local/ms2rescore/meta.yml
new file mode 100644
index 00000000..887c3a04
--- /dev/null
+++ b/modules/local/ms2rescore/meta.yml
@@ -0,0 +1,38 @@
+name: MS2RESCORE
+description: A module to perform the MS2 rescoring step
+keywords:
+  - MS2
+  - rescoring
+tools:
+  - custom:
+      description: |
+        A custom module for MS2 rescoring.
+ homepage: https://github.com/bigbio/quantms + documentation: https://github.com/bigbio/quantms/tree/readthedocs +input: + - idxml_file: + type: file + description: idXML identification file + pattern: "*.idXML" + - mzml: + type: file + description: spectrum data file + pattern: "*.mzML" + - meta: + type: map + description: Groovy Map containing sample information +output: + - idxml: + type: file + description: idXML identification file after MS2 rescoring + pattern: "*.idXML" + - feature_names: + type: file + description: File containing feature names + pattern: "*feature_names.tsv" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/consensusid/main.nf b/modules/local/openms/consensusid/main.nf index 5264701d..ad8a20f3 100644 --- a/modules/local/openms/consensusid/main.nf +++ b/modules/local/openms/consensusid/main.nf @@ -31,7 +31,7 @@ process CONSENSUSID { -filter:considered_hits $params.consensusid_considered_top_hits \\ -debug $params.consensusid_debug \\ $args \\ - 2>&1 | tee ${meta.id}_consensusID.log + 2>&1 | tee ${meta.mzml_id}_consensusID.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/extractpsmfeatures/main.nf b/modules/local/openms/extractpsmfeatures/main.nf index a9ff73b1..503aff55 100644 --- a/modules/local/openms/extractpsmfeatures/main.nf +++ b/modules/local/openms/extractpsmfeatures/main.nf @@ -10,7 +10,7 @@ process EXTRACTPSMFEATURES { 'biocontainers/openms-thirdparty:3.1.0--h9ee0642_1' }" input: - tuple val(meta), path(id_file) + tuple val(meta), path(id_file), path(extra_feat) output: tuple val(meta), path("${id_file.baseName}_feat.idXML"), emit: id_files_feat @@ -21,11 +21,17 @@ process EXTRACTPSMFEATURES { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" + feature = "" + if (params.ms2rescore && params.id_only) { + feature = "-extra \$(awk 'NR > 1 && \$1 !~ /psm_file/ {printf \"%s \", \$2}' ${extra_feat})" + } + """ PSMFeatureExtractor \\ -in ${id_file} \\ -out ${id_file.baseName}_feat.idXML \\ -threads $task.cpus \\ + ${feature} \\ $args \\ 2>&1 | tee ${id_file.baseName}_extract_psm_feature.log diff --git a/modules/local/openms/idmerger/main.nf b/modules/local/openms/idmerger/main.nf new file mode 100644 index 00000000..9fde67e3 --- /dev/null +++ b/modules/local/openms/idmerger/main.nf @@ -0,0 +1,55 @@ +process IDMERGER { + tag "$meta.mzml_id" + label 'process_medium' + label 'openms' + + conda "bioconda::openms-thirdparty=3.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/openms-thirdparty:3.1.0--h9ee0642_1' : + 'biocontainers/openms-thirdparty:3.1.0--h9ee0642_1' }" + + input: + tuple val(meta), path(id_files) + + output: + tuple val(meta), path("*_merged.idXML"), emit: id_merged + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + if (params.rescore_range == "by_project") { + if (id_files[0].baseName.contains('sage')){ + prefix = "${meta.experiment_id}_sage" + } else if (id_files[0].baseName.contains('comet')){ + prefix = "${meta.experiment_id}_comet" + } else { + prefix = "${meta.experiment_id}_msgf" + } + } else if (params.rescore_range == "by_sample") { + if (id_files[0].baseName.contains('sage')){ + prefix = "${meta.mzml_id}_sage" + } else if (id_files[0].baseName.contains('comet')){ + prefix = "${meta.mzml_id}_comet" + } else { + prefix = "${meta.mzml_id}_msgf" + } + } + + """ + IDMerger \\ + -in ${id_files.join(' ')} \\ + -threads $task.cpus \\ + -out ${prefix}_merged.idXML \\ + -merge_proteins_add_PSMs \\ + $args \\ + 2>&1 | tee ${prefix}_merged.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + IDMerger: \$(IDMerger 2>&1 | grep -E '^Version(.*)' | sed 's/Version: //g' | cut -d ' ' -f 1) + END_VERSIONS + """ +} diff --git a/modules/local/openms/idmerger/meta.yml b/modules/local/openms/idmerger/meta.yml new file mode 100644 index 00000000..94b5caf5 --- /dev/null +++ b/modules/local/openms/idmerger/meta.yml @@ -0,0 +1,33 @@ +name: idmerger +description: Merges several idXML files into one file. +keywords: + - merge + - idXML + - OpenMS +tools: + - IDMerger: + description: | + Merges several idXML files into one file. + homepage: https://www.openms.org/documentation/html/TOPP_IDMerger.html + documentation: https://www.openms.org/documentation/html/TOPP_IDMerger.html +input: + - id_files: + type: file + description: | + Input files separated by blank. + pattern: "*.{idXML}" +output: + - id_merge: + type: file + description: Output Merged file + pattern: "*.{idXML}" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/idripper/main.nf b/modules/local/openms/idripper/main.nf new file mode 100644 index 00000000..61eee8d9 --- /dev/null +++ b/modules/local/openms/idripper/main.nf @@ -0,0 +1,53 @@ +process IDRIPPER { + tag "$meta.mzml_id" + label 'process_medium' + label 'openms' + + conda "bioconda::openms-thirdparty=3.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/openms-thirdparty:3.1.0--h9ee0642_1' : + 'biocontainers/openms-thirdparty:3.1.0--h9ee0642_1' }" + + input: + tuple val(meta), path(id_file), val(qval_score) + + output: + val(meta), emit: meta + path("*.idXML"), emit: id_rippers + val("MS:1001491"), emit: qval_score + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + if (id_file.baseName.contains('sage')){ + pattern = "_sage_perc.idXML" + } else if (id_file.baseName.contains('comet')){ + pattern = "_comet_perc.idXML" + } else { + pattern = "_msgf_perc.idXML" + } + + """ + IDRipper \\ + -in ${id_file} \\ + -threads $task.cpus \\ + -out ./ \\ + -split_ident_runs \\ + $args \\ + 2>&1 | tee ${prefix}_idripper.log + + for i in `ls | grep -v \"_perc.idXML\$\" | grep \".idXML\$\"` + do + mv \$i `ls \"\$i\" |awk -F \".\" \'{print \$1\"${pattern}\"}\'` + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + IDRipper: \$(IDRipper 2>&1 | grep -E '^Version(.*)' | sed 's/Version: //g' | cut -d ' ' -f 1) + END_VERSIONS + """ +} +// diff --git a/modules/local/openms/idripper/meta.yml b/modules/local/openms/idripper/meta.yml new file mode 100644 index 00000000..81cc525f --- /dev/null +++ b/modules/local/openms/idripper/meta.yml @@ -0,0 +1,33 @@ +name: idripper +description: Splits the protein/peptide identifications of an idXML file into several idXML files according their annotated file origin. +keywords: + - split + - idXML + - OpenMS +tools: + - IDMerger: + description: | + IDRipper splits the protein/peptide identifications of an idXML file into several idXML files according their annotated file origin. + homepage: https://www.openms.org/documentation/html/TOPP_IDRipper.html + documentation: https://www.openms.org/documentation/html/TOPP_IDRipper.html +input: + - id_file: + type: file + description: | + Input file, in which the protein/peptide identifications must be tagged with 'file_origin' + pattern: "*.{idXML}" +output: + - id_rippers: + type: file + description: Output split files + pattern: "*.{idXML}" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/thirdparty/percolator/main.nf b/modules/local/openms/thirdparty/percolator/main.nf index 7a99dd11..af166e96 100644 --- a/modules/local/openms/thirdparty/percolator/main.nf +++ b/modules/local/openms/thirdparty/percolator/main.nf @@ -12,7 +12,7 @@ process PERCOLATOR { tuple val(meta), path(id_file) output: - tuple val(meta), path("${id_file.baseName}_perc.idXML"), val("MS:1001491"), emit: id_files_perc + tuple val(meta), path("*_perc.idXML"), val("MS:1001491"), emit: id_files_perc path "versions.yml", emit: version path "*.log", emit: log diff --git a/nextflow.config b/nextflow.config index 609352ac..2461e68e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,6 +33,7 @@ params { // Debug level decoydatabase_debug = 0 pp_debug = 0 + extractpsmfeature_debug = 0 idfilter_debug = 0 idscoreswitcher_debug = 0 iso_debug = 0 @@ -103,6 +104,13 @@ params { min_fr_mz = null max_fr_mz = null + // MSRESCORE flags + ms2rescore = false + rescore_range = 'independent_run' + ms2pip_model = 'HCD2021' + feature_generators = 'deeplc,ms2pip' + calibration_set_size = 0.15 + // PeptideIndexer flags IL_equivalent = true unmatched_action = "warn" @@ -116,7 +124,7 @@ params 
{ // Percolator flags train_FDR = 0.05 test_FDR = 0.05 - FDR_level = 'peptide-level-fdrs' + fdr_level = 'peptide_level_fdrs' klammer = false description_correct_features = 0 subset_max_train = 300000 diff --git a/nextflow_schema.json b/nextflow_schema.json index 42554e71..746e312c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -441,12 +441,43 @@ "default": false, "fa_icon": "far fa-check-square" }, + "ms2rescore": { + "type": "boolean", + "description": "Whether performing peptide identification rescoring with LC-MS predictors such as MS²PIP and DeepLC.", + "default": false, + "fa_icon": "far fa-check-square" + }, + "rescore_range": { + "type": "string", + "description": "Rescoring for independent run, Sample or whole experiments", + "fa_icon": "fas fa-font", + "default": "independent_run", + "enum": ["independent_run", "by_sample", "by_project"] + }, + "ms2pip_model": { + "type": "string", + "description": "Which deep learning model to generate feature.", + "fa_icon": "fas fa-font", + "default": "HCD2021" + }, + "feature_generators": { + "type": "string", + "description": "Which feature generator to generate feature.", + "fa_icon": "fas fa-font", + "default": "deeplc,ms2pip" + }, + "calibration_set_size": { + "type": "number", + "description": "Percentage of number of calibration set for DeepLC", + "default": 0.15, + "fa_icon": "fas fa-filter" + }, "posterior_probabilities": { "type": "string", - "description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n (similar to PeptideProphet)", + "description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n (similar to PeptideProphet)\n\n* 'mokapot' = Re-score based on PSM-feature-based semi-supervised learning algorithm introduced by Percolator", "fa_icon": "fas fa-list-ol", "default": "percolator", - "enum": ["percolator", "fit_distributions"] + "enum": ["percolator", "fit_distributions", "mokapot"] }, "run_fdr_cutoff": { "type": "number", @@ -454,6 +485,13 @@ "default": 0.1, "fa_icon": "fas fa-filter" }, + "extractpsmfeature_debug": { + "type": "integer", + "description": "Debug level when running the PSMFeatureExtractor step. Increase for verbose logging", + "fa_icon": "fas fa-bug", + "hidden": true, + "default": 0 + }, "idfilter_debug": { "type": "integer", "description": "Debug level when running the IDFilter step. Increase for verbose logging", @@ -484,12 +522,12 @@ "description": "In the following you can find help for the Percolator specific options that are only used if [`--posterior_probabilities`](#posterior_probabilities) was set to 'percolator'.\nNote that there are currently some restrictions to the original options of Percolator:\n\n* no Percolator protein FDR possible (currently OpenMS' FDR is used on protein level)\n* no support for separate target and decoy databases (i.e. no min-max q-value calculation or target-decoy competition strategy)\n* no support for combined or experiment-wide peptide re-scoring. 
Currently search results per input file are submitted to Percolator independently.", "default": "", "properties": { - "FDR_level": { + "fdr_level": { "type": "string", - "description": "Calculate FDR on PSM ('psm-level-fdrs') or peptide level ('peptide-level-fdrs')?", - "default": "peptide-level-fdrs", + "description": "Calculate FDR on PSM ('psm_level_fdrs') or peptide level ('peptide_level_fdrs')?", + "default": "peptide_level_fdrs", "fa_icon": "fas fa-list-ol", - "enum": ["peptide-level-fdrs", "psm-level-fdrs"] + "enum": ["peptide_level_fdrs", "psm_level_fdrs"] }, "train_FDR": { "type": "number", diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index c98b1c55..e04f851a 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -5,9 +5,14 @@ include { DECOYDATABASE } from '../../modules/local/openms/decoydatabase/main' include { CONSENSUSID } from '../../modules/local/openms/consensusid/main' include { EXTRACTPSMFEATURES } from '../../modules/local/openms/extractpsmfeatures/main' include { PERCOLATOR } from '../../modules/local/openms/thirdparty/percolator/main' +include { IDMERGER } from '../../modules/local/openms/idmerger/main' +include { IDRIPPER } from '../../modules/local/openms/idripper/main' include { FALSEDISCOVERYRATE as FDRIDPEP } from '../../modules/local/openms/falsediscoveryrate/main' include { IDPEP } from '../../modules/local/openms/idpep/main' include { PSMCONVERSION } from '../../modules/local/extract_psm/main' +include { MS2RESCORE } from '../../modules/local/ms2rescore/main' +include { IDSCORESWITCHER } from '../../modules/local/openms/idscoreswitcher/main' +include { GETSAMPLE } from '../../modules/local/extract_sample/main' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -20,6 +25,7 @@ workflow DDA_ID { ch_file_preparation_results ch_database_wdecoy ch_spectrum_data + ch_expdesign main: @@ -39,7 +45,7 @@ workflow DDA_ID { sage: filename.name.contains('sage') return [meta, filename] nosage: true - return [meta, filename] + return [meta, filename, []] }.set{ch_id_files_branched} @@ -48,16 +54,103 @@ workflow DDA_ID { // if (params.skip_rescoring == false) { if (params.posterior_probabilities == 'percolator') { - EXTRACTPSMFEATURES(ch_id_files_branched.nosage) - ch_id_files_feats = ch_id_files_branched.sage.mix(EXTRACTPSMFEATURES.out.id_files_feat) - ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) - PERCOLATOR(ch_id_files_feats) - ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) - ch_consensus_input = PERCOLATOR.out.id_files_perc - } + if (params.ms2rescore == true) { + MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0)) + ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions) + + MS2RESCORE.out.idxml.join(MS2RESCORE.out.feature_names).branch{ meta, idxml, feature_name -> + sage: idxml.name.contains('sage') + return [meta, idxml] + nosage: true + return [meta, idxml, feature_name] + }.set{ch_ms2rescore_branched} + + EXTRACTPSMFEATURES(ch_ms2rescore_branched.nosage) + ch_id_files_feats = EXTRACTPSMFEATURES.out.id_files_feat.mix(ch_ms2rescore_branched.sage) + ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) + } else { + EXTRACTPSMFEATURES(ch_id_files_branched.nosage) + ch_id_files_feats = ch_id_files_branched.sage.mix(EXTRACTPSMFEATURES.out.id_files_feat) + ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) + } + + // Rescoring for independent run, Sample or 
whole experiments
+            if (params.rescore_range == "independent_run") {
+                PERCOLATOR(ch_id_files_feats)
+                ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version)
+                ch_consensus_input = PERCOLATOR.out.id_files_perc
+            } else if (params.rescore_range == "by_sample") {
+                // Sample map
+                GETSAMPLE(ch_expdesign)
+                ch_expdesign_sample = GETSAMPLE.out.ch_expdesign_sample
+                ch_expdesign_sample.splitCsv(header: true, sep: '\t')
+                    .map { get_sample_map(it) }.set{ sample_map_idv }
+
+                sample_map = sample_map_idv.collect().map{ all_sample_map( it ) }
+
+                // Group by search_engines and convert meta
+                ch_id_files_feats.combine( sample_map ).branch{ meta, filename, sample_map ->
+                    sage: filename.name.contains('sage')
+                        return [convert_exp_meta(meta, "sample_id", filename, sample_map), filename]
+                    msgf: filename.name.contains('msgf')
+                        return [convert_exp_meta(meta, "sample_id", filename, sample_map), filename]
+                    comet: filename.name.contains('comet')
+                        return [convert_exp_meta(meta, "sample_id", filename, sample_map), filename]
+                }.set{ch_id_files_feat_branched}
+
+                // IDMERGER for samples group
+                IDMERGER(ch_id_files_feat_branched.comet.groupTuple(by: 0)
+                    .mix(ch_id_files_feat_branched.msgf.groupTuple(by: 0))
+                    .mix(ch_id_files_feat_branched.sage.groupTuple(by: 0)))
+                ch_software_versions = ch_software_versions.mix(IDMERGER.out.version)
+
+                PERCOLATOR(IDMERGER.out.id_merged)
+                ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version)
+
+                // Currently only ID runs on exactly one mzML file are supported in CONSENSUSID. Split idXML by runs
+                IDRIPPER(PERCOLATOR.out.id_files_perc)
+                IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten())
+                    .map{ [convert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] }
+                    .set{ ch_consensus_input }
+                ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version)
+            } else if (params.rescore_range == "by_project") {
+                // Split ch_id_files_feats by search_engines
+                ch_id_files_feats.branch{ meta, filename ->
+                    sage: filename.name.contains('sage')
+                        return [convert_exp_meta(meta, "experiment_id", filename, ""), filename]
+                    msgf: filename.name.contains('msgf')
+                        return [convert_exp_meta(meta, "experiment_id", filename, ""), filename]
+                    comet: filename.name.contains('comet')
+                        return [convert_exp_meta(meta, "experiment_id", filename, ""), filename]
+                }.set{ch_id_files_feat_branched}
-        if (params.posterior_probabilities != 'percolator') {
+                // IDMERGER for whole experiments
+                IDMERGER(ch_id_files_feat_branched.comet.groupTuple(by: 0)
+                    .mix(ch_id_files_feat_branched.msgf.groupTuple(by: 0))
+                    .mix(ch_id_files_feat_branched.sage.groupTuple(by: 0)))
+                ch_software_versions = ch_software_versions.mix(IDMERGER.out.version)
+
+                PERCOLATOR(IDMERGER.out.id_merged)
+                ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version)
+
+                // Currently only ID runs on exactly one mzML file are supported in CONSENSUSID.
Split idXML by runs + IDRIPPER(PERCOLATOR.out.id_files_perc) + IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten()) + .map{ [convert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } + .set{ ch_consensus_input } + ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version) + + } + + + } else if (params.posterior_probabilities == 'mokapot') { + MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0)) + ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions) + IDSCORESWITCHER(MS2RESCORE.out.idxml.combine(Channel.value("PEP"))) + ch_software_versions = ch_software_versions.mix(IDSCORESWITCHER.out.version) + ch_consensus_input = IDSCORESWITCHER.out.id_score_switcher.combine(Channel.value("MS:1001491")) + } else { ch_fdridpep = Channel.empty() if (params.search_engines.split(",").size() == 1) { FDRIDPEP(ch_id_files) @@ -87,11 +180,8 @@ workflow DDA_ID { PSMFDRCONTROL(ch_psmfdrcontrol) ch_software_versions = ch_software_versions.mix(PSMFDRCONTROL.out.version.ifEmpty(null)) - // + // Extract PSMs and export parquet format - // - ch_spectrum_data.view() - PSMFDRCONTROL.out.id_filtered.view() PSMCONVERSION(PSMFDRCONTROL.out.id_filtered.combine(ch_spectrum_data, by: 0)) } else { @@ -102,3 +192,80 @@ workflow DDA_ID { emit: version = ch_software_versions } + +// Function to group by mzML/sample/experiment +def convert_exp_meta(Map meta, value, file_name, sample_map) { + def exp_meta = [:] + + if (value == "experiment_id") { + exp_meta.mzml_id = meta.experiment_id + } else if (value == "mzml_id") { + position = file(file_name).name.lastIndexOf('_sage_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_comet_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_msgf_perc.idXML') + } + } + exp_meta.mzml_id = file(file_name).name.take(position) + } else if (value == "sample_id") { + tag = file(file_name).name.lastIndexOf('_perc.idXML') + if (tag == -1) { + position = file(file_name).name.lastIndexOf('_sage.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_comet_feat.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_msgf_feat.idXML') + } + } + } else { + position = file(file_name).name.lastIndexOf('_sage_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_comet_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_msgf_perc.idXML') + } + } + } + + file_name = file(file_name).name.take(position) + exp_meta.mzml_id = sample_map[file_name] + } + + + exp_meta.experiment_id = meta.experiment_id + exp_meta.labelling_type = meta.labelling_type + exp_meta.dissociationmethod = meta.dissociationmethod + exp_meta.fixedmodifications = meta.fixedmodifications + exp_meta.variablemodifications = meta.variablemodifications + exp_meta.precursormasstolerance = meta.precursormasstolerance + exp_meta.precursormasstoleranceunit = meta.precursormasstoleranceunit + exp_meta.fragmentmasstolerance = meta.fragmentmasstolerance + exp_meta.fragmentmasstoleranceunit = meta.fragmentmasstoleranceunit + exp_meta.enzyme = meta.enzyme + exp_meta.acquisition_method = meta.acquisition_method + + return exp_meta +} + +// Function to get sample map +def get_sample_map(LinkedHashMap row) { + def sample_map = [:] + + filestr = row.Spectra_Filepath + file_name = file(filestr).name.take(file(filestr).name.lastIndexOf('.')) + sample = row.Sample + sample_map[file_name] = sample + + return 
sample_map
+
+}
+
+def all_sample_map(sample_list) {
+    res = [:]
+    sample_list.each {
+        res = res + it
+    }
+
+    return res
+}
diff --git a/subworkflows/local/psmrescoring.nf b/subworkflows/local/psmrescoring.nf
index 7bc89d98..9f243c87 100644
--- a/subworkflows/local/psmrescoring.nf
+++ b/subworkflows/local/psmrescoring.nf
@@ -21,7 +21,7 @@ workflow PSMRESCORING {
             sage: filename.name.contains('sage')
                 return [meta, filename]
             nosage: true
-                return [meta, filename]
+                return [meta, filename, []]
         }.set{ch_id_files_branched}
         EXTRACTPSMFEATURES(ch_id_files_branched.nosage)
         ch_id_files_feats = ch_id_files_branched.sage.mix(EXTRACTPSMFEATURES.out.id_files_feat)
diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
index a8b55d6f..14558c39 100644
--- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
@@ -65,9 +65,15 @@ def checkProfileProvided(nextflow_cli_args) {
 // Citation string for pipeline
 //
 def workflowCitation() {
+    def temp_doi_ref = ""
+    String[] manifest_doi = workflow.manifest.doi.tokenize(",")
+    // Using a loop to handle multiple DOIs
+    // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers
+    // Removing ` ` since the manifest.doi is a string and not a proper list
+    for (String doi_ref: manifest_doi) temp_doi_ref += "  https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n"
     return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" +
         "* The pipeline\n" +
-        "  ${workflow.manifest.doi}\n\n" +
+        temp_doi_ref + "\n" +
         "* The nf-core framework\n" +
         "  https://doi.org/10.1038/s41587-020-0439-x\n\n" +
         "* Software dependencies\n" +
diff --git a/workflows/quantms.nf b/workflows/quantms.nf
index e0772204..e45a6c64 100644
--- a/workflows/quantms.nf
+++ b/workflows/quantms.nf
@@ -104,8 +104,15 @@ workflow QUANTMS {
         ch_versions = ch_versions.mix(DECOYDATABASE.out.version.ifEmpty(null))
     }

-    if (params.id_only) {
-        DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data)
+    // The mokapot rescoring engine is currently only supported in the id_only subworkflow via ms2rescore.
+    if (params.id_only || params.posterior_probabilities == "mokapot") {
+        if (params.id_only == false) {
+            log.warn "The mokapot rescoring engine is currently only supported in the id_only subworkflow via ms2rescore."
+        }
+        if (params.posterior_probabilities == "mokapot" && params.fdr_level == "peptide_level_fdrs") {
+            log.warn "The rescoring engine is set to mokapot. It currently only supports PSM-level FDR via ms2rescore."
+        }
+        DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data, CREATE_INPUT_CHANNEL.out.ch_expdesign)
         ch_versions = ch_versions.mix(DDA_ID.out.version.ifEmpty(null))
     } else {
         TMT(ch_fileprep_result.iso, CREATE_INPUT_CHANNEL.out.ch_expdesign, ch_searchengine_in_db)
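
Editor's note on the new sample grouping: bin/extract_sample.py writes a *_sample.csv with Spectra_Filepath and Sample columns, and the Groovy helpers get_sample_map()/all_sample_map() added to subworkflows/local/dda_id.nf fold its rows into a single basename-to-sample lookup that convert_exp_meta() uses to regroup runs before IDMERGER and Percolator. The sketch below re-expresses that fold in Python purely as an illustration; the file paths and sample names are invented and are not part of this patch.

# Illustration only (not part of the patch): Python equivalent of the
# get_sample_map()/all_sample_map() helpers in subworkflows/local/dda_id.nf.
# Real rows come from the *_sample.csv written by bin/extract_sample.py.
from pathlib import Path

rows = [
    {"Spectra_Filepath": "/data/run01.mzML", "Sample": "Sample-1"},
    {"Spectra_Filepath": "/data/run02.mzML", "Sample": "Sample-1"},
    {"Spectra_Filepath": "/data/run03.mzML", "Sample": "Sample-2"},
]

# get_sample_map(): one {file basename without extension: sample} entry per row
per_row_maps = [{Path(r["Spectra_Filepath"]).stem: r["Sample"]} for r in rows]

# all_sample_map(): fold the per-row maps into one lookup table; dda_id.nf later
# uses it in convert_exp_meta() to rewrite meta.mzml_id to the sample id so that
# IDMERGER can group all runs of the same sample before rescoring
sample_map = {}
for m in per_row_maps:
    sample_map.update(m)

print(sample_map)  # {'run01': 'Sample-1', 'run02': 'Sample-1', 'run03': 'Sample-2'}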