Skip to content

Commit

Permalink
Merge pull request #11 from axelwalter/openms-3.2-update
Browse files Browse the repository at this point in the history
OpenMS 3.2 updates - additional fixes
  • Loading branch information
eeko-kon authored Dec 2, 2024
2 parents c25fa0f + e7371b1 commit bb5e01a
Show file tree
Hide file tree
Showing 10 changed files with 5,553 additions and 110 deletions.
17 changes: 9 additions & 8 deletions .test/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ projects:
#### RULE CONFIGURATION ####
# rules: set value to TRUE if you want to run the analysis or FALSE if you don't
rules:
fileconversion: FALSE #true only for *.raw files from Thermo
preprocessing: TRUE #from raw data to a table of features
requantification: FALSE #true for files with common features
GNPS_export: TRUE #all the files necessary for FBMN
SIRIUS: FALSE #annotate the feature matrix with predictions for chemical formula, structure (CSI:FingerID) and chemical classes (CANOPUS)
spectralmatcher: FALSE #spectral matching with in-house or any downloaded MSMS library & feature matrix annotation (MSI level 2 annotations)
fileconversion: FALSE # True only for *.raw files from Thermo
preprocessing: TRUE # From raw data to a table of features
requantification: FALSE # True for files with common features
GNPS_export: TRUE # All the files necessary for FBMN
SIRIUS: FALSE # Annotate the feature matrix with predictions for chemical formula, structure (CSI:FingerID) and chemical classes (CANOPUS)
# The following rules require GNPS_export: TRUE
spectralmatcher: FALSE # Spectral matching with in-house or any downloaded MSMS library & feature matrix annotation (MSI level 2 annotations)
MS2Query: FALSE # Machine learning tool for spectral matching and analogue annotation (spec2vec and MS2DeepScore)
fbmn_integration: FALSE # After FBMN is finished: integration of formula and structural predictions into the GraphML network file. Optionally, also annotate with the MSMS library matches from GNPS (MSI level 2)

Expand Down Expand Up @@ -60,8 +61,8 @@ align:

# 7) SIRIUS/CSI:FingerID/CANOPUS
SIRIUS:
predict_structure: TRUE
predict_compound_class: TRUE
export_only: FALSE # Only export input files for SIRIUS, but don't actually execute SIRIUS here
predict_structure_and_class: TRUE # CSI:FingerID and CANOPUS
# combine_annotations: TRUE --> combine annotations (e.g. SIRIUS_molecularFormula) from all files into a single column separated by " ## "
# FALSE --> keep a separate column for each file (e.g. sample1_SIRIUS_molecularFormula, sample2_SIRIUS_molecularFormula, ...)
combine_annotations: TRUE
Expand Down
3 changes: 1 addition & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,7 @@ align:
# 7) SIRIUS/CSI:FingerID/CANOPUS
SIRIUS:
export_only: FALSE # Only export input files for SIRIUS, but don't actually execute SIRIUS here
predict_structure: TRUE
predict_compound_class: TRUE
predict_structure_and_class: TRUE # CSI:FingerID and CANOPUS
# combine_annotations: TRUE --> combine annotations (e.g. SIRIUS_molecularFormula) from all files into a single column separated by " ## "
# FALSE --> keep a separate column for each file (e.g. sample1_SIRIUS_molecularFormula, sample2_SIRIUS_molecularFormula, ...)
combine_annotations: TRUE
Expand Down
1,682 changes: 1,681 additions & 1 deletion images/SIRIUS_CSI_FingerID.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3,787 changes: 3,786 additions & 1 deletion images/UmetaFlow.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
38 changes: 26 additions & 12 deletions workflow/rules/GNPS_export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,32 @@ from os.path import join

# 1) Filter out the features that do not have an associated MS2 spectrum (i.e. features that carry no ID annotations)

rule FileFilter:
input:
join("results", "Interim", ("Requantification" if config["rules"]["requantification"] else "Preprocessing"), "consenus_features.consensusXML")
output:
join("results", "Interim", "GNPS", "filtered.consensusXML")
log: join("workflow", "report", "logs", "GNPS", "FileFilter.log")
conda:
join("..", "envs", "openms.yaml")
shell:
"""
FileFilter -id:remove_unannotated_features -in {input} -out {output} -no_progress -log {log} 2>> {log}
"""
if config["rules"]["requantification"]:
rule FileFilter:
input:
join("results", "Interim", "Requantification", "consenus_features.consensusXML")
output:
join("results", "Interim", "GNPS", "filtered.consensusXML")
log: join("workflow", "report", "logs", "GNPS", "FileFilter.log")
conda:
join("..", "envs", "openms.yaml")
shell:
"""
FileFilter -id:remove_unannotated_features -in {input} -out {output} -no_progress -log {log} 2>> {log}
"""
else:
rule FileFilter:
input:
join("results", "Interim", "Preprocessing", "consenus_features.consensusXML")
output:
join("results", "Interim", "GNPS", "filtered.consensusXML")
log: join("workflow", "report", "logs", "GNPS", "FileFilter.log")
conda:
join("..", "envs", "openms.yaml")
shell:
"""
FileFilter -id:remove_unannotated_features -in {input} -out {output} -no_progress -log {log} 2>> {log}
"""

# 2) GNPS_export creates an mgf file with only the MS2 information of all files (pass the mzML files as a space-separated list)

Expand Down
72 changes: 41 additions & 31 deletions workflow/rules/SIRIUS.smk
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,41 @@ envvars:

# 1) SIRIUS Export

rule SiriusExport:
input:
mzML = join("results", "Interim", "mzML", "Aligned_{sample}.mzML"),
featureXML = join(
"results",
"Interim",
(
"Requantification"
if config["rules"]["requantification"]
else "Preprocessing"
),
"MFD_{sample}.featureXML",
),
output:
join("results", "SIRIUS", "sirius-input", "{sample}.ms"),
log:
join("workflow", "report", "logs", "SIRIUS", "SiriusExport_{sample}.log"),
params:
requant = "true" if config["rules"]["requantification"] else "false"
conda:
join("..", "envs", "openms.yaml")
threads: config["system"]["threads"]
shell:
"""
SiriusExport -in {input.mzML} -in_featureinfo {input.featureXML} -out {output} -filter_by_num_masstraces 2 -feature_only true -threads {threads} -no_progress -log {log} 2>> {log}
"""
if config["rules"]["requantification"]:
rule SiriusExport:
input:
mzML = join("results", "Interim", "mzML", "Aligned_{sample}.mzML"),
featureXML = join("results", "Interim", "Requantification", "MFD_{sample}.featureXML"),
output:
join("results", "SIRIUS", "sirius-input", "{sample}.ms"),
log:
join("workflow", "report", "logs", "SIRIUS", "SiriusExport_{sample}.log"),
conda:
join("..", "envs", "openms.yaml")
threads: config["system"]["threads"]
shell:
"""
SiriusExport -in {input.mzML} -in_featureinfo {input.featureXML} -out {output} -filter_by_num_masstraces 2 -feature_only true -threads {threads} -no_progress -log {log} 2>> {log}
"""
else:
rule SiriusExport:
input:
mzML = join("results", "Interim", "mzML", "Aligned_{sample}.mzML"),
featureXML = join("results", "Interim", "Preprocessing", "MFD_{sample}.featureXML"),
output:
join("results", "SIRIUS", "sirius-input", "{sample}.ms"),
log:
join("workflow", "report", "logs", "SIRIUS", "SiriusExport_{sample}.log"),
params:
requant = "true" if config["rules"]["requantification"] else "false"
conda:
join("..", "envs", "openms.yaml")
threads: config["system"]["threads"]
shell:
"""
SiriusExport -in {input.mzML} -in_featureinfo {input.featureXML} -out {output} -filter_by_num_masstraces 2 -feature_only true -threads {threads} -no_progress -log {log} 2>> {log}
"""

if not config["SIRIUS"]["export_only"]:

# 2) Run SIRIUS with login
Expand Down Expand Up @@ -65,12 +74,13 @@ if not config["SIRIUS"]["export_only"]:
]

fingerprint = []
if config["SIRIUS"]["predict_structure"]:
if config["SIRIUS"]["predict_structure_and_class"]:
fingerprint = [
"fingerprint",
"structure",
"--database",
config["SIRIUS"]["structure_database"],
"canopus"
]


Expand All @@ -94,11 +104,10 @@ if not config["SIRIUS"]["export_only"]:
max_mz=config["SIRIUS"]["max_mz"],
formula=" ".join(formula),
fingerprint=" ".join(fingerprint),
canopus="canopus" if config["SIRIUS"]["predict_compound_class"] else "",
shell:
"""
sirius login --user={params.user} --password={params.password} 2>> {log}
sirius --input {input} --project {output.projects} --no-compression --maxmz {params.max_mz} {params.formula} {params.fingerprint} {params.canopus} write-summaries 2>> {log}
sirius --input {input} --project {output.projects} --no-compression --maxmz {params.max_mz} {params.formula} {params.fingerprint} write-summaries 2>> {log}
date '+%Y-%m-%d %H:%M:%S' > {output.flag}
"""

Expand All @@ -125,10 +134,11 @@ if not config["SIRIUS"]["export_only"]:
combine_annotations=(
"true" if config["SIRIUS"]["combine_annotations"] else "false"
),
requant = "true" if config["rules"]["requantification"] else "false"
requant = "true" if config["rules"]["requantification"] else "false",
csi_canopus = "true" if config["SIRIUS"]["predict_structure_and_class"] else "false"
shell:
"""
python workflow/scripts/sirius_annotation.py {params.requant} {output} {params.combine_annotations} > /dev/null 2>> {log}
python workflow/scripts/sirius_annotation.py {params.requant} {output} {params.combine_annotations} {params.csi_canopus} > /dev/null 2>> {log}
"""


Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/cleanup_feature_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def cleanup(input_tsv, output_tsv):
exclude="number"
).fillna("")

# Remove columns which contain feature IDs for individual files
# Drop feature ID columns
df = df.drop(columns=[c for c in df.columns if c.endswith("_IDs")])

# Remove "SCANS", "id" and "quality" columns
Expand Down
44 changes: 0 additions & 44 deletions workflow/scripts/library_training.py

This file was deleted.

9 changes: 2 additions & 7 deletions workflow/scripts/ms2query_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,10 @@


def ms2query_annotations(requant, annotated, ms2query_csv):
df = pd.read_csv(Path("results", "Interim", ("Requantification" if requant == "true" else "Preprocessing"), "FeatureMatrix.tsv") , sep="\t")
df = pd.read_csv(Path("results", "Interim", ("Requantification" if requant == "true" else "Preprocessing"), "FeatureMatrix.tsv") , sep="\t", index_col=0)


df_ms2query = pd.read_csv(ms2query_csv)

df_ms2query["feature_id"] = df_ms2query["feature_id"].apply(lambda x: int(x[2:]))

df_ms2query = df_ms2query.set_index("feature_id")

ms2query_columns = ["inchikey", "analog_compound_name", "smiles", "cf_kingdom", "cf_superclass", "cf_class", "cf_subclass", "cf_direct_parent", "npc_class_results", "npc_superclass_results", "npc_pathway_results"]
Expand All @@ -19,9 +16,7 @@ def ms2query_annotations(requant, annotated, ms2query_csv):
if i in df_ms2query.index:
for col in ms2query_columns:
df.loc[i, f"MS2Query_{col}"] = df_ms2query.loc[i, col]

df.to_csv(annotated, sep="\t", index=False)

df.to_csv(annotated, sep="\t")

if __name__ == "__main__":
ms2query_annotations(sys.argv[1], sys.argv[2], sys.argv[3])
9 changes: 6 additions & 3 deletions workflow/scripts/sirius_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
import sys


def sirius_annotations(requant, annotated, combine_annotations):
def sirius_annotations(requant, annotated, combine_annotations, csi_canopus):
df = pd.read_csv(Path("results", "Interim", ("Requantification" if requant == "true" else "Preprocessing"), "FeatureMatrix.tsv") , sep="\t")


sirius_projects_dirs = [p for p in Path(Path(annotated).parent.parent, "SIRIUS", "sirius-projects").iterdir() if p.is_dir()]

# Define data to annotate
Expand All @@ -29,6 +28,8 @@ def sirius_annotations(requant, annotated, combine_annotations):
# Annotate for each input file (aka each sirius project directory)
for p in sirius_projects_dirs:
for tool, annotation_file, cols in zip(tools, annotation_files, column_names):
if tool in ("CSI:FingerID", "CANOPUS") and csi_canopus == "false":
continue
file = Path(p, annotation_file)
if file.exists():
df_tmp = pd.read_csv(file, sep="\t")
Expand All @@ -45,6 +46,8 @@ def sirius_annotations(requant, annotated, combine_annotations):
if combine_annotations == "true":
# Create summary columns, where the file origin is omitted ("##" separated lists)
for tool, columns in zip(tools, column_names):
if tool in ("CSI:FingerID", "CANOPUS") and csi_canopus == "false":
continue
for col in columns:
if "#" in col:
col = col.split("#")[1]
Expand All @@ -63,4 +66,4 @@ def sirius_annotations(requant, annotated, combine_annotations):


if __name__ == "__main__":
sirius_annotations(sys.argv[1], sys.argv[2], sys.argv[3])
sirius_annotations(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])

0 comments on commit bb5e01a

Please sign in to comment.