
Commit

Merge pull request #10 from axelwalter/openms-3.2-update
Openms 3.2 updates
eeko-kon authored Nov 25, 2024
2 parents 2839d5e + e776a10 commit c25fa0f
Showing 55 changed files with 1,479 additions and 1,675 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -25,4 +25,5 @@ workflow/scripts/OpenMSWF.py
*.sh
*.txt
workflow/report/logs/*
Data_analysis_complementary.ipynb
Data_analysis_complementary.ipynb
launch.sh
42 changes: 26 additions & 16 deletions .test/config/config.yaml
@@ -3,26 +3,25 @@
# one row per sample. It can be parsed easily via pandas.
pep_version: 2.1.0
sample_table: "dataset.tsv"
subsample_table: ["samples.tsv"]
subsample_table: [ "samples.tsv" ]

projects:
- name: Test
sample_table: dataset.tsv
subsample_table: ["samples.tsv"]
rules: "config/config.yaml"
- name: Test
sample_table: dataset.tsv
subsample_table: [ "samples.tsv" ]
rules: "config/config.yaml"

#### RULE CONFIGURATION ####
# rules: set value to TRUE if you want to run the analysis or FALSE if you don't
rules:
fileconversion: TRUE #true only for *.raw files from Thermo
preprocessing: TRUE # without filtering blanks/QCs or controls
GNPSexport: TRUE #if you want to perform FBMN
fileconversion: FALSE #true only for *.raw files from Thermo
preprocessing: TRUE #from raw data to a table of features
requantification: FALSE #true for files with common features
sirius: FALSE #only formula, no structural predictions and annotate the feature matrix with formula predictions using feature_ids
sirius_csi: FALSE #both formula and structural predictions and annotate the feature matrix with formula and structural predictions using feature_ids (MSI level 3)
spectralmatcher: TRUE # MSMS matching with in-house or any downloaded MGF format MSMS library & feature matrix annotation (MSI level 2)
analogsearch: FALSE # Machine learning tool for spectral matching and analogue annotation (spec2vec and MS2DeepScore)
fbmn_integration: FALSE # After FBMN is finished: integration of sirius and csi predictions to the GraphML FBMN file. Optionally, annotate with the MSMS library matches from GNPS also (MSI level 2)
GNPS_export: TRUE #all the files necessary for FBMN
SIRIUS: FALSE #annotate the feature matrix with predictions for chemical formula, structure (CSI:FingerID) and chemical classes (CANOPUS)
spectralmatcher: FALSE #spectral matching with in-house or any downloaded MSMS library & feature matrix annotation (MSI level 2 annotations)
MS2Query: FALSE # Machine learning tool for spectral matching and analogue annotation (spec2vec and MS2DeepScore)
fbmn_integration: FALSE # After FBMN is finished: integration of formula and structural predictions to the GraphML network file. Optionally, annotate with the MSMS library matches from GNPS also (MSI level 2)

#### PARAMETER CONFIGURATION ####
# set values to the most important parameters for your run:
@@ -59,9 +58,20 @@ featurelink:
align:
mz_max: 10.0 # (in ppm) do not pair features with m/z distance larger than that number - Instrument specific

# 7) SIRIUS/CSI:FingerID
sirius:
# 7) SIRIUS/CSI:FingerID/CANOPUS
SIRIUS:
predict_structure: TRUE
predict_compound_class: TRUE
# combine_annotations: TRUE --> combine annotations (e.g. SIRIUS_molecularFormula) from all files into a single column separated by " ## "
# FALSE --> keep a separate column for each file (e.g. sample1_SIRIUS_molecularFormula, sample2_SIRIUS_molecularFormula, ...)
combine_annotations: TRUE
max_mz: 300
instrument: 'orbitrap' # (valid: 'default', 'qtof', 'orbitrap', 'fticr')
pos_ions_considered: "[M+H]+,[M-H2O+H]+,[M+Na]+,[M+NH4]+"
neg_ions_considered: "[M-H]-,[M-H2O-H]-,[M-HCOOH]-"
database: none # Search formulas in the Union of the given databases db-name1,db-name2,db-name3. If no database is given all possible molecular formulas will be respected (no database is used). Example: possible DBs: ALL,BIO,PUBCHEM,MESH,HMDB,KNAPSACK,CHEBI,PUBMED,KEGG,HSDB,MACONDA,METACYC,GNPS,ZINCBIO,UNDP,YMDB,PLANTCYC,NORMAN,ADDITIONAL,PUBCHEMANNOTATIONBIO,PUBCHEMANNOTATIONDRUG,PUBCHEMANNOTATIONSAFETYANDTOXIC,PUBCHEMANNOTATIONFOOD,KEGGMINE,ECOCYCMINE,YMDBMINE
elements_considered: "SBrClBSe"
elements_enforced: "CHNOP"
ppm_max: 10
ppm_max_ms2: 10
formula_database: none # Search formulas in the Union of the given databases db-name1,db-name2,db-name3. If no database is given all possible molecular formulas will be respected (no database is used). Example: possible DBs: ALL,BIO,PUBCHEM,MESH,HMDB,KNAPSACK,CHEBI,PUBMED,KEGG,HSDB,MACONDA,METACYC,GNPS,ZINCBIO,UNDP,YMDB,PLANTCYC,NORMAN,ADDITIONAL,PUBCHEMANNOTATIONBIO,PUBCHEMANNOTATIONDRUG,PUBCHEMANNOTATIONSAFETYANDTOXIC,PUBCHEMANNOTATIONFOOD,KEGGMINE,ECOCYCMINE,YMDBMINE
structure_database: "BIO"
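
The `combine_annotations` behavior described in the comments above can be illustrated with a short pandas sketch (the feature IDs, sample names, and column names here are hypothetical, and this is not the workflow's actual implementation):

```python
import pandas as pd

# Hypothetical feature matrix with one SIRIUS column per sample,
# i.e. the combine_annotations: FALSE layout.
fm = pd.DataFrame({
    "feature_id": [1, 2],
    "sample1_SIRIUS_molecularFormula": ["C6H12O6", None],
    "sample2_SIRIUS_molecularFormula": ["C6H12O6", "C7H8N4O2"],
})

# combine_annotations: TRUE collapses the per-file columns into a
# single column whose values are joined with " ## ".
cols = [c for c in fm.columns if c.endswith("SIRIUS_molecularFormula")]
fm["SIRIUS_molecularFormula"] = fm[cols].apply(
    lambda row: " ## ".join(sorted(set(row.dropna()))), axis=1
)
fm = fm.drop(columns=cols)
print(fm)
```
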
4 changes: 2 additions & 2 deletions Create_dataset_tsv.ipynb
@@ -131,7 +131,7 @@
"metadata[\"ATTRIBUTE_media\"]= metadata['filename'].str.extract(r'(ISP2|DNPM|FPY12|MA|soyM\\d*)')\n",
"metadata[\"ATTRIBUTE_comment\"]= metadata['ATTRIBUTE_genomeID'].astype(str) +\"_\" + metadata[\"ATTRIBUTE_media\"].astype(str)\n",
"metadata=metadata.drop(columns=\"ATTRIBUTE_genomeIDMDNA\")\n",
"metadata.to_csv(os.path.join(\"results\", \"GNPSexport\", \"metadata.tsv\"), sep='\\t', index= None)\n",
"metadata.to_csv(os.path.join(\"results\", \"GNPS\", \"metadata.tsv\"), sep='\\t', index= None)\n",
"metadata"
]
},
@@ -142,7 +142,7 @@
"metadata": {},
"outputs": [],
"source": [
"metadata= pd.read_csv(os.path.join(\"results\", \"GNPSexport\", \"metadata.tsv\"), sep='\\t')\n",
"metadata= pd.read_csv(os.path.join(\"results\", \"GNPS\", \"metadata.tsv\"), sep='\\t')\n",
"metadata"
]
},
147 changes: 55 additions & 92 deletions README.md
@@ -5,146 +5,109 @@

This is the Snakemake implementation of the [pyOpenMS workflow](https://github.com/biosustain/pyOpenMS_UmetaFlow.git) tailored by [Eftychia Eva Kontou](https://github.com/eeko-kon) and [Axel Walter](https://github.com/axelwalter).

## Workflow overview
## Overview

![dag](/images/UmetaFlow_graph.svg)

The pipeline consists of eight interconnected steps:

1) File conversion: Simply add your Thermo raw files under the directory data/raw/ and they will be converted to centroid mzML files. If you have Agilent, Bruker, or other vendor files, skip that step (write "FALSE" for rule fileconversion in the config.yaml file - see more under "Configure workflow"), convert them independently using [proteowizard](https://proteowizard.sourceforge.io/) and add them under the data/mzML/ directory.
1) **File conversion**: Simply add your Thermo raw files under the directory `data/raw/` and they will be converted to centroid mzML files. If you have Agilent, Bruker, or other vendor files, skip this step (write "FALSE" for rule `fileconversion` in the config.yaml file - see more under "Configuration"), convert them independently using [proteowizard](https://proteowizard.sourceforge.io/) and add them under the `data/mzML/` directory.

2) Pre-processing: converting raw data to a feature table with a series of algorithms through feature detection, alignment and grouping. This step includes an optional removal of blank/QC samples if defined by the user. Optional "minfrac" step here allows for removal of consensus features with too many missing values.
2) **Pre-processing**: convert raw data to a feature table through a series of algorithms for feature detection, alignment and grouping. This step includes an optional removal of blank/QC samples if defined by the user. An optional "minfrac" step allows removal of consensus features with too many missing values.

3) Re-quantification (optional): Re-quantify all features with missing values across samples resulted from the pre-processing step for more reliable statistical analysis and data exploration. Optional "minfrac" step here allows for removal of consensus features with too many missing values.
3) **Re-quantification (optional)**: Re-quantify all features with missing values across samples that resulted from the pre-processing step, for more reliable statistical analysis and data exploration. An optional "minfrac" step allows removal of consensus features with too many missing values.

4) Structural and formula predictions (SIRIUS and CSI:FingerID) and annotation of the feature matrix with those predictions (MSI level 3).
4) **Structural, formula and compound class predictions** (SIRIUS, CSI:FingerID and CANOPUS) and annotation of the feature matrix with those predictions (MSI level 3).

5) GNPSexport: generate all the files necessary to create a [FBMN](https://ccms-ucsd.github.io/GNPSDocumentation/featurebasedmolecularnetworking-with-openms/) or [IIMN](https://ccms-ucsd.github.io/GNPSDocumentation/fbmn-iin/#iimn-networks-with-collapsed-ion-identity-edges) job at GNPS.
5) **GNPS**: generate all the files necessary to create a [FBMN](https://ccms-ucsd.github.io/GNPSDocumentation/featurebasedmolecularnetworking-with-openms/) or [IIMN](https://ccms-ucsd.github.io/GNPSDocumentation/fbmn-iin/#iimn-networks-with-collapsed-ion-identity-edges) job at GNPS.

6) Spectral matching with in-house or a publicly available library (MGF/MSP/mzML format) and annotation of the feature matrix with matches that have a score above 60 (MSI level 2).
6) **Spectral matching** with an in-house or a publicly available library (MGF/MSP/mzML format) and annotation of the feature matrix with matches that have a score above 60 (MSI level 2); see the sketch after this list.

7) After FBMN or IIMN: Integrate Sirius and CSI predictions to the network (GraphML) and MSMS spectral library annotations to the feature matrix- MSI level 2 (optional).
7) **Graph view**: Integrate SIRIUS predictions into the network (GraphML) and GNPS library annotations into the feature matrix - MSI level 2 (optional).

8) MS2Query: add another annotation step with a machine learning tool, MS2Query, that searches for exact spectral matches, as well as analogues, using Spec2Vec and MS2Deepscore.
8) **MS2Query**: add another annotation step with a machine learning tool, MS2Query, that searches for exact spectral matches, as well as analogues, using Spec2Vec and MS2DeepScore.
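
For step 6, the score cut-off works conceptually like the following pandas sketch (the file and column names are illustrative assumptions, not the workflow's actual output schema):

```python
import pandas as pd

# Illustrative file and column names; the workflow's real outputs differ.
matches = pd.read_csv("spectral_matches.tsv", sep="\t")
features = pd.read_csv("FeatureMatrix.tsv", sep="\t")

# Keep only hits scoring above 60 and attach the best match per feature.
best = (
    matches[matches["score"] > 60]
    .sort_values("score", ascending=False)
    .drop_duplicates("feature_id")
)
annotated = features.merge(
    best[["feature_id", "compound_name", "score"]],
    on="feature_id",
    how="left",
)
annotated.to_csv("FeatureMatrix_annotated.tsv", sep="\t", index=False)
```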

See [README](workflow/rules/README.md) file for details.
### Overview
![dag](/images/UmetaFlow_graph.svg)

## Usage

### Step 1: Clone the workflow

[Clone](https://help.github.com/en/articles/cloning-a-repository) this repository to your local system, into the place where you want to perform the data analysis.

git clone https://github.com/biosustain/snakemake_UmetaFlow.git

Make sure to have the right access / SSH Key. If **not**, follow the steps:

Step (i): https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent

Step (ii): https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account


### Step 2: Install all dependencies

> **Mamba** and **Snakemake** dependencies:
>>#### <span style="color: green"> **For both systems** </span>
>>Install [mambaforge](https://github.com/conda-forge/miniforge#mambaforge) for any system. This step is optional if the user already has conda installed, then replace mamba with conda for the following commands.
>>
>>Then install [Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) through mamba with:
>>
>> conda create -c conda-forge -c bioconda -n snakemake snakemake python=3.10.8
>>
>**SIRIUS**, **ThermoRawFileParser** executables, and **MS2Query** models:
>>Download the latest SIRIUS executable compatible with your operating system (linux or macOS), the ThermoRawFileParser (file converter executable for Thermo .RAW files) and MS2Query models. Use the following script to complete this step:
>>
>> cd snakemake_UmetaFlow
>> SCRIPT_VERSION="0.1.5"
>> wget -O setup_scripts.zip https://github.com/NBChub/umetaflow_tutorial/archive/refs/tags/$SCRIPT_VERSION.zip
>> unzip setup_scripts.zip && mv umetaflow_tutorial-$SCRIPT_VERSION/* setup_scripts/
>>
>>The important arguments here are the **ion mode** of your data ("positive" or "negative") which will fetch the respective modules for MS2Query and the **operating system** ("osx64" for macOS and "linux64" for linux) which will fetch the latest release of the sirius executable for your operating system (defaults: positive mode, osx64). Run the script with or without arguments.
>>
>> bash setup_scripts/setup.sh -o "osx64" -m "positive"
>>
> Install **OpenMS 3.0.0**:
>>#### <span style="color: green"> **For both systems** </span>
>>Grab OpenMS 3.0.0 [here](https://github.com/OpenMS/OpenMS/releases/tag/Release3.0.0).
>>
>>#### <span style="color: green"> **For Linux(!) only** </span>
>>Then add the binaries to your path (Linux):
>>
>> export PATH=$PATH:/path/to/OpenMS-3.0.0/bin/
>> source ~/.bashrc
>>#### <span style="color: green"> **For MacOS(!) only** </span>
>>Then add the binaries to your path (MacOS) by opening one of these files in a text editor:
>>
>> /etc/profile
>> ~/.bash_profile
>> ~/.bash_login (if .bash_profile does not exist)
>> ~/.profile (if .bash_login does not exist)
>>and adding the path to the binaries at the very end (path-dependent):
>>
>> export PATH=$PATH:/path/to/OpenMS-3.0.0/bin/
### Step 3: Configure workflow
## Installation

1. Install [**conda**](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) or [**mamba**](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html) following the linked guides (skip if already installed).

2. Create and activate the **umetaflow-snakemake** environment (using conda or mamba).

```
conda create -c conda-forge -c bioconda -n umetaflow-snakemake snakemake python=3.12 -y
conda activate umetaflow-snakemake
```

3. [Clone](https://help.github.com/en/articles/cloning-a-repository) this repository to your local system, into the place where you want to perform the data analysis.

```
git clone https://github.com/biosustain/snakemake_UmetaFlow.git
```

## Configuration
Configure the workflow according to your metabolomics data and instrument method by editing the files in the `config/` folder.

1. Adjust the `config.yaml` to:
- Configure the workflow execution (write <span style="color: green">TRUE</span>/<span style="color: red">FALSE</span> if you want to run/skip specific rules of the workflow)
- Adjust the parameters in the configuration file for your dataset as explained in the commented section in the yaml file (e.g. positive/negative ionisation, etc.)
### 1. Adjust configuration file

The `config.yaml` file determines the workflow steps:
- Write <span style="color: green">TRUE</span>/<span style="color: red">FALSE</span> if you want to run/skip specific rules of the workflow.
- Set parameters according to your dataset as explained in the commented sections of the yaml file (e.g. positive/negative ionisation).
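
To sanity-check which rules your edited `config.yaml` will actually run, a small PyYAML snippet like the following can help (a sketch, assuming the top-level `rules:` mapping shown in the test config above):

```python
import yaml  # requires PyYAML

# Print which workflow rules are switched on in config/config.yaml.
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

for rule, enabled in config["rules"].items():
    print(f"{rule:20} {'run' if enabled else 'skip'}")
```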

### 2. Add MS data files

Add all your files in the `data/raw/` or `data/mzML/` directory and generate the `dataset.tsv` table to specify the samples (filenames) that will be processed.

Use the Jupyter notebook [Create_dataset_tsv](./Create_dataset_tsv.ipynb) or simply run:

2. Add all your files in the data/raw/ or data/mzML/ directory and generate the `dataset.tsv` table to specify the samples (filenames) that will be processed.

**Suggestion**: Use the Jupyter notebook [Create_dataset_tsv](./Create_dataset_tsv.ipynb) or simply run:

python data_files.py

- `config/dataset.tsv` example:

`config/dataset.tsv` example:

| sample_name | comment |
|-------------:|-----------------------------:|
| ISP2_blank | blank media |
| NBC_00162 | pyracrimicin |
| MDNA_WGS_14 | epemicins_A_B |
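
`data_files.py` is the supported way to generate this table; if you ever need to recreate it yourself, a minimal sketch (assuming the two-column layout shown above) could look like:

```python
from pathlib import Path
import pandas as pd

# Collect sample names from the input directories and write
# config/dataset.tsv with the two columns shown above.
dirs = [Path("data/raw"), Path("data/mzML")]
names = sorted(
    {p.stem for d in dirs if d.is_dir() for p in d.iterdir() if p.is_file()}
)
dataset = pd.DataFrame({"sample_name": names, "comment": ""})
dataset.to_csv("config/dataset.tsv", sep="\t", index=False)
```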

#### If there are blanks/QC samples in the file list, then define them through the script.
### 3. Define QC and Blank samples (optional)

If there are blanks/QC samples in the file list, add them to the appropriate table.

- `config/blanks.tsv` example:
`config/blanks.tsv` example:

| sample_name | comment |
|-------------:|-----------------------------:|
| ISP2_blank | blank media |

- `config/samples.tsv` example:
`config/samples.tsv` example:

| sample_name | comment |
|-------------:|-----------------------------:|
| NBC_00162 | pyracrimicin |
| MDNA_WGS_14 | epemicins_A_B |
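
You can fill these two tables by hand, or split `dataset.tsv` with a few lines of pandas (the "blank" keyword match below is an illustrative assumption; adjust it to your naming scheme):

```python
import pandas as pd

# Split config/dataset.tsv into blanks/QCs and actual samples.
dataset = pd.read_csv("config/dataset.tsv", sep="\t")
is_blank = dataset["comment"].str.contains("blank", case=False, na=False)
dataset[is_blank].to_csv("config/blanks.tsv", sep="\t", index=False)
dataset[~is_blank].to_csv("config/samples.tsv", sep="\t", index=False)
```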


### Step 4: Execute workflow
### 4. Test workflow configuration (optional)

Activate the conda environment:
Test your configuration by performing a dry-run via

mamba activate snakemake

snakemake --use-conda --dry-run

#### Test the workflow with the example dataset

Test your configuration by performing a dry-run via
See the [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/executable.html) for further details.

snakemake --use-conda -n
## Execution

Execute the workflow locally via
Make sure the `umetaflow-snakemake` conda environment is activated and you are in the `snakemake_UmetaFlow` directory.

snakemake --use-conda --cores all

See the [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/executable.html) for further details.

### Step 5: Investigate results
## Results

All the results are in a .TSV format and can be opened simply with excel or using pandas dataframes. All the files under results/interim can be ignored and eventually discarded.
All the results are in .TSV format and can be opened simply with Excel or using Pandas dataframes. All the files under `results/interim` can be ignored and eventually discarded.
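
For example, a result table can be loaded with pandas like this (the file name is a placeholder; point it at the .TSV you want from `results/`):

```python
import pandas as pd

# Load a result table; replace the placeholder path with a real
# .TSV from your results/ folder.
fm = pd.read_csv("results/FeatureMatrix.tsv", sep="\t")
print(fm.head())
```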

## Developer Notes

