Update pipeline CMPB #8 #1

draeger-lab · May 22, 2024 · 5d73813 · 5d73813
1 parent 9d2098e
commit 5d73813
Show file tree

Hide file tree

Showing 7 changed files with 217 additions and 200 deletions.
diff --git a/src/specimen/cmpb/workflow.py b/src/specimen/cmpb/workflow.py
@@ -15,74 +15,215 @@
 from datetime import date
 from pathlib import Path
 
+import warnings
+import yaml
+
 from refinegems.curation.pathways import kegg_pathway_analysis
 from refinegems.classes.reports import ModelInfoReport
+from refinegems.curation.biomass import test_biomass_presence
+from refinegems.analysis import growth
+from refinegems.utility.connections import run_memote, perform_mcc, adjust_BOF
+from refinegems.curation.curate import resolve_duplicates
+from refinegems.curation.pathways import kegg_pathways
+
+# from SBOannotator import *
+from SBOannotator import sbo_annotator
 
 from ..util.set_up import save_cmpb_user_input
 
 ################################################################################
 # functions
 ################################################################################
 
-def run():
+def run(configpath:str):
+
+    # setup phase
+    #############
+
+    # load config
+    # -----------
+    if not configpath:
+        config = save_cmpb_user_input(Path(dir, 'config_user.yaml')) 
+    else:
+        with open(configpath, "r") as cfg:
+            config = yaml.load(cfg, Loader=yaml.loader.FullLoader)
+
+    # create log
+    # ----------
+    today = date.today().strftime("%Y%m%d")
+    log_file = Path(config["out_path"],f'rg_{str(today)}.log')
 
+    # ....................................................
+    # @TODO / @IDEAS
     # global options
     # run memote after every step
     # calculate model stats after each step
     # use temp folder or report all model/in-between steps
+    # what to write in the log file
+    # ....................................................
 
     # CarveMe
-    # -------
+    #########
     # @TODO
-    # in a future update
+    # will come in a future update
+    if not config['input']['modelpath']:
+        # run CarveMe
+        raise ValueError('Currently, CarveMe has not been included in the pipeline. Please use it separatly.mThis wfunction will be provided in a future update.')
 
     # CarveMe correction
-    # ------------------
+    ####################
+    libmodel
+
+
     # check, if input is a CarveMe model
     # rg.polish
+    #      polish(model: libModel, email: str, id_db: str, refseq_gff: str, 
+    #      protein_fasta: str, lab_strain: bool, kegg_organism_id: str, path: str)
     # rg correct charges
+    # 
 
     # growth test
+    # -----------
+    model
+    media_path
+    namespace
+    # try to set objective to growth
+    growth_func_list = test_biomass_presence(model)
+    if growth_func_list:
+        # independently of how many growth functions are found, the first one will be used
+        model.objective = growth_func_list[0]
+        # simulate growth on different media
+        growth_report = growth.growth_analysis(model, media_path, 
+                                               namespace=namespace, retrieve='report')
+        growth_report.save(Path(dir,'growth')) # @TODO adjust Path, just a placeholder really
+
+    else:
+        warnings.warn('No growth/biomass function detected, growth simulation before gapfilling will be skipped.')
+
 
     # gapfilling
-    # ----------
+    ############
     # options: automatic/manual extension/manual input
 
     # ModelPolisher
-    # -------------
+    ###############
 
     # Annotations
-    # -----------
+    #############
+    model
+    media_path
+    namespace
+
     # KEGGPathwayGroups, optional
+    # -----------------
+    modelpath
+    new_libmodel, missing_list = kegg_pathways(modelpath)
+
     # SBOannotator
+    # ------------
+    # @TODO 
+    #  theoretically, something along these lines:
+    libsbml_doc = readSBML(model)
+    libsbml_model = libsbml_doc.getModel()
+    sbo_annotator(libsbml_doc, libsbml_model, 'constraint-based', True, 'create_dbs', 
+                  Path(dir,'step3-annotation',libsbml_model.getId()+'_SBOannotated.xml'))
+
 
     # growth test
+    # -----------
+    # try to set objective to growth
+    growth_func_list = test_biomass_presence(model)
+    if growth_func_list:
+        # independently of how many growth functions are found, the first one will be used
+        model.objective = growth_func_list[0]
+        # simulate growth on different media
+        growth_report = growth.growth_analysis(model, media_path, 
+                                               namespace=namespace, retrieve='report')
+        growth_report.save(Path(dir,'growth')) # @TODO adjust Path, just a placeholder really
+
+    else:
+        warnings.warn('No growth/biomass function detected, growth simulation after annotation will be skipped.')
+
 
     # model cleanup
-    # -------------
+    ###############
+    model
+
     # duplicates
-    # BOFdat?
-    # mcc 
+    # ----------
+    # @TODO which params to set and which to set as optional input?
+    resolve_duplicates(model, check_reac:bool=True, 
+                       check_meta:Literal['default','exhaustive','skip']='default', 
+                       replace_dupl_meta:bool=True, remove_unused_meta:bool=False, 
+                       remove_dupl_reac:bool=True)
+
+
+    # BOF
+    # ---
+    # @TODO
+    # BOFdat - optional
+
+    # check and normalise
+
+    # MCC
+    # ---
+    # @TODO 
+    model = perform_mcc(model, Path(dir,'mcc'),apply=True) # @TODO Path is just a placeholder
 
     # analysis
-    # --------
+    ##########
+    # @TODO 
+    #   set / get params from config or upstream pipeline
     dir
     model
-
+    namespace
+    media_path
+
     # stats
+    # -----
     stats_report = ModelInfoReport(model)
-    stats_report.save(Path(dir,'stats')) # adjust Path, just a placeholder really
+    stats_report.save(Path(dir,'stats')) # @TODO adjust Path, just a placeholder really
 
     # kegg pathway
+    # ------------
     pathway_report = kegg_pathway_analysis(model)
-    pathway_report.save(Path(dir,'kegg_pathway')) # adjust Path, just a placeholder really
+    pathway_report.save(Path(dir,'kegg_pathway')) # @TODO adjust Path, just a placeholder really
 
-    # sbo terms
+    # sbo term
+    # --------
+    # @TODO
+    # plot_rea_sbo_single(model: libModel) -> fig?
+
     # memote
+    # ------
+    run_memote(model, 'html', save_res=Path(dir,'final_memote.html'))
+
     # growth
+    # ------
+    # try to set objective to growth
+    growth_func_list = test_biomass_presence(model)
+    if growth_func_list:
+        # independently of how many growth functions are found, the first one will be used
+        model.objective = growth_func_list[0]
+        # simulate growth on different media
+        growth_report = growth.growth_analysis(model, media_path, 
+                                               namespace=namespace, retrieve='report')
+        growth_report.save(Path(dir,'growth')) # @TODO adjust Path, just a placeholder really
+
+    else:
+        warnings.warn('No growth/biomass function detected, final growth simulation will be skipped.')
+
     # auxotrophies
+    # ------------
+    media_list = growth.read_media_config(media_path)
+    auxo_report = growth.test_auxotrophies(model, media_list[0], media_list[1], namespace)
+    auxo_report.save(Path(dir,'auxotrophies')) # @TODO adjust Path, just a placeholder really
+
+
+
+
+
 
-    pass
 
 ###########
 # old stuff

diff --git a/src/specimen/data/config/cmpb_config.yaml b/src/specimen/data/config/cmpb_config.yaml
@@ -1,40 +1,58 @@
-Description: > 
-  This file can be adapted to choose what refineGEMs should do.
-  Note: For windows use \ instead of / for the paths
-
-
-General Setting: >
-  Path to GEM to be investigated
-
-model: 'data/e_coli_core.xml' 
-# Set the out path for all analysis files
-out_path: ''
-
-Settings for scripts that investigate the model: >
-  These are only necessary if none of the scripts to manipulate the model are used.
-
-# Set to TRUE if you want pngs that aid in model investigation, will be saved to a folder called 'visualization'
-visualize: TRUE
-
-# Set the path to a medium config for growth simulation 
-mediapath: 'media_config.yaml'
-
-# Namespace to use for the model
-namespace: 'BiGG'
-
-# Settings if you want to compare multiple models
-multiple: FALSE
-multiple_paths: # enter as many paths as you need below
-  - 'data/e_coli_core.xml'
-  - ''
-  - ''
-single: TRUE # set to False if you only want to work with the multiple models
-
-# Determine whether the biomass function should be checked & normalised
-biomass: TRUE
-
-# determine whether the memote score should be calculated, default: FALSE
-memote: FALSE
+# Configuration file for the SPECIMEN CMPB pipeline
+# parameters with the value __USER__ are required to be specified by the user
+
+# meta info:
+#    model:     __USER__
+#    organism:  __USER__
+#    date:      __USER__
+#    author:    __USER__
+
+# input for the pipeline
+# ----------------------
+input:
+  modelpath: NULL            # optional, path to a model. 
+                             # If not given, runs CarveMe
+  annotated_genome: __USER__ # required, path to the annotated genome file
+  namespace: BiGG            # namespace to use for the model
+  mediapath: __USER__        # path to a media config to test growth with
+
+# general options
+# ---------------
+general:
+  dir: SPECIMEN-CMPB         # Path/Name of a directory to save output to
+  memote_always_on: False    # run memote after every step
+  stats_always_on: False     # calculate the model statistics after every step
+
+# part-specific options
+# ---------------------
+
+# add KEGG pathways as groups
+kegg_pathway_groups: True 
+
+# resolve duplicates
+duplicates:
+  # three possible options for resolving duplicates of the following model entities:
+  # - check:  check for duplicates and simply report them
+  # - remove: check for and remove duplicates from the model (if possible)
+  # - skip:   skip duplicate resolution
+  reactions: remove
+  metabolites: remove
+  # additionally remove unused metabolites (reduces possible knowledge base)
+  remove_unused_metabs: False
+
+# BOFdat / Biomass objective function
+BOF:
+  run_bofdat: 
+  # @TODO
+
+# gapfilling
+gapfilling:
+  # @TODO
+
+
+##################
+# old stuff below
+##################
 
 # compare metabolites to the ModelSEED database
 modelseed: FALSE # set to False if not needed
@@ -48,8 +66,6 @@ entrez_email: '' # necessary to access NCBI API
 organismid: 'cstr'  # Needs to be specified for db_to_compare='KEGG' for the gap_analysis, Can be provided for polish
 gff_file: 'data/cstr.gff'  # Path to RefSeq GFF file: Required for db_to_compare='KEGG', Can be provided for polish
 
-### Addition of KEGG Pathways as Groups ###
-keggpathways: FALSE
 
 ### SBO-Term Annotation ###
 sboterms: FALSE

diff --git a/src/specimen/hqtb/core/generate_draft_model.py b/src/specimen/hqtb/core/generate_draft_model.py
@@ -20,7 +20,7 @@
 from refinegems.utility.io import load_model
 from refinegems.utility.entities import resolve_compartment_names
 from refinegems.curation.biomass import test_biomass_presence
-from refinegems.analysis.investigate import run_memote
+from refinegems.utility.connections import run_memote
 
 from refinegems.analysis.growth import MIN_GROWTH_THRESHOLD
 

diff --git a/src/specimen/hqtb/core/refinement/annotation.py b/src/specimen/hqtb/core/refinement/annotation.py
@@ -18,7 +18,7 @@
 
 # refinegems
 from refinegems.utility.io import load_model, kegg_reaction_parser
-from refinegems.analysis.investigate import run_memote
+from refinegems.utility.connections import run_memote
 
 # from SBOannotator import *
 from SBOannotator import sbo_annotator

diff --git a/src/specimen/hqtb/core/refinement/cleanup.py b/src/specimen/hqtb/core/refinement/cleanup.py
@@ -21,7 +21,7 @@
 from refinegems.utility.io import load_model
 from refinegems.classes.medium import medium_to_model, Medium
 from refinegems.analysis.growth import read_media_config
-from refinegems.analysis.investigate import run_memote
+from refinegems.utility.connections import run_memote
 from refinegems.curation.curate import resolve_duplicates, complete_BioMetaCyc
 
 ################################################################################

diff --git a/src/specimen/hqtb/core/refinement/extension.py b/src/specimen/hqtb/core/refinement/extension.py
@@ -32,7 +32,7 @@
 
 from refinegems.utility.io import kegg_reaction_parser, load_a_table_from_database
 from refinegems.utility.entities import create_random_id, get_reaction_annotation_dict, match_id_to_namespace
-from refinegems.analysis.investigate import run_memote
+from refinegems.utility.connections import run_memote
 
 # further required programs:
 #        - DIAMOND, tested with version 0.9.14 (works only for certain sensitivity mode)