diff --git a/src/specimen/cmpb/workflow.py b/src/specimen/cmpb/workflow.py index 52126a6..4293395 100644 --- a/src/specimen/cmpb/workflow.py +++ b/src/specimen/cmpb/workflow.py @@ -15,8 +15,19 @@ from datetime import date from pathlib import Path +import warnings +import yaml + from refinegems.curation.pathways import kegg_pathway_analysis from refinegems.classes.reports import ModelInfoReport +from refinegems.curation.biomass import test_biomass_presence +from refinegems.analysis import growth +from refinegems.utility.connections import run_memote, perform_mcc, adjust_BOF +from refinegems.curation.curate import resolve_duplicates +from refinegems.curation.pathways import kegg_pathways + +# from SBOannotator import * +from SBOannotator import sbo_annotator from ..util.set_up import save_cmpb_user_input @@ -24,65 +35,195 @@ # functions ################################################################################ -def run(): +def run(configpath:str): + + # setup phase + ############# + + # load config + # ----------- + if not configpath: + config = save_cmpb_user_input(Path(dir, 'config_user.yaml')) + else: + with open(configpath, "r") as cfg: + config = yaml.load(cfg, Loader=yaml.loader.FullLoader) + + # create log + # ---------- + today = date.today().strftime("%Y%m%d") + log_file = Path(config["out_path"],f'rg_{str(today)}.log') + # .................................................... + # @TODO / @IDEAS # global options # run memote after every step # calculate model stats after each step # use temp folder or report all model/in-between steps + # what to write in the log file + # .................................................... # CarveMe - # ------- + ######### # @TODO - # in a future update + # will come in a future update + if not config['input']['modelpath']: + # run CarveMe + raise ValueError('Currently, CarveMe has not been included in the pipeline. Please use it separatly.mThis wfunction will be provided in a future update.') # CarveMe correction - # ------------------ + #################### + libmodel + + # check, if input is a CarveMe model # rg.polish + # polish(model: libModel, email: str, id_db: str, refseq_gff: str, + # protein_fasta: str, lab_strain: bool, kegg_organism_id: str, path: str) # rg correct charges + # # growth test + # ----------- + model + media_path + namespace + # try to set objective to growth + growth_func_list = test_biomass_presence(model) + if growth_func_list: + # independently of how many growth functions are found, the first one will be used + model.objective = growth_func_list[0] + # simulate growth on different media + growth_report = growth.growth_analysis(model, media_path, + namespace=namespace, retrieve='report') + growth_report.save(Path(dir,'growth')) # @TODO adjust Path, just a placeholder really + + else: + warnings.warn('No growth/biomass function detected, growth simulation before gapfilling will be skipped.') + # gapfilling - # ---------- + ############ # options: automatic/manual extension/manual input # ModelPolisher - # ------------- + ############### # Annotations - # ----------- + ############# + model + media_path + namespace + # KEGGPathwayGroups, optional + # ----------------- + modelpath + new_libmodel, missing_list = kegg_pathways(modelpath) + # SBOannotator + # ------------ + # @TODO + # theoretically:msoething along the way: + libsbml_doc = readSBML(model) + libsbml_model = libsbml_doc.getModel() + sbo_annotator(libsbml_doc, libsbml_model, 'constraint-based', True, 'create_dbs', + Path(dir,'step3-annotation',libsbml_model.getId()+'_SBOannotated.xml')) + # growth test + # ----------- + # try to set objective to growth + growth_func_list = test_biomass_presence(model) + if growth_func_list: + # independently of how many growth functions are found, the first one will be used + model.objective = growth_func_list[0] + # simulate growth on different media + growth_report = growth.growth_analysis(model, media_path, + namespace=namespace, retrieve='report') + growth_report.save(Path(dir,'growth')) # @TODO adjust Path, just a placeholder really + + else: + warnings.warn('No growth/biomass function detected, growth simulation after annotation will be skipped.') + # model cleanup - # ------------- + ############### + model + # duplicates - # BOFdat? - # mcc + # ---------- + # @TODO which params to set and which to set as optional input? + resolve_duplicates(model, check_reac:bool=True, + check_meta:Literal['default','exhaustive','skip']='default', + replace_dupl_meta:bool=True, remove_unused_meta:bool=False, + remove_dupl_reac:bool=True) + + + # BOF + # --- + # @TODO + # BOFdat - optional + + # check and normalise + + # MCC + # --- + # @TODO + model = perform_mcc(model, Path(dir,'mcc'),apply=True) # @TODO Path is just a placeholder # analysis - # -------- + ########## + # @TODO + # set / get params from config or upstream pipeline dir model - + namespace + media_path + # stats + # ----- stats_report = ModelInfoReport(model) - stats_report.save(Path(dir,'stats')) # adjust Path, just a placeholder really + stats_report.save(Path(dir,'stats')) # @TODO adjust Path, just a placeholder really # kegg pathway + # ------------ pathway_report = kegg_pathway_analysis(model) - pathway_report.save(Path(dir,'kegg_pathway')) # adjust Path, just a placeholder really + pathway_report.save(Path(dir,'kegg_pathway')) # @TODO adjust Path, just a placeholder really - # sbo terms + # sbo term + # -------- + # @TODO + # plot_rea_sbo_single(model: libModel) -> fig? + # memote + # ------ + run_memote(model, 'html', save_res=Path(dir,'final_memote.html')) + # growth + # ------ + # try to set objective to growth + growth_func_list = test_biomass_presence(model) + if growth_func_list: + # independently of how many growth functions are found, the first one will be used + model.objective = growth_func_list[0] + # simulate growth on different media + growth_report = growth.growth_analysis(model, media_path, + namespace=namespace, retrieve='report') + growth_report.save(Path(dir,'growth')) # @TODO adjust Path, just a placeholder really + + else: + warnings.warn('No growth/biomass function detected, final growth simulation will be skipped.') + # auxotrophies + # ------------ + media_list = growth.read_media_config(media_path) + auxo_report = growth.test_auxotrophies(model, media_list[0], media_list[1], namespace) + auxo_report.save(Path(dir,'auxotrophies')) # @TODO adjust Path, just a placeholder really + + + + + - pass ########### # old stuff diff --git a/src/specimen/data/config/cmpb_config.yaml b/src/specimen/data/config/cmpb_config.yaml index cceb572..673ed24 100644 --- a/src/specimen/data/config/cmpb_config.yaml +++ b/src/specimen/data/config/cmpb_config.yaml @@ -1,40 +1,58 @@ -Description: > - This file can be adapted to choose what refineGEMs should do. - Note: For windows use \ instead of / for the paths - - -General Setting: > - Path to GEM to be investigated - -model: 'data/e_coli_core.xml' -# Set the out path for all analysis files -out_path: '' - -Settings for scripts that investigate the model: > - These are only necessary if none of the scripts to manipulate the model are used. - -# Set to TRUE if you want pngs that aid in model investigation, will be saved to a folder called 'visualization' -visualize: TRUE - -# Set the path to a medium config for growth simulation -mediapath: 'media_config.yaml' - -# Namespace to use for the model -namespace: 'BiGG' - -# Settings if you want to compare multiple models -multiple: FALSE -multiple_paths: # enter as many paths as you need below - - 'data/e_coli_core.xml' - - '' - - '' -single: TRUE # set to False if you only want to work with the multiple models - -# Determine whether the biomass function should be checked & normalised -biomass: TRUE - -# determine whether the memote score should be calculated, default: FALSE -memote: FALSE +# Configuration file for the SPECIMEN CMPB pipeline +# parameters with the value __USER__ are required to be specified by the user + +# meta info: +# model: __USER__ +# organism: __USER__ +# date: __USER__ +# author: __USER__ + +# input for the pipeline +# ---------------------- +input: + modelpath: NULL # optional, path to a model. + # If not given, runs CarveMe + annotated_genome: __USER__ # required, path to the annotated genome file + namespace: BiGG # namespace to use for the model + mediapath: __USER__ # path to a media config to tests growth with + +# general options +# --------------- +general: + dir: SPECIMEN-CMPB # Path/Name of a directory to save output to + memote_always_on: False # run memote after every step + stats_always_on: False # calculate the model statistics after every step + +# part-specific options +# --------------------- + +# add KEGG pathways as groups +kegg_pathway_groups: True + +# resolve duplicates +duplicates: + # three possible option for the resolvement of duplicates for the following model entities: + # - check: check for duplicates and simply report them + # - remove: check for and remove duplicates from the model (if possible) + # - skip: skip the resolvement + reactions: remove + metabolites: remove + # additional remove unused metabolites (reduces possible knowledge base) + remove_unused_metabs: False + +# BOFdat / Biomass objective function +BOF: + run_bofdat: + # @TODO + +# gapfilling +gapfilling: + # @TODO + + +################## +# old struff below +################## # compare metabolites to the ModelSEED database modelseed: FALSE # set to False if not needed @@ -48,8 +66,6 @@ entrez_email: '' # necessary to access NCBI API organismid: 'cstr' # Needs to be specified for db_to_compare='KEGG' for the gap_analysis, Can be provided for polish gff_file: 'data/cstr.gff' # Path to RefSeq GFF file: Required for db_to_compare='KEGG', Can be provided for polish -### Addition of KEGG Pathways as Groups ### -keggpathways: FALSE ### SBO-Term Annotation ### sboterms: FALSE diff --git a/src/specimen/hqtb/core/generate_draft_model.py b/src/specimen/hqtb/core/generate_draft_model.py index d82310a..66d0eaa 100644 --- a/src/specimen/hqtb/core/generate_draft_model.py +++ b/src/specimen/hqtb/core/generate_draft_model.py @@ -20,7 +20,7 @@ from refinegems.utility.io import load_model from refinegems.utility.entities import resolve_compartment_names from refinegems.curation.biomass import test_biomass_presence -from refinegems.analysis.investigate import run_memote +from refinegems.utility.connections import run_memote from refinegems.analysis.growth import MIN_GROWTH_THRESHOLD diff --git a/src/specimen/hqtb/core/refinement/annotation.py b/src/specimen/hqtb/core/refinement/annotation.py index e0e46af..5e3efd7 100644 --- a/src/specimen/hqtb/core/refinement/annotation.py +++ b/src/specimen/hqtb/core/refinement/annotation.py @@ -18,7 +18,7 @@ # refinegems from refinegems.utility.io import load_model, kegg_reaction_parser -from refinegems.analysis.investigate import run_memote +from refinegems.utility.connections import run_memote # from SBOannotator import * from SBOannotator import sbo_annotator diff --git a/src/specimen/hqtb/core/refinement/cleanup.py b/src/specimen/hqtb/core/refinement/cleanup.py index 3804de4..1200283 100644 --- a/src/specimen/hqtb/core/refinement/cleanup.py +++ b/src/specimen/hqtb/core/refinement/cleanup.py @@ -21,7 +21,7 @@ from refinegems.utility.io import load_model from refinegems.classes.medium import medium_to_model, Medium from refinegems.analysis.growth import read_media_config -from refinegems.analysis.investigate import run_memote +from refinegems.utility.connections import run_memote from refinegems.curation.curate import resolve_duplicates, complete_BioMetaCyc ################################################################################ diff --git a/src/specimen/hqtb/core/refinement/extension.py b/src/specimen/hqtb/core/refinement/extension.py index cbbee35..afd0711 100644 --- a/src/specimen/hqtb/core/refinement/extension.py +++ b/src/specimen/hqtb/core/refinement/extension.py @@ -32,7 +32,7 @@ from refinegems.utility.io import kegg_reaction_parser, load_a_table_from_database from refinegems.utility.entities import create_random_id, get_reaction_annotation_dict, match_id_to_namespace -from refinegems.analysis.investigate import run_memote +from refinegems.utility.connections import run_memote # further required programs: # - DIAMOND, tested with version 0.9.14 (works only for certain sensitivity mode) diff --git a/src/specimen/hqtb/core/refinement/smoothing.py b/src/specimen/hqtb/core/refinement/smoothing.py index d01dc07..8924bb6 100644 --- a/src/specimen/hqtb/core/refinement/smoothing.py +++ b/src/specimen/hqtb/core/refinement/smoothing.py @@ -13,26 +13,12 @@ from typing import Literal import warnings -from BOFdat import step1 -from BOFdat import step2 -from BOFdat.util import update -from BOFdat.util.update import determine_coefficients -from MCC import MassChargeCuration - import cobra from cobra import Reaction from refinegems.curation.biomass import test_biomass_presence, check_normalise_biomass -from refinegems.analysis.investigate import run_memote from refinegems.classes import egcs - -# further required programs: -# - BOFdat -# - MassChargeCuration - -# note: -# for BOFdat to run correctly, you need to change 'solution.f' to 'solution.objective_value' -# in the coenzymes_and_ions.py file of BOFdat +from refinegems.utility.connections import adjust_BOF, perform_mcc, run_memote ################################################################################ # variables @@ -42,132 +28,6 @@ # functions ################################################################################ -def perform_mcc(model: cobra.Model, dir: str, apply:bool = True) -> cobra.Model: - """Run the MassChargeCuration toll on the model and optionally directly apply - the solution. - - Args: - - model (cobra.Model): - The model to use the tool on. - - dir (str): - Path of the directory to save MCC output in. - - apply (bool, optional): - If True, model is directly updated with the results. - Defaults to True. - - Returns: - cobra.Model: - The model (updated or not) - """ - - # make temporary directory to save files for MCC in - with tempfile.TemporaryDirectory() as temp: - - # use MCC - if apply: - # update model - balancer = MassChargeCuration(model, update_ids = False, data_path=temp) - else: - # do not change original model - model_copy = model.copy() - balancer = MassChargeCuration(model_copy, update_ids = False, data_path=temp) - - # save reports - balancer.generate_reaction_report(Path(dir,model.id+'_mcc_reactions')) - balancer.generate_metabolite_report(Path(dir,model.id+'_mcc_metabolites')) - balancer.generate_visual_report(Path(dir,model.id+'_mcc_visual')) - - return model - - -def adjust_BOF(genome:str, model_file:str, model:cobra.Model, dna_weight_fraction:float, weight_frac:float) -> str: - """Adjust the model's BOF using BOFdat. Currently implemented are step 1 - DNA coefficients and step 2. - - Args: - - genome (str): - Path to the genome (e.g. .fna) FASTA file. - - model_file (str): - Path to the sbml (.xml) file of the model. - - model (cobra.Model): - The genome-scale metabolic model (from the string above), loaded with COBRApy. - - dna_weight_fraction (float): - DNA weight fraction for BOF step 1. - - weight_frac (float): - Weight fraction for the second step of BOFdat (enzymes and ions) - - Returns: - str: - The updated BOF reaction as a reaction string. - """ - - - # BOFdat step 1: - # -------------- - # dna coefficients - dna_coefficients = step1.generate_dna_coefficients(genome,model_file,DNA_WEIGHT_FRACTION=dna_weight_fraction) - bd_step1 = {} - for m in dna_coefficients: - bd_step1[m.id] = dna_coefficients[m] - - # ........................... - # @TODO - # if time permits or needed, options for more coefficients can be added - # ........................... - - # BOFdat step 2: - # -------------- - # find inorganic ions - selected_metabolites = step2.find_coenzymes_and_ions(model_file) - # determine coefficients - bd_step2 = determine_coefficients(selected_metabolites,model,weight_frac) - bd_step2.update(bd_step1) - - # update BOF - # ---------- - # retrieve previously used BOF - growth_func_list = test_biomass_presence(model) - if len(growth_func_list) == 1: - objective_list = model.reactions.get_by_id(growth_func_list[0]).reaction.split(' ') - elif len(growth_func_list) > 1: - mes = f'Multiple BOFs found. Using {growth_func_list[0]} for BOF adjustment.' - warnings.warn(mes,category=UserWarning) - objective_list = model.reactions.get_by_id(growth_func_list[0]).reaction.split(' ') - # else not needed, as if there is no BOF, a new one will be created - - # ............................................................... - objective_reactant = {} - objective_product = {} - product = False - # get reactants, product and factors from equation - for s in objective_list: - if s == '+': - continue - elif '>' in s: - product = True - elif '.' in s: - factor = s - else: - if product: - objective_product[s] = factor - else: - objective_reactant[s] = factor - - # update BOF information with data from BOFdat - for m in bd_step2: - if bd_step2[m] < 0: - objective_reactant[m] = bd_step2[m]*(-1) - else: - objective_product[m] = bd_step2[m] - - # create new objective function - new_objective = ' + '.join('{} {}'.format(value, key) for key, value in objective_reactant.items()) - new_objective += ' --> ' - new_objective += ' + '.join('{} {}'.format(value, key) for key, value in objective_product.items()) - - return new_objective - - def run(genome:str,model:str,dir:str,mcc='skip', egc_solver:None|Literal['greedy']=None, namespace:Literal['BiGG']='BiGG',