Fixing steps in hqtb #7

draeger-lab · Oct 1, 2024 · f3b15a1 · f3b15a1
1 parent f5fcaed
commit f3b15a1
Show file tree

Hide file tree

Showing 12 changed files with 182 additions and 163 deletions.
diff --git a/src/specimen/data/config/hqtb_advanced_config_expl.yaml b/src/specimen/data/config/hqtb_advanced_config_expl.yaml
@@ -73,15 +73,15 @@ parameters:
     generate_draft_model:
         # if the gene ids of the template model do not match the
         # identifiers of the annotated genome file, use this option to edit them
-        edit_names: no
+        edit_names: "no"
         # filter for genes based on percentage identity value
         pid: 80.0
         # Which medium to save the model with.
         medium: default
 
     refinement_extension:
         # Id to use for checking (change not advised)
-        id: locus_tag
+        id: null
         # change the sensitivity of DIAMOND
         sensitivity: more-sensitive
         coverage: 95.0
@@ -90,6 +90,7 @@ parameters:
         # include for knowledge-based model)
         exclude_dna: True
         exclude_rna: True
+        email: USER
 
     refinement_cleanup:
         # Options for duplicate checking/removal
@@ -100,6 +101,7 @@ parameters:
         remove_dupl_meta: True
         # Optional for gapfilling (input is a media config file, refineGEMs standart)
         media_gap: null
+        growth_threshold: 0.05
 
     refinement_annotation:
         # for KEGG pathway annotation

diff --git a/src/specimen/data/config/hqtb_basic_config_expl.yaml b/src/specimen/data/config/hqtb_basic_config_expl.yaml
@@ -65,14 +65,15 @@ parameters:
     generate_draft_model:
         # if the gene ids of the template model do not match the
         # identifiers of the annotated genome file, use this option to edit them
-        edit_names: no
+        edit_names: "no"
         pid: 80.0
 
     refinement_extension:
         # set the sensitivity for DIAMOND
         sensitivity: more-sensitive
         coverage: 95.0
         pid: 90.0
+        email: USER
 
     refinement_cleanup:
         # current default means no gapfilling

diff --git a/src/specimen/data/config/hqtb_config_default.yaml b/src/specimen/data/config/hqtb_config_default.yaml
@@ -69,13 +69,13 @@ parameters:
         sensitivity: more-sensitive
 
     generate_draft_model:
-        edit_names: no
+        edit_names: "no"
         pid: 80.0
         medium: default
 
     refinement_extension:
         # default (usually) fine
-        id: locus_tag
+        id: null
         # default fine
         sensitivity: more-sensitive
         # default alright but good to edit for trying different options
@@ -84,16 +84,18 @@ parameters:
         # default almost needed, except for special cases
         exclude_dna: True
         exclude_rna: True
+        email: USER
 
     refinement_cleanup:
-        # default as standart
+        # default as standard
         check_dupl_reac: True
         check_dupl_meta: default
         remove_unused_meta: False
         remove_dupl_reac: True
         remove_dupl_meta: True
         # current default means no gapfilling
         media_gap: null
+        growth_threshold: 0.05
 
     refinement_annotation:
         # for KEGG pathway annotation

diff --git a/src/specimen/hqtb/core/analysis.py b/src/specimen/hqtb/core/analysis.py
@@ -74,7 +74,7 @@ def run(model_path:str, dir:str,
         print('Given directory already has required structure.')
 
     # load model
-    model = load_model(model_path,'cobra')
+    model = load_model(str(model_path),'cobra')
 
     # ------------------
     # general statistics

diff --git a/src/specimen/hqtb/core/bidirectional_blast.py b/src/specimen/hqtb/core/bidirectional_blast.py
@@ -63,6 +63,7 @@ def extract_cds(file: str, name: str, dir: str, collect_info: list, identifier:
                 ):
 
                     for record in SeqIO.parse(f,"genbank"):
+                        x = 1
                         if record.features:
                             for feature in record.features:
 
@@ -132,14 +133,14 @@ def create_diamond_db(dir: str, name: str, path: str, threads: int):
     """
 
     # check if database already exists
-    if os.path.isfile(Path(dir,'DIAMONDdb',name+'.dmnd')):
+    if os.path.isfile(Path(dir,'db',name+'.dmnd')):
         print(F'database for {name} already exists')
     else:
         # generate new database using diamond makedb
         print(F'create DIAMOND database for {name} using:')
-        print(F'diamond makedb --in {path} -d {str(Path(dir,"db",name))} -p {int(threads)}')
+        print(F'diamond makedb --in {path} -d {str(Path(dir,"db",name+".dmnd"))} -p {int(threads)}')
         start = time.time()
-        subprocess.run([F'diamond makedb --in {path} -d {str(Path(dir,"db",name))} -p {int(threads)}'], shell=True)
+        subprocess.run(["diamond", "makedb", "--in", path, "-d", str(Path(dir,"db",name+".dmnd")), "-p", str(threads)], shell=True)
         end = time.time()
         print(F'\t time: {end - start}s')
 
@@ -172,7 +173,7 @@ def run_diamond_blastp(dir: str, db: str, query: str, fasta_path:str , sensitivi
         print(F'blast {query} against {db} using:')
         print(F'diamond blastp -d {str(Path(dir,"db/",db+".dmnd"))} -q {F"{fasta_path}"} --{sensitivity} -p {int(threads)} -o {outname} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen ')
         start = time.time()
-        subprocess.run([F'diamond blastp -d {str(Path(dir,"db",db+".dmnd"))} -q {F"{fasta_path}"} --{sensitivity} -p {int(threads)} -o {outname} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen '], shell=True)
+        subprocess.run(["diamond", "blastp", "-d", str(Path(dir,"db",db+".dmnd")), "-q", fasta_path, "--"+sensitivity, "-p", str(threads), "-o", outname, "--outfmt", "6", "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen"], shell=True)
         end = time.time()
         print(F'\ttime: {end - start}s')
 
@@ -309,7 +310,7 @@ def run(template:str, input:str, dir:str,
             Defaults to None.
         - temp_header (str, optional): 
             Feature qualifier of the gbff (NCBI) / faa (PROKKA) of the template to use as header for the FASTA files.
-            If None is given, sets it based on file extension (currently only implememted for gbff and faa).
+            If None is given, sets it based on file extension (currently only implemented for gbff and faa).
             Defaults to 'protein_id'.
         - in_header (str, optional): 
             Feature qualifier of the gbff (NCBI) / faa (PROKKA) of the input to use as header for the FASTA files. 

diff --git a/src/specimen/hqtb/core/generate_draft_model.py b/src/specimen/hqtb/core/generate_draft_model.py
@@ -377,7 +377,6 @@ def run(template:str, bpbbh:str, dir:str,
     # -----------
     # check input
     # -----------
-
     if edit_names not in ['no','dot-to-underscore']:
         raise ValueError(F'Edit_names value {edit_names} not in list of allowed values: no, dot-to-underscore')
 
@@ -403,7 +402,7 @@ def run(template:str, bpbbh:str, dir:str,
     if len(growth_objfunc) == 1:
         template_model.objective = growth_objfunc[0]
     elif len(growth_objfunc) > 1:
-        mes = f'Multiple BOF detected. Chosing the following: {growth_objfunc[0]}'
+        mes = f'Multiple BOF detected. Choosing the following: {growth_objfunc[0]}'
         warnings.warn(mes, category=UserWarning)
         template_model.objective = growth_objfunc[0]
     else:
@@ -435,7 +434,7 @@ def run(template:str, bpbbh:str, dir:str,
     # -------------------
 
     if memote:
-        memote_path = Path(dir,'step1-extension',name+'.html')
+        memote_path = str(Path(dir,name+'.html'))
         run_memote(draft, 'html', return_res=False, save_res=memote_path, verbose=True)
 
     total_time_e = time.time()

diff --git a/src/specimen/hqtb/core/refinement/annotation.py b/src/specimen/hqtb/core/refinement/annotation.py
@@ -54,9 +54,15 @@ def kegg_reaction_to_kegg_pathway(model:cobra.Model, viaEC:bool=False, viaRC:boo
 
             # via reaction
             try:
-                reaction = kegg_reaction_parser(reac.annotation['kegg.reaction'])
-                if 'db' in reaction and 'kegg.pathway' in reaction['db']:
-                    pathways = reaction['db']['kegg.pathway']
+                if isinstance(reac.annotation['kegg.reaction'],list):
+                    for annotation in reac.annotation['kegg.reaction']:
+                        reaction = kegg_reaction_parser(annotation)
+                        if reaction is not None and 'db' in reaction and 'kegg.pathway' in reaction['db']:
+                           pathways.append(reaction['db']['kegg.pathway'])
+                else:
+                    reaction = kegg_reaction_parser(reac.annotation['kegg.reaction'])
+                    if reaction is not None and 'db' in reaction and 'kegg.pathway' in reaction['db']:
+                        pathways = reaction['db']['kegg.pathway']
             except urllib.error.HTTPError:
                 print(F'HTTPError: {reac.id}, {reac.annotation["kegg.reaction"]}')
             except ConnectionResetError:
@@ -175,13 +181,13 @@ def run(model:str, dir:str, kegg_viaEC:bool=False,
     libsbml_model = libsbml_doc.getModel()
 
     libsbml_model = run_SBOannotator(libsbml_model)
-    write_model_to_file(libsbml_model, Path(dir,'step3-annotation',libsbml_model.getId()+'_SBOannotated.xml'))
+    write_model_to_file(libsbml_model, str(Path(dir,'step3-annotation',libsbml_model.getId()+'_SBOannotated.xml')))
 
     end = time.time()
     print(F'\ttime: {end - start}s')
 
     # reload model
-    model = load_model(Path(dir,'step3-annotation',libsbml_model.getId()+'_SBOannotated.xml'), 'cobra')
+    model = load_model(str(Path(dir,'step3-annotation',libsbml_model.getId()+'_SBOannotated.xml')), 'cobra')
 
     # ................................................................
     # @EXTENDABLE
@@ -204,7 +210,7 @@ def run(model:str, dir:str, kegg_viaEC:bool=False,
     # ----------
     # save model
     # ----------
-    cobra.io.write_sbml_model(model, Path(dir,'step3-annotation',{model.id}+'_annotated.xml'))
+    cobra.io.write_sbml_model(model, Path(dir,'step3-annotation',model.id+'_annotated.xml'))
 
     # ---------------------------------
     # assess model quality using memote
@@ -213,7 +219,7 @@ def run(model:str, dir:str, kegg_viaEC:bool=False,
     if memote:
         start = time.time()
         name = model.id
-        memote_path = Path(dir,'step3-annotation',name+'_annotated.html')
+        memote_path = str(Path(dir,'step3-annotation',name+'_annotated.html'))
         run_memote(model, 'html', return_res=False, save_res=memote_path, verbose=True)
         end = time.time()
         print(F'\ttotal time: {end - start}s')
diff --git a/src/specimen/hqtb/core/refinement/cleanup.py b/src/specimen/hqtb/core/refinement/cleanup.py
@@ -499,5 +499,5 @@ def run(model:str, dir:str,
     # ---------------------------------
 
     if memote:
-        memote_path = Path(dir,'step2-clean-up',name+'.html')
+        memote_path = str(Path(dir,'step2-clean-up',name+'.html'))
         run_memote(model, 'html', return_res=False, save_res=memote_path, verbose=True)