From ba20128990d3986a4fa04eb2de528cf7b96dd947 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Wed, 8 Nov 2023 18:25:15 +0100 Subject: [PATCH 01/12] update taxonomy documentation& sync gtdb module --- doc/tutorial/tutorial_taxonomy.rst | 78 ++++++++++++++++++++++++++++-- ete4/gtdb_taxonomy/gtdbquery.py | 11 ++--- 2 files changed, 79 insertions(+), 10 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index eda99887e..6a96f0d9f 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -379,11 +379,31 @@ NCBI taxonomy example:: # Load the whole leaf name as species taxid. tree = PhyloTree('((9606, 9598), 10090);', sp_naming_function=lambda name: name) - tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa() + tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species") # as default + + # Or annotate using only the name as taxid identifier. + tree = PhyloTree('((9606, 9598), 10090);') + tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name") + print(tree.to_str(props=["name", "sci_name", "taxid"])) + # ╭╴9606,Bacteriovorax stolpii,960 + # ╭╴⊗,Bdellovibrionota,3018035╶┤ + # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,959 + # │ + # ╰╴10090,Ancylobacter aquaticus,100 # Split names by '|' and return the first part as the species taxid. 
tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);', sp_naming_function=lambda name: name.split('|')[0]) - tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa() + tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species") + + # using custom property as taxid identifier + tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);') + + # add custom property with namespace "spcode" to each node + tree['9606|protA'].add_prop("spcode", 9606) + tree['9598|protA'].add_prop("spcode", 9598) + tree['10090|protB'].add_prop("spcode", 10090) + + tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="spcode") print(tree.to_str(props=["name", "sci_name", "taxid"])) # ╭╴9606|protA,Homo sapiens,9606 @@ -398,9 +418,8 @@ GTDB taxonomy example:: # Load the whole leaf name as species taxid. newick = '((p__Huberarchaeota,f__Korarchaeaceae)d__Archaea,o__Peptococcales);' - tree = PhyloTree(newick) - tax2name, tax2track, tax2rank = gtdb.annotate_tree(tree, taxid_attr="name") + tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="name") print(tree.to_str(props=['sci_name', 'rank'])) # ╭╴p__Huberarchaeota,phylum @@ -408,3 +427,54 @@ GTDB taxonomy example:: # ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family # │ # ╰╴o__Peptococcales,order + + # Load the whole leaf name(representing genome) as species taxid. 
+ newick = '((GB_GCA_020833055.1),(GB_GCA_003344655.1),(RS_GCF_000019605.1,RS_GCF_003948265.1));' + tree = PhyloTree(newick, sp_naming_function=lambda name: name) + tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="species") + + print(tree.to_str(props=['name', 'sci_name', 'rank'])) + # ╭╴⊗,GB_GCA_020833055.1,subspecies╶╌╴GB_GCA_020833055.1,s__Korarchaeum sp020833055,subspecies + # │ + # ╴⊗,g__Korarchaeum,genus╶┼╴⊗,GB_GCA_003344655.1,subspecies╶╌╴GB_GCA_003344655.1,s__Korarchaeum sp003344655,subspecies + # │ + # │ ╭╴RS_GCF_000019605.1,s__Korarchaeum cryptofilum,subspecies + # ╰╴⊗,s__Korarchaeum cryptofilum,species╶┤ + # ╰╴RS_GCF_003948265.1,s__Korarchaeum cryptofilum,subspecies + + + # Split names by '|' and return the first part as the species taxid. + newick = '((GB_GCA_020833055.1|protA:1):1,(GB_GCA_003344655.1|protB:1):1,(RS_GCF_000019605.1|protC:1,RS_GCF_003948265.1|protD:1):1):1;' + tree = PhyloTree(newick, sp_naming_function=lambda name: name.split('|')[0]) + tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="species") + print(tree.to_str(props=['name', 'sci_name', 'rank'])) + # ╭╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶╌╴GB_GCA_020833055.1|protA,s__Korarchaeum cryptofilum,subspecies,GB_GCA_020833055.1 + # │ + # ╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶┼╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶╌╴GB_GCA_003344655.1|protB,s__Korarchaeum cryptofilum,subspecies,GB_GCA_003344655.1 + # │ + # │ ╭╴RS_GCF_000019605.1|protC,s__Korarchaeum cryptofilum,subspecies,RS_GCF_000019605.1 + # ╰╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶┤ + # ╰╴RS_GCF_003948265.1|protD,s__Korarchaeum cryptofilum,subspecies,RS_GCF_003948265.1 + + # using custom property as taxid identifier + newick = '((protA:1),(protB:1):1,(protC:1,protD:1):1):1;' + tree = PhyloTree(newick) + annotate_dict = { + 'protA': 'GB_GCA_020833055.1', + 'protB': 'GB_GCA_003344655.1', + 'protC': 'RS_GCF_000019605.1', + 'protD': 'RS_GCF_003948265.1', + } + + for key, value in 
annotate_dict.items(): + tree[key].add_prop('gtdb_spcode', value) + + tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="gtdb_spcode") + print(tree.to_str(props=['name', 'sci_name', 'rank'])) + # ╭╴⊗,s__Korarchaeum cryptofilum,subspecies╶╌╴protA,s__Korarchaeum cryptofilum,subspecies + # │ + # ╴⊗,s__Korarchaeum cryptofilum,subspecies╶┼╴⊗,s__Korarchaeum cryptofilum,subspecies╶╌╴protB,s__Korarchaeum cryptofilum,subspecies + # │ + # │ ╭╴protC,s__Korarchaeum cryptofilum,subspecies + # ╰╴⊗,s__Korarchaeum cryptofilum,subspecies╶┤ + # ╰╴protD,s__Korarchaeum cryptofilum,subspecies diff --git a/ete4/gtdb_taxonomy/gtdbquery.py b/ete4/gtdb_taxonomy/gtdbquery.py index 4363cc755..6f1d658be 100644 --- a/ete4/gtdb_taxonomy/gtdbquery.py +++ b/ete4/gtdb_taxonomy/gtdbquery.py @@ -378,14 +378,14 @@ def get_topology(self, taxnames, intermediate_nodes=False, rank_limit=None, leaves = set([v for v, count in Counter(subtree).items() if count == 1]) tax2name = self.get_taxid_translator(list(subtree)) name2tax ={spname:taxid for taxid,spname in tax2name.items()} - nodes[root_taxid] = PhyloTree(name=root_taxid) + nodes[root_taxid] = PhyloTree({'name': str(root_taxid)}) current_parent = nodes[root_taxid] for tid in subtree: if tid in visited: current_parent = nodes[tid].up else: visited.add(tid) - nodes[tid] = PhyloTree(name=tax2name.get(tid, '')) + nodes[tid] = PhyloTree({'name': tax2name.get(tid, '')}) current_parent.add_child(nodes[tid]) if tid not in leaves: current_parent = nodes[tid] @@ -480,7 +480,7 @@ def annotate_tree(self, t, taxid_attr='name', for n in t.traverse(): try: # translate gtdb name -> id - taxaname = n.props.get(taxid_attr) + taxaname = getattr(n, taxid_attr, n.props.get(taxid_attr)) tid = self.get_name_translator([taxaname])[taxaname][0] taxids.add(tid) except (KeyError, ValueError, AttributeError): @@ -507,8 +507,7 @@ def annotate_tree(self, t, taxid_attr='name', n2leaves = t.get_cached_content() for node in t.traverse('postorder'): - node_taxid = 
node.props.get(taxid_attr) - + node_taxid = getattr(n, taxid_attr, n.props.get(taxid_attr)) node.add_prop('taxid', node_taxid) if node_taxid: @@ -531,7 +530,7 @@ def annotate_tree(self, t, taxid_attr='name', rank = tax2rank.get(tmp_taxid, 'Unknown'), named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track.get(tmp_taxid, [])]) elif node.is_leaf: - node.add_props(sci_name = node.props.get(taxid_attr, 'NA'), + node.add_props(sci_name = getattr(n, taxid_attr, n.props.get(taxid_attr, 'NA')), common_name = '', lineage = [], rank = 'Unknown', From 01234c62e52167deaca75829bc05c9a72ee5e9f6 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Wed, 8 Nov 2023 19:55:56 +0100 Subject: [PATCH 02/12] cosmetic on ncbi documentation --- doc/tutorial/tutorial_taxonomy.rst | 36 ++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 6a96f0d9f..2d4526d72 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -371,15 +371,32 @@ used name, lineage and rank translators. Remember that species names in PhyloTree instances are automatically extracted from leaf names. The parsing method can be easily adapted to -any formatting: +any formatting -NCBI taxonomy example:: +Here are some examples using the NCBI taxonomic annotation. +1)Using the whole leaf name as taxonomic identifier:: + from ete4 import PhyloTree + tree = PhyloTree('((9606, 9598), 10090);') - # Load the whole leaf name as species taxid. 
+ # pass name as taxid identifier to annotate_ncbi_taxa + tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name") + print(tree.to_str(props=["name", "sci_name", "taxid"])) + # ╭╴9606,Bacteriovorax stolpii,960 + # ╭╴⊗,Bdellovibrionota,3018035╶┤ + # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,959 + # │ + # ╰╴10090,Ancylobacter aquaticus,100 + +2)Using `sp_naming_function` to define `species` attribute for each node:: + + from ete4 import PhyloTree + # a) Load the whole leaf name as species attribute of each node. tree = PhyloTree('((9606, 9598), 10090);', sp_naming_function=lambda name: name) - tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species") # as default + + # pass `species` as taxid identifier to annotate_ncbi_taxa + tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species") # Or annotate using only the name as taxid identifier. tree = PhyloTree('((9606, 9598), 10090);') @@ -391,11 +408,16 @@ NCBI taxonomy example:: # │ # ╰╴10090,Ancylobacter aquaticus,100 + + # b) Only take part of the leaf name as species attribute of each node. # Split names by '|' and return the first part as the species taxid. 
tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);', sp_naming_function=lambda name: name.split('|')[0]) tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species") - # using custom property as taxid identifier +3)Using custom property as taxid identifier:: + + from ete4 import PhyloTree + tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);') # add custom property with namespace "spcode" to each node @@ -403,9 +425,9 @@ NCBI taxonomy example:: tree['9598|protA'].add_prop("spcode", 9598) tree['10090|protB'].add_prop("spcode", 10090) + # passing the custom property name as taxid identifier to annotate_ncbi_taxa tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="spcode") - - print(tree.to_str(props=["name", "sci_name", "taxid"])) + print(tree.to_str(props=["name", "sci_name", "spcode"])) # ╭╴9606|protA,Homo sapiens,9606 # ╭╴(empty),Homininae,207598╶┤ # ╴(empty),Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598 From b2672b6fbccdcf2a6213a74a88c5d0c045a76683 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Wed, 8 Nov 2023 20:02:52 +0100 Subject: [PATCH 03/12] cosmetic --- doc/tutorial/tutorial_taxonomy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 2d4526d72..035c0b860 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -434,7 +434,7 @@ Here are some examples using the NCBI taxonomic annotation. 
# │ # ╰╴10090|protB,Mus musculus,10090 -GTDB taxonomy example:: +Examples using the NCBI taxonomic annotation:: from ete4 import PhyloTree From f32274b147fe1eaa4461bc95cba5c33d1dc96007 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 11:23:52 +0100 Subject: [PATCH 04/12] correct typo --- doc/tutorial/tutorial_smartview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial/tutorial_smartview.rst b/doc/tutorial/tutorial_smartview.rst index 099b46dcb..cc2924a60 100644 --- a/doc/tutorial/tutorial_smartview.rst +++ b/doc/tutorial/tutorial_smartview.rst @@ -678,7 +678,7 @@ For node faces in collapsed clades, modify *collapsed_only* argument to True in :alt: alternative text :align: center -"mundo" TextFace shown in branch_right of node "n1" only when node is collapsed with argument **collapsed_only=False** +"mundo" TextFace shown in branch_right of node "n1" only when node is collapsed with argument **collapsed_only=True** .. image:: https://github.com/dengzq1234/ete4_gallery/blob/master/smartview/faceposition_collapsed_after.png?raw=true :alt: alternative text From 6312755a651058b2330a24aed6f34247a2237a38 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 15:42:03 +0100 Subject: [PATCH 05/12] add example and correct typo --- doc/tutorial/tutorial_taxonomy.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 035c0b860..0c3094c18 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -383,11 +383,11 @@ Here are some examples using the NCBI taxonomic annotation. 
# pass name as taxid identifier to annotate_ncbi_taxa tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name") print(tree.to_str(props=["name", "sci_name", "taxid"])) - # ╭╴9606,Bacteriovorax stolpii,960 + # ╭╴9606,Bacteriovorax stolpii,9606 # ╭╴⊗,Bdellovibrionota,3018035╶┤ - # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,959 + # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,9598 # │ - # ╰╴10090,Ancylobacter aquaticus,100 + # ╰╴10090,Ancylobacter aquaticus,10090 2)Using `sp_naming_function` to define `species` attribute for each node:: @@ -402,11 +402,11 @@ Here are some examples using the NCBI taxonomic annotation. tree = PhyloTree('((9606, 9598), 10090);') tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name") print(tree.to_str(props=["name", "sci_name", "taxid"])) - # ╭╴9606,Bacteriovorax stolpii,960 + # ╭╴9606,Bacteriovorax stolpii,9606 # ╭╴⊗,Bdellovibrionota,3018035╶┤ - # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,959 + # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,9598 # │ - # ╰╴10090,Ancylobacter aquaticus,100 + # ╰╴10090,Ancylobacter aquaticus,10090 # b) Only take part of the leaf name as species attribute of each node. @@ -414,6 +414,13 @@ Here are some examples using the NCBI taxonomic annotation. 
tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);', sp_naming_function=lambda name: name.split('|')[0]) tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species") + print(tree.to_str(props=["name", "sci_name", "taxid"])) + # ╭╴9606|protA,Homo sapiens,9606 + # ╭╴⊗,Homininae,207598╶┤ + # ╴⊗,Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598 + # │ + # ╰╴10090|protB,Mus musculus,10090 + 3)Using custom property as taxid identifier:: from ete4 import PhyloTree From 021f33017f2cab2db2af2ae38fe34912b08ea762 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 20:59:03 +0100 Subject: [PATCH 06/12] correct typo --- doc/tutorial/tutorial_taxonomy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 0c3094c18..6183f10b3 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -441,7 +441,7 @@ Here are some examples using the NCBI taxonomic annotation. 
# │ # ╰╴10090|protB,Mus musculus,10090 -Examples using the NCBI taxonomic annotation:: +Examples using the GTDB taxonomic annotation:: from ete4 import PhyloTree From c21006eac22eff2e5734e5006d1e9212db66d3f1 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 21:29:36 +0100 Subject: [PATCH 07/12] correct typo of variable --- ete4/gtdb_taxonomy/gtdbquery.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ete4/gtdb_taxonomy/gtdbquery.py b/ete4/gtdb_taxonomy/gtdbquery.py index 6f1d658be..8c1305135 100644 --- a/ete4/gtdb_taxonomy/gtdbquery.py +++ b/ete4/gtdb_taxonomy/gtdbquery.py @@ -507,7 +507,7 @@ def annotate_tree(self, t, taxid_attr='name', n2leaves = t.get_cached_content() for node in t.traverse('postorder'): - node_taxid = getattr(n, taxid_attr, n.props.get(taxid_attr)) + node_taxid = getattr(node, taxid_attr, node.props.get(taxid_attr)) node.add_prop('taxid', node_taxid) if node_taxid: @@ -530,7 +530,7 @@ def annotate_tree(self, t, taxid_attr='name', rank = tax2rank.get(tmp_taxid, 'Unknown'), named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track.get(tmp_taxid, [])]) elif node.is_leaf: - node.add_props(sci_name = getattr(n, taxid_attr, n.props.get(taxid_attr, 'NA')), + node.add_props(sci_name = getattr(node, taxid_attr, node.props.get(taxid_attr, 'NA')), common_name = '', lineage = [], rank = 'Unknown', From 7ba094e1113e2ea87c56b7e1dc03b74f56e5b80a Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 22:31:21 +0100 Subject: [PATCH 08/12] update unittest of annotate_tree() --- tests/test_gtdbquery.py | 85 +++++++++++++++++++++++++++++++++++++++++ tests/test_ncbiquery.py | 44 +++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/tests/test_gtdbquery.py b/tests/test_gtdbquery.py index aa6818173..57f9649db 100644 --- a/tests/test_gtdbquery.py +++ b/tests/test_gtdbquery.py @@ -48,6 +48,91 @@ def test_01tree_annotation(self): self.assertEqual(caballeronia.props.get('named_lineage'), ['root', 
'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Burkholderiales', 'f__Burkholderiaceae', 'g__Caballeronia', 's__Caballeronia udeis']) + + def test_02tree_annotation(self): + # using name as species attribute + tree = PhyloTree('((GB_GCA_011358815.1,RS_GCF_003948265.1),(GB_GCA_003344655.1),(GB_GCA_011056255.1));', + sp_naming_function=lambda name: name) + tree.annotate_gtdb_taxa(dbfile=DATABASE_PATH, taxid_attr='species') + + self.assertEqual(tree.props.get('sci_name'), 'g__Korarchaeum') + + cryptofilum = tree['GB_GCA_011358815.1'].up + self.assertEqual(cryptofilum.props.get('taxid'), 's__Korarchaeum cryptofilum') + self.assertEqual(cryptofilum.props.get('sci_name'), 's__Korarchaeum cryptofilum') + self.assertEqual(cryptofilum.props.get('rank'), 'species') + self.assertEqual(cryptofilum.props.get('named_lineage'), + ['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia', + 'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', 's__Korarchaeum cryptofilum']) + + sp003344655 = tree['GB_GCA_003344655.1'] + self.assertEqual(sp003344655.props.get('taxid'), 'GB_GCA_003344655.1') + self.assertEqual(sp003344655.props.get('sci_name'), 's__Korarchaeum sp003344655') + self.assertEqual(sp003344655.props.get('rank'), 'subspecies') + self.assertEqual(sp003344655.props.get('named_lineage'), + ['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia', + 'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', + 's__Korarchaeum sp003344655', 'GB_GCA_003344655.1']) + + def test_03tree_annotation(self): + # assign species attribute via sp_naming_function + tree = PhyloTree('((GB_GCA_011358815.1|protA,RS_GCF_003948265.1|protB),(GB_GCA_003344655.1|protC),(GB_GCA_011056255.1|protD));', + sp_naming_function=lambda name: name.split('|')[0]) + tree.annotate_gtdb_taxa(taxid_attr='species') + + self.assertEqual(tree.props.get('sci_name'), 'g__Korarchaeum') + + cryptofilum = tree['GB_GCA_011358815.1|protA'].up + self.assertEqual(cryptofilum.props.get('taxid'), 
's__Korarchaeum cryptofilum') + self.assertEqual(cryptofilum.props.get('sci_name'), 's__Korarchaeum cryptofilum') + self.assertEqual(cryptofilum.props.get('rank'), 'species') + self.assertEqual(cryptofilum.props.get('named_lineage'), + ['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia', + 'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', 's__Korarchaeum cryptofilum']) + + sp003344655 = tree['GB_GCA_003344655.1|protC'] + self.assertEqual(sp003344655.props.get('taxid'), 'GB_GCA_003344655.1') + self.assertEqual(sp003344655.props.get('sci_name'), 's__Korarchaeum sp003344655') + self.assertEqual(sp003344655.props.get('rank'), 'subspecies') + self.assertEqual(sp003344655.props.get('named_lineage'), + ['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia', + 'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', + 's__Korarchaeum sp003344655', 'GB_GCA_003344655.1']) + + def test_04tree_annotation(self): + # Using custom property as taxonomic identifier + tree = PhyloTree('((protA:1, protB:1):1,(protC:1),(protD:1):1):1;') + annotate_dict = { + 'protA': 'GB_GCA_011358815.1', + 'protB': 'RS_GCF_003948265.1', + 'protC': 'GB_GCA_003344655.1', + 'protD': 'GB_GCA_011056255.1', + } + for key, value in annotate_dict.items(): + tree[key].add_prop('gtdb_spcode', value) + + tree.annotate_gtdb_taxa(taxid_attr="gtdb_spcode") + + self.assertEqual(tree.props.get('sci_name'), 'g__Korarchaeum') + + cryptofilum = tree['protA'].up + self.assertEqual(cryptofilum.props.get('taxid'), 's__Korarchaeum cryptofilum') + self.assertEqual(cryptofilum.props.get('sci_name'), 's__Korarchaeum cryptofilum') + self.assertEqual(cryptofilum.props.get('rank'), 'species') + self.assertEqual(cryptofilum.props.get('named_lineage'), + ['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia', + 'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', 's__Korarchaeum cryptofilum']) + + sp003344655 = tree['protC'] + self.assertEqual(sp003344655.props.get('taxid'), 
'GB_GCA_003344655.1') + self.assertEqual(sp003344655.props.get('sci_name'), 's__Korarchaeum sp003344655') + self.assertEqual(sp003344655.props.get('rank'), 'subspecies') + self.assertEqual(sp003344655.props.get('named_lineage'), + ['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia', + 'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', + 's__Korarchaeum sp003344655', 'GB_GCA_003344655.1']) + + def test_gtdbquery(self): gtdb = GTDBTaxa(dbfile=DATABASE_PATH) diff --git a/tests/test_ncbiquery.py b/tests/test_ncbiquery.py index 9a002eb21..99278a1e0 100644 --- a/tests/test_ncbiquery.py +++ b/tests/test_ncbiquery.py @@ -15,6 +15,7 @@ def test_00_update_database(self): def test_01tree_annotation(self): + # using name as species attribute t = PhyloTree( "((9598, 9606), 10090);", sp_naming_function=lambda name: name) t.annotate_ncbi_taxa(dbfile=DATABASE_PATH) self.assertEqual(t.props.get('sci_name'), 'Euarchontoglires') @@ -33,6 +34,48 @@ def test_01tree_annotation(self): self.assertEqual(human.props.get('named_lineage'), [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae', u'Homo', u'Homo sapiens']) self.assertEqual(human.props.get('lineage'), [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605, 9606]) + def test_02tree_annotation(self): + # assign species attribute via sp_naming_function + t = PhyloTree( "((9598|protA, 9606|protB), 10090|propC);", sp_naming_function=lambda name: 
name.split('|')[0]) + t.annotate_ncbi_taxa(dbfile=DATABASE_PATH, taxid_attr='species') + + homi = t['9606|protB'].up + self.assertEqual(homi.props.get('sci_name'), 'Homininae') + self.assertEqual(homi.props.get('taxid'), 207598) + self.assertEqual(homi.props.get('rank'), 'subfamily') + self.assertEqual(homi.props.get('named_lineage'), [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae']) + self.assertEqual(homi.props.get('lineage'), [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598] ) + + human = t['9606|protB'] + self.assertEqual(human.props.get('sci_name'), 'Homo sapiens') + self.assertEqual(human.props.get('taxid'), 9606) + self.assertEqual(human.props.get('rank'), 'species') + self.assertEqual(human.props.get('named_lineage'), [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae', u'Homo', u'Homo sapiens']) + self.assertEqual(human.props.get('lineage'), [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 
314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605, 9606]) + + def test_03tree_annotation(self): + # Using custom property as taxonomic identifier + t = PhyloTree( "((protA, protB), propC);") + # add property called "spcode" + t['protA'].add_prop('spcode', 9598) + t['protB'].add_prop('spcode', 9606) + t['propC'].add_prop('spcode', 10090) + t.annotate_ncbi_taxa(dbfile=DATABASE_PATH, taxid_attr='spcode') + + homi = t['protB'].up + self.assertEqual(homi.props.get('sci_name'), 'Homininae') + self.assertEqual(homi.props.get('taxid'), 207598) + self.assertEqual(homi.props.get('rank'), 'subfamily') + self.assertEqual(homi.props.get('named_lineage'), [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae']) + self.assertEqual(homi.props.get('lineage'), [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598] ) + + human = t['protB'] + self.assertEqual(human.props.get('sci_name'), 'Homo sapiens') + self.assertEqual(human.props.get('taxid'), 9606) + self.assertEqual(human.props.get('rank'), 'species') + self.assertEqual(human.props.get('named_lineage'), [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa', u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata', u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi', u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota', u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires', u'Primates', 
u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea', u'Hominidae', u'Homininae', u'Homo', u'Homo sapiens']) + self.assertEqual(human.props.get('lineage'), [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605, 9606]) + def test_ncbiquery(self): ncbi = NCBITaxa(dbfile=DATABASE_PATH) @@ -95,3 +138,4 @@ def test_merged_id(self): if __name__ == '__main__': unittest.main() + From b94de98d5f4eb5ec71ed77925f42bba3eb578ec9 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 22:58:02 +0100 Subject: [PATCH 09/12] update taxonomy documentation --- doc/tutorial/tutorial_taxonomy.rst | 42 ++++++++++++++---------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 6183f10b3..bddee5a47 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -62,13 +62,15 @@ a parsed version of it in `~/.local/share/ete/` by default. All future imports of NCBITaxa or GTDBTaxa will detect the local database and will skip this step. -Example:: +NCBI Example:: # Load NCBI module from ete4 import NCBITaxa ncbi = NCBITaxa() ncbi.update_taxonomy_database() +GTDB Example:: + # Load GTDB module from ete4 import GTDBTaxa gtdb = GTDBTaxa() @@ -96,13 +98,12 @@ NCBI taxonomy You can fetch species names, ranks and linage track information for your taxids using the following methods: -.. autosummary:: - NCBITaxa.get_rank - NCBITaxa.get_lineage - NCBITaxa.get_taxid_translator - NCBITaxa.get_name_translator - NCBITaxa.translate_to_names +- NCBITaxa.get_rank() +- NCBITaxa.get_lineage() +- NCBITaxa.get_taxid_translator() +- NCBITaxa.get_name_translator() +- NCBITaxa.translate_to_names() The so called get-translator functions will return a dictionary converting between taxids and species names. Either species or linage 
Either species or linage @@ -183,15 +184,12 @@ fetch and relate taxonomic information. Like NCBITaxa, GTDBTaxa contains similar methods: -.. autosummary:: - - GTDBTaxa.get_rank - GTDBTaxa.get_lineage - GTDBTaxa.get_taxid_translator - GTDBTaxa.get_name_translator - GTDBTaxa.translate_to_names - GTDBTaxa.get_name_lineage - +- GTDBTaxa.get_rank() +- GTDBTaxa.get_lineage() +- GTDBTaxa.get_taxid_translator() +- GTDBTaxa.get_name_translator() +- GTDBTaxa.translate_to_names() +- GTDBTaxa.get_name_lineage() Getting descendant taxa ----------------------- @@ -402,11 +400,11 @@ Here are some examples using the NCBI taxonomic annotation. tree = PhyloTree('((9606, 9598), 10090);') tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name") print(tree.to_str(props=["name", "sci_name", "taxid"])) - # ╭╴9606,Bacteriovorax stolpii,9606 - # ╭╴⊗,Bdellovibrionota,3018035╶┤ - # ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,9598 - # │ - # ╰╴10090,Ancylobacter aquaticus,10090 + # ╭╴9606,Homo sapiens,9606 + # ╭╴⊗,Homininae,207598╶┤ + # ╴⊗,Euarchontoglires,314146╶┤ ╰╴9598,Pan troglodytes,9598 + # │ + # ╰╴10090,Mus musculus,10090 # b) Only take part of the leaf name as species attribute of each node. @@ -441,7 +439,7 @@ Here are some examples using the NCBI taxonomic annotation. 
# │ # ╰╴10090|protB,Mus musculus,10090 -Examples using the GTDB taxonomic annotation:: +Similar to above examples but using the GTDB taxonomic annotation:: from ete4 import PhyloTree From 68968fd9d96a6a0f8b63d1187417eed7d769364c Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Thu, 9 Nov 2023 23:01:27 +0100 Subject: [PATCH 10/12] add line to update taxonomy database --- doc/tutorial/tutorial_taxonomy.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index bddee5a47..92b3252a6 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -80,8 +80,9 @@ GTDB Example:: from ete4 import GTDBTaxa gtdb = GTDBTaxa() - # latest release updated in https://github.com/dengzq1234/ete-data/tree/main/gtdb_taxonomy + # Default latest release updated in https://github.com/dengzq1234/ete-data/tree/main/gtdb_taxonomy gtdb.update_taxonomy_database() + # or gtdb.update_taxonomy_database("gtdbdump.tar.gz") @@ -442,7 +443,12 @@ Here are some examples using the NCBI taxonomic annotation. Similar to above examples but using the GTDB taxonomic annotation:: from ete4 import PhyloTree - + from ete4 import GTDBTaxa + + # update gtdb taxonomy database + gtdb = GTDBTaxa() + gtdb.update_taxonomy_database() + # Load the whole leaf name as species taxid. 
newick = '((p__Huberarchaeota,f__Korarchaeaceae)d__Archaea,o__Peptococcales);' tree = PhyloTree(newick) From f20985bd59d230d2e319f2ccf5728498e084729f Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Fri, 10 Nov 2023 10:31:00 +0100 Subject: [PATCH 11/12] back to autosummary --- doc/tutorial/tutorial_taxonomy.rst | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 92b3252a6..26d1635b6 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -99,12 +99,13 @@ NCBI taxonomy You can fetch species names, ranks and linage track information for your taxids using the following methods: +.. autosummary:: -- NCBITaxa.get_rank() -- NCBITaxa.get_lineage() -- NCBITaxa.get_taxid_translator() -- NCBITaxa.get_name_translator() -- NCBITaxa.translate_to_names() + NCBITaxa.get_rank + NCBITaxa.get_lineage + NCBITaxa.get_taxid_translator + NCBITaxa.get_name_translator + NCBITaxa.translate_to_names The so called get-translator functions will return a dictionary converting between taxids and species names. Either species or linage @@ -185,12 +186,14 @@ fetch and relate taxonomic information. Like NCBITaxa, GTDBTaxa contains similar methods: -- GTDBTaxa.get_rank() -- GTDBTaxa.get_lineage() -- GTDBTaxa.get_taxid_translator() -- GTDBTaxa.get_name_translator() -- GTDBTaxa.translate_to_names() -- GTDBTaxa.get_name_lineage() +.. 
autosummary:: + + GTDBTaxa.get_rank + GTDBTaxa.get_lineage + GTDBTaxa.get_taxid_translator + GTDBTaxa.get_name_translator + GTDBTaxa.translate_to_names + GTDBTaxa.get_name_lineage Getting descendant taxa ----------------------- From f2e4d31fb38553f2143a14b12d739016e0f9da97 Mon Sep 17 00:00:00 2001 From: dengzq1234 Date: Fri, 10 Nov 2023 10:36:31 +0100 Subject: [PATCH 12/12] extract space --- doc/tutorial/tutorial_taxonomy.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/tutorial/tutorial_taxonomy.rst b/doc/tutorial/tutorial_taxonomy.rst index 26d1635b6..7c5b2a252 100644 --- a/doc/tutorial/tutorial_taxonomy.rst +++ b/doc/tutorial/tutorial_taxonomy.rst @@ -188,12 +188,12 @@ Like NCBITaxa, GTDBTaxa contains similar methods: .. autosummary:: - GTDBTaxa.get_rank - GTDBTaxa.get_lineage - GTDBTaxa.get_taxid_translator - GTDBTaxa.get_name_translator - GTDBTaxa.translate_to_names - GTDBTaxa.get_name_lineage + GTDBTaxa.get_rank + GTDBTaxa.get_lineage + GTDBTaxa.get_taxid_translator + GTDBTaxa.get_name_translator + GTDBTaxa.translate_to_names + GTDBTaxa.get_name_lineage Getting descendant taxa -----------------------