Skip to content

Commit

Permalink
Merge pull request #731 from etetoolkit/ete4_tutorial_taxa
Browse files Browse the repository at this point in the history
update GTDB module and Taxa tutorial
  • Loading branch information
jordibc authored Nov 10, 2023
2 parents e9f96a3 + f2e4d31 commit 3dcd719
Show file tree
Hide file tree
Showing 5 changed files with 253 additions and 19 deletions.
2 changes: 1 addition & 1 deletion doc/tutorial/tutorial_smartview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ For node faces in collapsed clades, modify *collapsed_only* argument to True in
:alt: alternative text
:align: center

"mundo" TextFace shown in branch_right of node "n1" only when node is collapsed with argument **collapsed_only=False**
"mundo" TextFace shown in branch_right of node "n1" only when node is collapsed with argument **collapsed_only=True**

.. image:: https://github.com/dengzq1234/ete4_gallery/blob/master/smartview/faceposition_collapsed_after.png?raw=true
:alt: alternative text
Expand Down
130 changes: 118 additions & 12 deletions doc/tutorial/tutorial_taxonomy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,15 @@ a parsed version of it in `~/.local/share/ete/` by default. All future
imports of NCBITaxa or GTDBTaxa will detect the local database and
will skip this step.

Example::
NCBI Example::

# Load NCBI module
from ete4 import NCBITaxa
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()

GTDB Example::

# Load GTDB module
from ete4 import GTDBTaxa
gtdb = GTDBTaxa()
Expand All @@ -78,8 +80,9 @@ Example::
from ete4 import GTDBTaxa
gtdb = GTDBTaxa()

# latest release updated in https://github.com/dengzq1234/ete-data/tree/main/gtdb_taxonomy
# Default latest release updated in https://github.com/dengzq1234/ete-data/tree/main/gtdb_taxonomy
gtdb.update_taxonomy_database()

# or
gtdb.update_taxonomy_database("gtdbdump.tar.gz")

Expand Down Expand Up @@ -192,7 +195,6 @@ Like NCBITaxa, GTDBTaxa contains similar methods:
GTDBTaxa.translate_to_names
GTDBTaxa.get_name_lineage


Getting descendant taxa
-----------------------

Expand Down Expand Up @@ -371,40 +373,144 @@ used name, lineage and rank translators.

Remember that species names in PhyloTree instances are automatically
extracted from leaf names. The parsing method can be easily adapted to
any formatting:
any formatting

NCBI taxonomy example::
Here are some examples using the NCBI taxonomic annotation.

1)Using the whole leaf name as taxonomic identifier::
from ete4 import PhyloTree
tree = PhyloTree('((9606, 9598), 10090);')

# Load the whole leaf name as species taxid.
# pass name as taxid identifier to annotate_ncbi_taxa
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name")
print(tree.to_str(props=["name", "sci_name", "taxid"]))
# ╭╴9606,Bacteriovorax stolpii,9606
# ╭╴⊗,Bdellovibrionota,3018035╶┤
# ╴⊗,Bacteria,2╶┤ ╰╴9598,Bdellovibrio bacteriovorus,9598
# │
# ╰╴10090,Ancylobacter aquaticus,10090

2)Using `sp_naming_function` to define `species` attribute for each node::
from ete4 import PhyloTree
# a) Load the whole leaf name as species attribute of each node.
tree = PhyloTree('((9606, 9598), 10090);', sp_naming_function=lambda name: name)
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()

# pass `species` as taxid identifier to annotate_ncbi_taxa
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species")

# Or annotate using only the name as taxid identifier.
tree = PhyloTree('((9606, 9598), 10090);')
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="name")
print(tree.to_str(props=["name", "sci_name", "taxid"]))
# ╭╴9606,Homo sapiens,9606
# ╭╴⊗,Homininae,207598╶┤
# ╴⊗,Euarchontoglires,314146╶┤ ╰╴9598,Pan troglodytes,9598
# │
# ╰╴10090,Mus musculus,10090


# b) Only take part of the leaf name as species attribute of each node.
# Split names by '|' and return the first part as the species taxid.
tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);', sp_naming_function=lambda name: name.split('|')[0])
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="species")

print(tree.to_str(props=["name", "sci_name", "taxid"]))
# ╭╴9606|protA,Homo sapiens,9606
# ╭╴⊗,Homininae,207598╶┤
# ╴⊗,Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598
# │
# ╰╴10090|protB,Mus musculus,10090

3)Using custom property as taxid identifier::

from ete4 import PhyloTree

tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);')

# add custom property with namespace "spcode" to each node
tree['9606|protA'].add_prop("spcode", 9606)
tree['9598|protA'].add_prop("spcode", 9598)
tree['10090|protB'].add_prop("spcode", 10090)

# passing the custom property name as taxid identifier to annotate_ncbi_taxa
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa(taxid_attr="spcode")
print(tree.to_str(props=["name", "sci_name", "spcode"]))
# ╭╴9606|protA,Homo sapiens,9606
# ╭╴(empty),Homininae,207598╶┤
# ╴(empty),Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598
# │
# ╰╴10090|protB,Mus musculus,10090

GTDB taxonomy example::
Similar to above examples but using the GTDB taxonomic annotation::

from ete4 import PhyloTree

from ete4 import GTDBTaxa
# update gtdb taxonomy database
gtdb = GTDBTaxa()
gtdb.update_taxonomy_database()
# Load the whole leaf name as species taxid.
newick = '((p__Huberarchaeota,f__Korarchaeaceae)d__Archaea,o__Peptococcales);'

tree = PhyloTree(newick)
tax2name, tax2track, tax2rank = gtdb.annotate_tree(tree, taxid_attr="name")
tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="name")

print(tree.to_str(props=['sci_name', 'rank']))
# ╭╴p__Huberarchaeota,phylum
# ╭╴d__Archaea,superkingdom╶┤
# ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
# │
# ╰╴o__Peptococcales,order

# Load the whole leaf name(representing genome) as species taxid.
newick = '((GB_GCA_020833055.1),(GB_GCA_003344655.1),(RS_GCF_000019605.1,RS_GCF_003948265.1));'
tree = PhyloTree(newick, sp_naming_function=lambda name: name)
tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="species")
print(tree.to_str(props=['name', 'sci_name', 'rank']))
# ╭╴⊗,GB_GCA_020833055.1,subspecies╶╌╴GB_GCA_020833055.1,s__Korarchaeum sp020833055,subspecies
# │
# ╴⊗,g__Korarchaeum,genus╶┼╴⊗,GB_GCA_003344655.1,subspecies╶╌╴GB_GCA_003344655.1,s__Korarchaeum sp003344655,subspecies
# │
# │ ╭╴RS_GCF_000019605.1,s__Korarchaeum cryptofilum,subspecies
# ╰╴⊗,s__Korarchaeum cryptofilum,species╶┤
# ╰╴RS_GCF_003948265.1,s__Korarchaeum cryptofilum,subspecies


# Split names by '|' and return the first part as the species taxid.
newick = '((GB_GCA_020833055.1|protA:1):1,(GB_GCA_003344655.1|protB:1):1,(RS_GCF_000019605.1|protC:1,RS_GCF_003948265.1|protD:1):1):1;'
tree = PhyloTree(newick, sp_naming_function=lambda name: name.split('|')[0])
tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="species")
print(tree.to_str(props=['name', 'sci_name', 'rank']))
# ╭╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶╌╴GB_GCA_020833055.1|protA,s__Korarchaeum cryptofilum,subspecies,GB_GCA_020833055.1
# │
# ╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶┼╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶╌╴GB_GCA_003344655.1|protB,s__Korarchaeum cryptofilum,subspecies,GB_GCA_003344655.1
# │
# │ ╭╴RS_GCF_000019605.1|protC,s__Korarchaeum cryptofilum,subspecies,RS_GCF_000019605.1
# ╰╴⊗,s__Korarchaeum cryptofilum,subspecies,⊗╶┤
# ╰╴RS_GCF_003948265.1|protD,s__Korarchaeum cryptofilum,subspecies,RS_GCF_003948265.1
# using custom property as taxid identifier
newick = '((protA:1),(protB:1):1,(protC:1,protD:1):1):1;'
tree = PhyloTree(newick)
annotate_dict = {
'protA': 'GB_GCA_020833055.1',
'protB': 'GB_GCA_003344655.1',
'protC': 'RS_GCF_000019605.1',
'protD': 'RS_GCF_003948265.1',
}

for key, value in annotate_dict.items():
tree[key].add_prop('gtdb_spcode', value)

tax2name, tax2track, tax2rank = tree.annotate_gtdb_taxa(taxid_attr="gtdb_spcode")
print(tree.to_str(props=['name', 'sci_name', 'rank']))
# ╭╴⊗,s__Korarchaeum cryptofilum,subspecies╶╌╴protA,s__Korarchaeum cryptofilum,subspecies
# │
# ╴⊗,s__Korarchaeum cryptofilum,subspecies╶┼╴⊗,s__Korarchaeum cryptofilum,subspecies╶╌╴protB,s__Korarchaeum cryptofilum,subspecies
# │
# │ ╭╴protC,s__Korarchaeum cryptofilum,subspecies
# ╰╴⊗,s__Korarchaeum cryptofilum,subspecies╶┤
# ╰╴protD,s__Korarchaeum cryptofilum,subspecies
11 changes: 5 additions & 6 deletions ete4/gtdb_taxonomy/gtdbquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,14 +378,14 @@ def get_topology(self, taxnames, intermediate_nodes=False, rank_limit=None,
leaves = set([v for v, count in Counter(subtree).items() if count == 1])
tax2name = self.get_taxid_translator(list(subtree))
name2tax ={spname:taxid for taxid,spname in tax2name.items()}
nodes[root_taxid] = PhyloTree(name=root_taxid)
nodes[root_taxid] = PhyloTree({'name': str(root_taxid)})
current_parent = nodes[root_taxid]
for tid in subtree:
if tid in visited:
current_parent = nodes[tid].up
else:
visited.add(tid)
nodes[tid] = PhyloTree(name=tax2name.get(tid, ''))
nodes[tid] = PhyloTree({'name': tax2name.get(tid, '')})
current_parent.add_child(nodes[tid])
if tid not in leaves:
current_parent = nodes[tid]
Expand Down Expand Up @@ -480,7 +480,7 @@ def annotate_tree(self, t, taxid_attr='name',
for n in t.traverse():
try:
# translate gtdb name -> id
taxaname = n.props.get(taxid_attr)
taxaname = getattr(n, taxid_attr, n.props.get(taxid_attr))
tid = self.get_name_translator([taxaname])[taxaname][0]
taxids.add(tid)
except (KeyError, ValueError, AttributeError):
Expand All @@ -507,8 +507,7 @@ def annotate_tree(self, t, taxid_attr='name',
n2leaves = t.get_cached_content()

for node in t.traverse('postorder'):
node_taxid = node.props.get(taxid_attr)

node_taxid = getattr(node, taxid_attr, node.props.get(taxid_attr))
node.add_prop('taxid', node_taxid)

if node_taxid:
Expand All @@ -531,7 +530,7 @@ def annotate_tree(self, t, taxid_attr='name',
rank = tax2rank.get(tmp_taxid, 'Unknown'),
named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track.get(tmp_taxid, [])])
elif node.is_leaf:
node.add_props(sci_name = node.props.get(taxid_attr, 'NA'),
node.add_props(sci_name = getattr(node, taxid_attr, node.props.get(taxid_attr, 'NA')),
common_name = '',
lineage = [],
rank = 'Unknown',
Expand Down
85 changes: 85 additions & 0 deletions tests/test_gtdbquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,91 @@ def test_01tree_annotation(self):
self.assertEqual(caballeronia.props.get('named_lineage'),
['root', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria',
'o__Burkholderiales', 'f__Burkholderiaceae', 'g__Caballeronia', 's__Caballeronia udeis'])

def test_02tree_annotation(self):
# using name as species attribute
tree = PhyloTree('((GB_GCA_011358815.1,RS_GCF_003948265.1),(GB_GCA_003344655.1),(GB_GCA_011056255.1));',
sp_naming_function=lambda name: name)
tree.annotate_gtdb_taxa(dbfile=DATABASE_PATH, taxid_attr='species')

self.assertEqual(tree.props.get('sci_name'), 'g__Korarchaeum')

cryptofilum = tree['GB_GCA_011358815.1'].up
self.assertEqual(cryptofilum.props.get('taxid'), 's__Korarchaeum cryptofilum')
self.assertEqual(cryptofilum.props.get('sci_name'), 's__Korarchaeum cryptofilum')
self.assertEqual(cryptofilum.props.get('rank'), 'species')
self.assertEqual(cryptofilum.props.get('named_lineage'),
['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia',
'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', 's__Korarchaeum cryptofilum'])

sp003344655 = tree['GB_GCA_003344655.1']
self.assertEqual(sp003344655.props.get('taxid'), 'GB_GCA_003344655.1')
self.assertEqual(sp003344655.props.get('sci_name'), 's__Korarchaeum sp003344655')
self.assertEqual(sp003344655.props.get('rank'), 'subspecies')
self.assertEqual(sp003344655.props.get('named_lineage'),
['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia',
'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum',
's__Korarchaeum sp003344655', 'GB_GCA_003344655.1'])

def test_03tree_annotation(self):
# assign species attribute via sp_naming_function
tree = PhyloTree('((GB_GCA_011358815.1|protA,RS_GCF_003948265.1|protB),(GB_GCA_003344655.1|protC),(GB_GCA_011056255.1|protD));',
sp_naming_function=lambda name: name.split('|')[0])
tree.annotate_gtdb_taxa(taxid_attr='species')

self.assertEqual(tree.props.get('sci_name'), 'g__Korarchaeum')

cryptofilum = tree['GB_GCA_011358815.1|protA'].up
self.assertEqual(cryptofilum.props.get('taxid'), 's__Korarchaeum cryptofilum')
self.assertEqual(cryptofilum.props.get('sci_name'), 's__Korarchaeum cryptofilum')
self.assertEqual(cryptofilum.props.get('rank'), 'species')
self.assertEqual(cryptofilum.props.get('named_lineage'),
['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia',
'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', 's__Korarchaeum cryptofilum'])

sp003344655 = tree['GB_GCA_003344655.1|protC']
self.assertEqual(sp003344655.props.get('taxid'), 'GB_GCA_003344655.1')
self.assertEqual(sp003344655.props.get('sci_name'), 's__Korarchaeum sp003344655')
self.assertEqual(sp003344655.props.get('rank'), 'subspecies')
self.assertEqual(sp003344655.props.get('named_lineage'),
['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia',
'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum',
's__Korarchaeum sp003344655', 'GB_GCA_003344655.1'])

def test_04tree_annotation(self):
# Using custom property as taxonomic identifier
tree = PhyloTree('((protA:1, protB:1):1,(protC:1),(protD:1):1):1;')
annotate_dict = {
'protA': 'GB_GCA_011358815.1',
'protB': 'RS_GCF_003948265.1',
'protC': 'GB_GCA_003344655.1',
'protD': 'GB_GCA_011056255.1',
}
for key, value in annotate_dict.items():
tree[key].add_prop('gtdb_spcode', value)

tree.annotate_gtdb_taxa(taxid_attr="gtdb_spcode")

self.assertEqual(tree.props.get('sci_name'), 'g__Korarchaeum')

cryptofilum = tree['protA'].up
self.assertEqual(cryptofilum.props.get('taxid'), 's__Korarchaeum cryptofilum')
self.assertEqual(cryptofilum.props.get('sci_name'), 's__Korarchaeum cryptofilum')
self.assertEqual(cryptofilum.props.get('rank'), 'species')
self.assertEqual(cryptofilum.props.get('named_lineage'),
['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia',
'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum', 's__Korarchaeum cryptofilum'])

sp003344655 = tree['protC']
self.assertEqual(sp003344655.props.get('taxid'), 'GB_GCA_003344655.1')
self.assertEqual(sp003344655.props.get('sci_name'), 's__Korarchaeum sp003344655')
self.assertEqual(sp003344655.props.get('rank'), 'subspecies')
self.assertEqual(sp003344655.props.get('named_lineage'),
['root', 'd__Archaea', 'p__Thermoproteota', 'c__Korarchaeia',
'o__Korarchaeales', 'f__Korarchaeaceae', 'g__Korarchaeum',
's__Korarchaeum sp003344655', 'GB_GCA_003344655.1'])



def test_gtdbquery(self):
gtdb = GTDBTaxa(dbfile=DATABASE_PATH)
Expand Down
Loading

0 comments on commit 3dcd719

Please sign in to comment.