+[docs]
def update_taxonomy_database(self, taxdump_file=None):
    """Update the GTDB taxonomy database.

    It updates it by downloading and parsing the latest
    gtdbtaxdump.tar.gz file. The work is delegated to ``update_db``,
    which receives the path of this instance's database file.

    :param taxdump_file: Alternative location of gtdbtaxdump.tar.gz.
        It is forwarded unchanged as the ``targz_file`` argument.
    """
    update_db(self.dbfile, targz_file=taxdump_file)
+
+
def _connect(self):
    """Open a SQLite connection to ``self.dbfile`` and store it in ``self.db``."""
    self.db = sqlite3.connect(self.dbfile)
+
def _translate_merged(self, all_taxids):
    """Replace obsolete taxids by their current equivalents.

    The ``merged`` table is queried for every given taxid that appears
    as ``taxid_old``.

    :param all_taxids: Iterable of taxids (anything ``int()`` accepts).
    :return: Tuple ``(taxids, conversion)`` where ``taxids`` is the set
        of given taxids (as ints) with each merged one replaced by its
        new value, and ``conversion`` maps old taxid -> new taxid for
        the ones that were replaced.
    :raises ValueError: If a taxid cannot be converted to ``int``.
    """
    conv_all_taxids = {int(taxid) for taxid in all_taxids}
    conversion = {}
    if not conv_all_taxids:
        return conv_all_taxids, conversion  # nothing to look up

    # Only int-normalised values are interpolated, so the statement
    # cannot be altered by the content of all_taxids.
    cmd = ('select taxid_old, taxid_new FROM merged WHERE taxid_old IN (%s)'
           % ','.join(str(taxid) for taxid in conv_all_taxids))

    for old, new in self.db.execute(cmd).fetchall():
        conv_all_taxids.discard(int(old))
        conv_all_taxids.add(int(new))
        conversion[int(old)] = int(new)
    return conv_all_taxids, conversion
+
+
+ # def get_fuzzy_name_translation(self, name, sim=0.9):
+ # '''
+ # Given an inexact species name, returns the best match in the NCBI database of taxa names.
+ # :argument 0.9 sim: Min word similarity to report a match (from 0 to 1).
+ # :return: taxid, species-name-match, match-score
+ # '''
+
+
+ # import sqlite3.dbapi2 as dbapi2
+ # _db = dbapi2.connect(self.dbfile)
+ # _db.enable_load_extension(True)
+ # module_path = os.path.split(os.path.realpath(__file__))[0]
+ # _db.execute("select load_extension('%s')" % os.path.join(module_path,
+ # "SQLite-Levenshtein/levenshtein.sqlext"))
+
+ # print("Trying fuzzy search for %s" % name)
+ # maxdiffs = math.ceil(len(name) * (1-sim))
+ # cmd = 'SELECT taxid, spname, LEVENSHTEIN(spname, "%s") AS sim FROM species WHERE sim<=%s ORDER BY sim LIMIT 1;' % (name, maxdiffs)
+ # taxid, spname, score = None, None, len(name)
+ # result = _db.execute(cmd)
+ # try:
+ # taxid, spname, score = result.fetchone()
+ # except TypeError:
+ # cmd = 'SELECT taxid, spname, LEVENSHTEIN(spname, "%s") AS sim FROM synonym WHERE sim<=%s ORDER BY sim LIMIT 1;' % (name, maxdiffs)
+ # result = _db.execute(cmd)
+ # try:
+ # taxid, spname, score = result.fetchone()
+ # except:
+ # pass
+ # else:
+ # taxid = int(taxid)
+ # else:
+ # taxid = int(taxid)
+
+ # norm_score = 1 - (float(score)/len(name))
+ # if taxid:
+ # print("FOUND! %s taxid:%s score:%s (%s)" %(spname, taxid, score, norm_score))
+
+ # return taxid, spname, norm_score
+
+
+[docs]
def get_rank(self, taxids):
    """Return dictionary converting taxids to their GTDB taxonomy rank.

    ``None`` and empty-string entries are ignored. Taxids that are not
    present in the ``species`` table are absent from the result.

    :param taxids: Iterable of (hashable) taxids.
    :return: Dict mapping taxid (as stored in the database) -> rank.
    """
    wanted = set(taxids) - {None, ''}
    # Standard single-quoted SQL literals: double-quoted tokens are
    # identifiers in SQL and only work as strings through a legacy
    # SQLite fallback.
    ids = ','.join("'%s'" % str(v).replace("'", "''") for v in wanted)
    result = self.db.execute('SELECT taxid, rank FROM species WHERE taxid IN (%s)' % ids)
    return {tax: rank for tax, rank in result.fetchall()}
+
+
+
+[docs]
def get_lineage_translator(self, taxids):
    """Return dict with the lineage track of each of the given taxids.

    For every taxid found in the ``species`` table, the stored
    comma-separated ``track`` is split, converted to ints and reversed.
    ``None`` and empty-string entries are ignored; unknown taxids are
    absent from the result.

    :param taxids: Iterable of (hashable) taxids.
    :return: Dict mapping taxid -> list of taxids (reversed track).
    """
    all_ids = set(taxids) - {None, ''}
    # Standard single-quoted SQL literals instead of relying on
    # SQLite's double-quoted string fallback.
    query = ','.join("'%s'" % str(v).replace("'", "''") for v in all_ids)
    result = self.db.execute('SELECT taxid, track FROM species WHERE taxid IN (%s);' % query)
    return {tax: [int(t) for t in reversed(track.split(","))]
            for tax, track in result.fetchall()}
+
+
+
+[docs]
def get_name_lineage(self, taxnames):
    """Return the named lineage of each of the given taxa names.

    Each name is translated with ``get_name_translator``; the first
    taxid returned for it is expanded with ``get_lineage`` and the
    resulting taxids are converted back to names.

    :param taxnames: List of taxa names.
    :return: List with one single-entry dict per translated name,
        ``{name: [lineage names...]}``. Names without translation do
        not appear.
    """
    name_lineages = []
    name2taxid = self.get_name_translator(taxnames)
    for name, found_taxids in name2taxid.items():
        lineage = self.get_lineage(found_taxids[0])  # only the first hit is used
        names = self.get_taxid_translator(lineage)
        name_lineages.append({name: [names[taxid] for taxid in lineage]})

    return name_lineages
+
+
+
+[docs]
def get_lineage(self, taxid):
    """Return the lineage track corresponding to the given taxid.

    The stored comma-separated ``track`` of the taxid is split,
    converted to ints and reversed. If the taxid is not in the
    ``species`` table, it is looked up among the merged (obsolete)
    taxids and, when a replacement exists, the replacement's track is
    returned and a warning is issued.

    :param taxid: Taxid (anything ``int()`` accepts). A falsy value
        returns None.
    :return: List of taxids, or None for a falsy ``taxid``.
    :raises ValueError: If neither the taxid nor a merged replacement
        is found.
    """
    if not taxid:
        return None
    taxid = int(taxid)
    query = 'SELECT track FROM species WHERE taxid=?'
    raw_track = self.db.execute(query, (taxid,)).fetchone()
    if not raw_track:
        # Perhaps it is an obsolete taxid.
        _, merged_conversion = self._translate_merged([taxid])
        if taxid in merged_conversion:
            raw_track = self.db.execute(query, (merged_conversion[taxid],)).fetchone()
        if not raw_track:
            raise ValueError("%s taxid not found" % taxid)
        warnings.warn("taxid %s was translated into %s" % (taxid, merged_conversion[taxid]))

    return [int(t) for t in reversed(raw_track[0].split(","))]
+
+
+
+[docs]
def get_common_names(self, taxids):
    """Return dict with the common names of the given taxids.

    Only taxids found in the ``species`` table with a non-empty
    ``common`` value appear in the result.

    :param taxids: Iterable of taxids.
    :return: Dict mapping taxid -> common name.
    """
    # Standard single-quoted SQL literals instead of relying on
    # SQLite's double-quoted string fallback.
    query = ','.join("'%s'" % str(v).replace("'", "''") for v in taxids)
    cmd = "select taxid, common FROM species WHERE taxid IN (%s);" % query
    result = self.db.execute(cmd)
    return {tax: common_name for tax, common_name in result.fetchall() if common_name}
+
+
+
+[docs]
def get_taxid_translator(self, taxids, try_synonyms=True):
    """Given a list of taxids, returns a dictionary with their corresponding
    scientific names.

    ``None`` and empty-string entries are skipped; every other entry is
    converted with ``int()``. Taxids that are not in the ``species``
    table are absent from the result.

    :param taxids: Iterable of taxids.
    :param try_synonyms: Currently unused. NOTE: a fallback that looked
        up untranslated taxids in the merged table used to live here
        but is disabled.
    :return: Dict mapping taxid -> scientific name.
    :raises ValueError: If an entry cannot be converted to ``int``.
    """
    # Filter placeholders *before* the int conversion; int(None) and
    # int('') raise, so discarding them afterwards had no effect.
    all_ids = {int(taxid) for taxid in taxids if taxid is not None and taxid != ''}
    query = ','.join(str(taxid) for taxid in all_ids)
    cmd = "select taxid, spname FROM species WHERE taxid IN (%s);" % query
    result = self.db.execute(cmd)
    return {tax: spname for tax, spname in result.fetchall()}
+
+
+
+[docs]
def get_name_translator(self, names):
    """
    Given a list of taxid scientific names, returns a dictionary translating them into their corresponding taxids.
    Exact name match is required for translation.

    Names are lowercased for the lookup and the result is keyed by the
    name as originally given. The ``species`` table is searched first;
    names without a hit there are then searched in ``synonym``.

    :param names: Iterable of names (strings).
    :return: Dict mapping original name -> list of matching taxids.
        Names without any match are absent.
    """
    name2origname = {n.lower(): n for n in names}
    name2id = {}

    def collect(table, wanted):
        # Escape single quotes so that names containing quotes cannot
        # break (or alter) the statement.
        query = ','.join("'%s'" % n.replace("'", "''") for n in wanted)
        result = self.db.execute('select spname, taxid from %s where spname IN (%s)' % (table, query))
        for sp, taxid in result.fetchall():
            name2id.setdefault(name2origname[sp.lower()], []).append(taxid)

    collect('species', name2origname)
    missing = set(name2origname) - {n.lower() for n in name2id}
    if missing:
        collect('synonym', missing)
    return name2id
+
+
+
+[docs]
def translate_to_names(self, taxids):
    """
    Given a list of taxid numbers, returns another list with their corresponding scientific names.

    Taxids without a translation are returned unchanged, so the result
    always has one entry per input taxid, in the same order.
    """
    id2name = self.get_taxid_translator(taxids)
    return [id2name.get(taxid, taxid) for taxid in taxids]
+
+
+
+
+[docs]
def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None,
                        collapse_subspecies=False, return_tree=False):
    """
    given a parent taxid or scientific species name, returns a list of all its descendants taxids.
    If intermediate_nodes is set to True, internal nodes will also be dumped.

    :param parent: Taxid (anything ``int()`` accepts) or a name that
        ``get_name_translator`` can resolve.
    :param intermediate_nodes: If True, internal nodes are included.
    :param rank_limit, collapse_subspecies: When set, the result is
        derived from ``get_topology`` called with these options.
    :param return_tree: If True, the tree built by ``get_topology`` is
        returned instead of a list.
    :return: ``[taxid]`` if the taxon appears only once in the cached
        traversal; otherwise a list of translated names (or node names
        of the topology), or the tree itself when ``return_tree``.
    :raises ValueError: If the name or the taxid is not found.
    """
    try:
        taxid = int(parent)
    except ValueError:
        try:
            taxid = self.get_name_translator([parent])[parent][0]
        except KeyError:
            raise ValueError('%s not found!' % parent)

    # Check if taxid is a deprecated one, and convert it into the right one.
    _, conversion = self._translate_merged([taxid])
    if conversion:
        taxid = conversion[taxid]

    # NOTE(security): pickle.load executes arbitrary code from the file;
    # the traverse cache must only ever come from a trusted location.
    with open(self.dbfile + ".traverse.pkl", "rb") as cached_traverse:
        prepostorder = pickle.load(cached_traverse)

    # Everything between the first and the second occurrence of taxid
    # in the cached traversal belongs to its subtree.
    descendants = {}
    found = 0
    for tid in prepostorder:
        if tid == taxid:
            found += 1
        elif found == 1:
            descendants[tid] = descendants.get(tid, 0) + 1
        elif found == 2:
            break

    if not found:
        raise ValueError("taxid not found:%s" % taxid)
    elif found == 1:
        return [taxid]

    if rank_limit or collapse_subspecies or return_tree:
        descendants_spnames = self.get_taxid_translator(list(descendants.keys()))
        tree = self.get_topology(list(descendants_spnames.values()),
                                 intermediate_nodes=intermediate_nodes,
                                 collapse_subspecies=collapse_subspecies,
                                 rank_limit=rank_limit)
        if return_tree:
            return tree
        elif intermediate_nodes:
            return [n.name for n in tree.get_descendants()]
        else:
            return [n.name for n in tree]

    if intermediate_nodes:
        return self.translate_to_names(list(descendants))
    # Entries seen exactly once are the tips of the subtree. (The result
    # used to be computed twice, with the first one thrown away.)
    return self.translate_to_names([tid for tid, count in descendants.items() if count == 1])
+
+
+
+[docs]
def get_topology(self, taxnames, intermediate_nodes=False, rank_limit=None,
                 collapse_subspecies=False, annotate=True):
    """Return minimal pruned GTDB taxonomy tree containing all given taxa.

    :param taxnames: List of taxa names. They are translated with
        ``get_name_translator`` and the first taxid of each is used.
    :param intermediate_nodes: If True, single child nodes
        representing the complete lineage of leaf nodes are kept.
        Otherwise, the tree is pruned to contain the first common
        ancestor of each group.
    :param rank_limit: If a valid rank name is provided, the
        tree is pruned at that given level. For instance, use
        rank="species" to get rid of sub-species or strain leaf
        nodes.
    :param collapse_subspecies: If True, any item under the
        species rank will be collapsed into the species upper
        node.
    :param annotate: If True, the resulting tree is passed to
        ``annotate_tree`` before being returned.
    """
    from .. import PhyloTree

    # e.g. {'f__Korarchaeaceae': [2174], 'o__Peptococcales': [205487]}
    tax2id = self.get_name_translator(taxnames)
    taxids = [i[0] for i in tax2id.values()]

    if len(taxids) == 1:
        # Single taxon: rebuild its whole subtree from the cached traversal.
        root_taxid = int(list(taxids)[0])
        # NOTE(security): pickle.load executes arbitrary code from the
        # file; the traverse cache must come from a trusted location.
        with open(self.dbfile + ".traverse.pkl", "rb") as CACHED_TRAVERSE:
            prepostorder = pickle.load(CACHED_TRAVERSE)
        nodes = {}
        visited = set()
        start = prepostorder.index(root_taxid)
        try:
            end = prepostorder.index(root_taxid, start + 1)
            subtree = prepostorder[start:end + 1]
        except ValueError:
            # If root taxid is not found in postorder, must be a tip node
            subtree = [root_taxid]
        # Taxids appearing once in the slice are treated as leaves.
        leaves = set([v for v, count in Counter(subtree).items() if count == 1])
        tax2name = self.get_taxid_translator(list(subtree))
        name2tax = {spname: taxid for taxid, spname in tax2name.items()}
        nodes[root_taxid] = PhyloTree(name=root_taxid)
        current_parent = nodes[root_taxid]
        for tid in subtree:
            if tid in visited:
                # Second visit of an internal node: go back up.
                current_parent = nodes[tid].up
            else:
                visited.add(tid)
                nodes[tid] = PhyloTree(name=tax2name.get(tid, ''))
                current_parent.add_child(nodes[tid])
                if tid not in leaves:
                    current_parent = nodes[tid]
        root = nodes[root_taxid]
    else:
        # Several taxa: merge their lineages into a single tree.
        taxids = set(map(int, taxids))
        sp2track = {}
        elem2node = {}
        id2lineage = self.get_lineage_translator(taxids)
        all_taxids = set()
        for lineage in id2lineage.values():
            all_taxids.update(lineage)
        id2rank = self.get_rank(all_taxids)

        tax2name = self.get_taxid_translator(taxids)
        all_taxid_codes = set([_tax for _lin in list(id2lineage.values()) for _tax in _lin])
        extra_tax2name = self.get_taxid_translator(list(all_taxid_codes - set(tax2name.keys())))
        tax2name.update(extra_tax2name)
        name2tax = {spname: taxid for taxid, spname in tax2name.items()}

        for sp in taxids:
            track = []
            lineage = id2lineage[sp]

            for elem in lineage:
                if elem not in elem2node:
                    node = elem2node.setdefault(elem, PhyloTree())
                    node.name = str(tax2name[elem])
                    node.taxid = str(tax2name[elem])
                    node.add_prop("rank", str(id2rank.get(int(elem), "no rank")))
                else:
                    node = elem2node[elem]
                track.append(node)
            sp2track[sp] = track

        # generate parent child relationships
        for sp, track in sp2track.items():
            parent = None
            for elem in track:
                if parent and elem not in parent.children:
                    parent.add_child(elem)
                if rank_limit and elem.props.get('rank') == rank_limit:
                    break
                parent = elem
        root = elem2node[1]  # taxid 1 is used as the root of every lineage

    # remove onechild-nodes
    if not intermediate_nodes:
        for n in root.descendants():
            if len(n.children) == 1 and int(name2tax.get(n.name, n.name)) not in taxids:
                n.delete(prevent_nondicotomic=False)

    if len(root.children) == 1:
        tree = root.children[0].detach()
    else:
        tree = root

    if collapse_subspecies:
        to_detach = []
        for node in tree.traverse():
            if node.props.get('rank') == 'species':
                to_detach.extend(node.children)
        for n in to_detach:
            n.detach()

    if annotate:
        self.annotate_tree(tree)

    return tree
+
+
+
+[docs]
def annotate_tree(self, t, taxid_attr='name',
                  tax2name=None, tax2track=None, tax2rank=None):
    """Annotate a tree containing taxids as leaf names.

    It annotates by adding the properties 'taxid', 'sci_name',
    'common_name', 'lineage', 'named_lineage' and 'rank'.

    :param t: Tree to annotate.
    :param taxid_attr: Node attribute (property) containing the
        taxid number associated to each node (i.e. species in
        PhyloTree instances). Unless it is "taxid", its value is
        treated as a GTDB name and translated with
        ``get_name_translator``.
    :param tax2name, tax2track, tax2rank: Pre-calculated
        dictionaries with translations from taxid number to names,
        track lineages and ranks. A given ``tax2name`` is updated
        in place with the names of the lineage members.
    :return: Tuple ``(tax2name, tax2track, tax2rank)`` with the
        dictionaries that were used.
    """
    taxids = set()
    if taxid_attr == "taxid":
        for n in t.traverse():
            if taxid_attr in n.props:
                taxids.add(n.props[taxid_attr])
    else:
        for n in t.traverse():
            try:
                # translate gtdb name -> id
                taxaname = n.props.get(taxid_attr)
                tid = self.get_name_translator([taxaname])[taxaname][0]
                taxids.add(tid)
            except (KeyError, ValueError, AttributeError):
                # Best effort: nodes without a translatable name are skipped.
                pass

    taxids, merged_conversion = self._translate_merged(taxids)

    if not tax2name or taxids - set(map(int, list(tax2name.keys()))):
        tax2name = self.get_taxid_translator(taxids)
    if not tax2track or taxids - set(map(int, list(tax2track.keys()))):
        tax2track = self.get_lineage_translator(taxids)

    all_taxid_codes = set([_tax for _lin in list(tax2track.values()) for _tax in _lin])
    extra_tax2name = self.get_taxid_translator(list(all_taxid_codes - set(tax2name.keys())))
    tax2name.update(extra_tax2name)

    tax2common_name = self.get_common_names(tax2name.keys())

    if not tax2rank:
        tax2rank = self.get_rank(list(tax2name.keys()))

    n2leaves = t.get_cached_content()

    # Postorder, so leaves carry their 'lineage' before their ancestors
    # need it to compute the common lineage.
    for node in t.traverse('postorder'):
        node_taxid = node.props.get(taxid_attr)

        node.add_prop('taxid', node_taxid)

        if node_taxid:
            tmp_taxid = self.get_name_translator([node_taxid]).get(node_taxid, [None])[0]

            if node_taxid in merged_conversion:
                node_taxid = merged_conversion[node_taxid]

            rank = tax2rank.get(tmp_taxid, 'Unknown')
            if rank != 'subspecies':
                sci_name = tax2name.get(tmp_taxid, '')
            else:
                # For subspecies, gtdb taxid (like 'RS_GCF_0062.1') is not informative. Better use the species one.
                track = tax2track[tmp_taxid]  # like ['root', 'd__Bacteria', ..., 's__Moorella', 'RS_GCF_0062.1']
                sci_name = tax2name.get(track[-2], '')

            node.add_props(sci_name=sci_name,
                           common_name=tax2common_name.get(node_taxid, ''),
                           lineage=tax2track.get(tmp_taxid, []),
                           rank=rank,
                           named_lineage=[tax2name.get(tax, str(tax)) for tax in tax2track.get(tmp_taxid, [])])
        elif node.is_leaf:
            node.add_props(sci_name=node.props.get(taxid_attr, 'NA'),
                           common_name='',
                           lineage=[],
                           rank='Unknown',
                           named_lineage=[])
        else:
            # Internal node without its own taxid: use the lineage shared
            # by all the leaves under it.
            lineage = self._common_lineage([lf.props.get('lineage') for lf in n2leaves[node]])
            if lineage[-1]:
                ancestor = self.get_taxid_translator([lineage[-1]])[lineage[-1]]
            else:
                ancestor = None
            node.add_props(sci_name=tax2name.get(ancestor, str(ancestor)),
                           common_name=tax2common_name.get(lineage[-1], ''),
                           taxid=ancestor,
                           lineage=lineage,
                           rank=tax2rank.get(lineage[-1], 'Unknown'),
                           named_lineage=[tax2name.get(tax, str(tax)) for tax in lineage])

    return tax2name, tax2track, tax2rank
+
+
def _common_lineage(self, vectors):
    """Return the elements shared by all the given lineages.

    An element counts as common when its number of occurrences equals
    the number of vectors. Common elements are sorted by the smallest
    position at which they appear in any vector.

    :param vectors: List of lineages (sequences of hashable elements).
    :return: Sorted list of common elements, or ``[""]`` if there are
        none (including when ``vectors`` is empty).
    """
    occurrence = defaultdict(int)
    pos = defaultdict(set)
    for v in vectors:
        for i, taxid in enumerate(v):
            occurrence[taxid] += 1
            pos[taxid].add(i)

    common = [taxid for taxid, ocu in occurrence.items() if ocu == len(vectors)]
    if not common:
        return [""]
    return sorted(common, key=lambda x: min(pos[x]))
+
+ # OLD APPROACH:
+
+ # visited = defaultdict(int)
+ # for index, name in [(ei, e) for v in vectors for ei, e in enumerate(v)]:
+ # visited[(name, index)] += 1
+
+ # def _sort(a, b):
+ # if a[1] > b[1]:
+ # return 1
+ # elif a[1] < b[1]:
+ # return -1
+ # else:
+ # if a[0][1] > b[0][1]:
+ # return 1
+ # elif a[0][1] < b[0][1]:
+ # return -1
+ # return 0
+
+ # matches = sorted(visited.items(), _sort)
+
+ # if matches:
+ # best_match = matches[-1]
+ # else:
+ # return "", set()
+
+ # if best_match[1] != len(vectors):
+ # return "", set()
+ # else:
+ # return best_match[0][0], [m[0][0] for m in matches if m[1] == len(vectors)]
+
+
+
+[docs]
def get_broken_branches(self, t, taxa_lineages, n2content=None):
    """Returns a list of GTDB lineage names that are not monophyletic in the
    provided tree, as well as the list of affected branches and their size.
    CURRENTLY EXPERIMENTAL

    :param t: Tree whose leaves expose ``sci_name`` and ``taxid``.
    :param taxa_lineages: Mapping from leaf taxid to its lineage
        (iterable of taxa).
    :param n2content: Optional pre-computed node -> leaves mapping; when
        missing it is obtained from ``t.get_cached_content()``.
    :return: Tuple ``(broken_branches, broken_clades,
        broken_clade_sizes)``: a dict node -> set of broken taxa under
        it, the set of broken taxa, and the number of leaves of each
        broken taxon.
    """
    if not n2content:
        n2content = t.get_cached_content()

    tax2node = defaultdict(set)

    unknown = set()
    for leaf in t.iter_leaves():
        if leaf.sci_name.lower() != "unknown":
            for tax in taxa_lineages[leaf.taxid]:
                tax2node[tax].add(leaf)
        else:
            unknown.add(leaf)

    broken_branches = defaultdict(set)
    broken_clades = set()
    for tax, leaves in tax2node.items():
        if len(leaves) > 1:
            common = t.get_common_ancestor(leaves)
        else:
            common = list(leaves)[0]
        # A taxon is broken when the clade of its common ancestor holds
        # leaves (other than unknown ones) that do not belong to it.
        if (leaves ^ set(n2content[common])) - unknown:
            broken_branches[common].add(tax)
            broken_clades.add(tax)

    broken_clade_sizes = [len(tax2node[tax]) for tax in broken_clades]
    return broken_branches, broken_clades, broken_clade_sizes
-[docs]defis_taxadb_up_to_date(dbfile=DEFAULT_TAXADB):"""Return True if a valid and up-to-date taxa.sqlite database exists.
@@ -71,19 +69,18 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
db.close()
- returnversion==DB_VERSION
-
+ returnversion==DB_VERSION
-[docs]
+[docs]classNCBITaxa:""" A local transparent connector to the NCBI taxonomy database. """
-[docs]
+[docs]def__init__(self,dbfile=None,taxdump_file=None,memory=False,update=True):"""Open and keep a connection to the NCBI taxonomy database.
@@ -120,7 +117,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defupdate_taxonomy_database(self,taxdump_file=None):"""Update the ncbi taxonomy database.
@@ -154,7 +151,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
return conv_all_taxids,conversion
-[docs]
+[docs]defget_fuzzy_name_translation(self,name,sim=0.9):"""Return taxid, species name and match score from the NCBI database.
@@ -203,7 +200,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defget_rank(self,taxids):"""Return dict with NCBI taxonomy ranks for each list of taxids."""all_ids=set(taxids)
@@ -222,7 +219,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defget_lineage_translator(self,taxids):"""Return dict with lineage tracks corresponding to the given taxids.
@@ -244,7 +241,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defget_lineage(self,taxid):"""Return lineage track corresponding to the given taxid.
@@ -276,7 +273,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defget_common_names(self,taxids):query=','.join('"%s"'%vforvintaxids)cmd='SELECT taxid, common FROM species WHERE taxid IN (%s);'%query
@@ -291,7 +288,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defget_taxid_translator(self,taxids,try_synonyms=True):"""Return dict with the scientific names corresponding to the taxids."""all_ids=set(map(int,taxids))
@@ -323,7 +320,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defget_name_translator(self,names):"""Return dict with taxids corresponding to the given scientific names.
@@ -357,7 +354,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]deftranslate_to_names(self,taxids):"""Return list of scientific names corresponding to taxids."""id2name=self.get_taxid_translator(taxids)
@@ -368,7 +365,7 @@
-[docs]
+[docs]defget_topology(self,taxids,intermediate_nodes=False,rank_limit=None,collapse_subspecies=False,annotate=True):"""Return the minimal pruned NCBI taxonomy tree containing taxids.
@@ -532,7 +529,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
-[docs]
+[docs]defannotate_tree(self,t,taxid_attr="name",tax2name=None,tax2track=None,tax2rank=None):"""Annotate a tree containing taxids as leaf names.
@@ -625,7 +622,7 @@
Source code for ete4.ncbi_taxonomy.ncbiquery
return sorted_lineage
-[docs]
+[docs]defget_broken_branches(self,t,taxa_lineages,n2content=None):"""Returns a list of NCBI lineage names that are not monophyletic in the provided tree, as well as the list of affected branches and their size.
diff --git a/_modules/index.html b/_modules/index.html
index 8734c95de..f7ad41bd3 100644
--- a/_modules/index.html
+++ b/_modules/index.html
@@ -35,6 +35,7 @@
diff --git a/_sources/reference/index.rst.txt b/_sources/reference/index.rst.txt
index 4d95c594b..8e94bd376 100644
--- a/_sources/reference/index.rst.txt
+++ b/_sources/reference/index.rst.txt
@@ -10,6 +10,6 @@ Reference Guide
reference_phylo
reference_clustering
reference_seqgroup
- reference_ncbi
+ reference_taxonomy
reference_smartview
reference_treeview
diff --git a/_sources/reference/reference_ncbi.rst.txt b/_sources/reference/reference_ncbi.rst.txt
deleted file mode 100644
index e79cc6d92..000000000
--- a/_sources/reference/reference_ncbi.rst.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-NCBITaxa
-========
-
-.. automodule:: ete4.ncbi_taxonomy.ncbiquery
- :members:
- :undoc-members:
- :special-members: __init__
diff --git a/_sources/reference/reference_phylo.rst.txt b/_sources/reference/reference_phylo.rst.txt
index cc285cef8..3f565b6c4 100644
--- a/_sources/reference/reference_phylo.rst.txt
+++ b/_sources/reference/reference_phylo.rst.txt
@@ -1,12 +1,19 @@
Phylogenetic trees
==================
+PhyloTree
+---------
+
.. autoclass:: ete4.PhyloTree
:members:
:undoc-members:
:show-inheritance:
:special-members: __init__
+
+EvolEvent
+---------
+
.. autoclass:: ete4.phylo.EvolEvent
:members:
:undoc-members:
diff --git a/_sources/reference/reference_taxonomy.rst.txt b/_sources/reference/reference_taxonomy.rst.txt
new file mode 100644
index 000000000..58b5d03e0
--- /dev/null
+++ b/_sources/reference/reference_taxonomy.rst.txt
@@ -0,0 +1,19 @@
+Taxonomy databases
+==================
+
+NCBITaxa
+--------
+
+.. autoclass:: ete4.NCBITaxa
+ :members:
+ :undoc-members:
+ :special-members: __init__
+
+
+GTDBTaxa
+--------
+
+.. autoclass:: ete4.GTDBTaxa
+ :members:
+ :undoc-members:
+ :special-members: __init__
diff --git a/_sources/tutorial/index.rst.txt b/_sources/tutorial/index.rst.txt
index a7a6ea1bf..b95647d7f 100644
--- a/_sources/tutorial/index.rst.txt
+++ b/_sources/tutorial/index.rst.txt
@@ -9,3 +9,4 @@ Contents:
tutorial_trees
tutorial_phylogeny
tutorial_drawing
+ tutorial_taxonomy
diff --git a/_sources/tutorial/tutorial_taxonomy.rst.txt b/_sources/tutorial/tutorial_taxonomy.rst.txt
index ddba7256a..eda99887e 100644
--- a/_sources/tutorial/tutorial_taxonomy.rst.txt
+++ b/_sources/tutorial/tutorial_taxonomy.rst.txt
@@ -1,60 +1,74 @@
.. currentmodule:: ete4
-Connecting with Taxonomy Databases
+Taxonomy databases
==================
.. contents::
Overview
--------
-ETE4 contains *ncbi_taxonomy* and *gtdb_taxonomy* modules which provide
-utilities to efficiently query a local copy of the NCBI or GTDB taxonomy
-databases. The class ``NCBITaxa`` and ``GTDBTaxa`` offer methods to convert
-from taxid to names (and vice versa), to fetch pruned topologies connecting
-a given set of species, or to download rank, names and lineage track information.
-It is also fully integrated with PhyloTree instances through the
-``PhyloNode.annotate_ncbi_taxa()`` and ``PhyloNode.annotate_gtdb_taxa()``method.
+ETE4 contains the *ncbi_taxonomy* and *gtdb_taxonomy* modules which
+provide utilities to efficiently query a local copy of the NCBI or
+GTDB taxonomy databases. The classes :class:`NCBITaxa` and
+:class:`GTDBTaxa` offer methods to convert from taxid to names (and
+vice versa), to fetch pruned topologies connecting a given set of
+species, or to download rank, names and lineage track information.
+
+It is also fully integrated with :class:`PhyloTree` instances through
+the :func:`~PhyloTree.annotate_ncbi_taxa` and
+:func:`~PhyloTree.annotate_gtdb_taxa` methods.
+
Differences between NCBI and GTDB taxonomies in ETE4
----------------------------------------------------
-The NCBI taxonomy database is a comprehensive resource for organism names and
-classifications.It is updated daily and offers multiple access points including a web
-portal, an FTP server. The database releases its data in a package called "taxdump.tar.gz" which
-contains several .dmp files.
+The NCBI taxonomy database is a comprehensive resource for organism
+names and classifications. It is updated daily and offers multiple
+access points, including a web portal and an FTP server. The database
+releases its data in a package called "taxdump.tar.gz" which contains
+several .dmp files.
-Taxon in NCBI taxonomyis usually a numeric identifier, commonly representing
-taxa ("TaxID"), but it can also signify other entities like genetic codes or citations, such as
-9606 represents Homo Sapiens.
+A taxon in the NCBI taxonomy is usually identified by a numeric
+identifier ("TaxID"), although such identifiers can also signify other
+entities like genetic codes or citations. For example, 9606 represents
+Homo sapiens.
-On the other hand, GTDB taxonomy is distributed as simple text files, uses a genome-based
-approach for classification, and the identifiers are usually specific to genomes rather
-than taxa.
+On the other hand, GTDB taxonomy is distributed as simple text files,
+uses a genome-based approach for classification, and the identifiers
+are usually specific to genomes rather than taxa.
-Since ETE Toolkit version 3, ete3 parses taxdump file to local sqlite database to fullfill the
-methods in ncbi_taxonomy module. We applied the same strategy to GTDBTaxa. While the original GTDB
-taxonomy data differs from NCBI taxonomy files, a conversion step is essential for integration.
+Since ETE Toolkit version 3, ETE parses the taxdump file and stores it
+in a local sqlite database to fulfill the methods in the ncbi_taxonomy
+module. We applied the same strategy to GTDBTaxa. Since the original
+GTDB taxonomy data differs from the NCBI taxonomy files, a conversion
+step is essential for integration.
+
+To integrate GTDB into the ETE Toolkit v4, a conversion process is
+necessary. A third-party script
+(https://github.com/nick-youngblut/gtdb_to_taxdump) is employed to
+convert the GTDB taxonomy to the NCBI-like taxdump format. We have
+already prepared GTDB taxonomy dump files for different release
+versions and stored them at
+https://github.com/etetoolkit/ete-data/tree/main/gtdb_taxonomy.
-To integrate GTDB into the ETE Toolkit v4, a conversion process was necessary. A third-party script
-(https://github.com/nick-youngblut/gtdb_to_taxdump) was employed to convert the GTDB taxonomy to the
-NCBI-like taxdump format. We already preprared GTDB taxonomy dump file from different releases version
-and store in https://github.com/etetoolkit/ete-data/tree/main/gtdb_taxonomy.
Setting up local copies of the NCBI and GTDB taxonomy databases
--------------------------------------------------------------
-The first time you attempt to use NCBITaxa or GTDBTaxa, ETE will detect that your local
-database is empty and it will attempt to download the latest taxonomy database(NCBI ~600MB;GTDB ~72MB) and will
-store a parsed version of it in your home directory: ~/.local/share/ete/.
-All future imports of NCBITaxa or GTDBTaxa will detect the local database and will
-skip this step.
+---------------------------------------------------------------
+
+The first time you attempt to use NCBITaxa or GTDBTaxa, ETE will
+detect that your local database is empty and will attempt to download
+the latest taxonomy database (NCBI ~600MB, GTDB ~72MB) and will store
+a parsed version of it in `~/.local/share/ete/` by default. All future
+imports of NCBITaxa or GTDBTaxa will detect the local database and
+will skip this step.
Example::
+
# Load NCBI module
from ete4 import NCBITaxa
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()
-
+
# Load GTDB module
from ete4 import GTDBTaxa
gtdb = GTDBTaxa()
@@ -66,279 +80,280 @@ Example::
# latest release updated in https://github.com/dengzq1234/ete-data/tree/main/gtdb_taxonomy
gtdb.update_taxonomy_database()
- # or
- gtdb.update_taxonomy_database("gtdbdump.tar.gz")
+ # or
+ gtdb.update_taxonomy_database("gtdbdump.tar.gz")
# update with custom release 202
gtdb.update_taxonomy_database('gtdb202dump.tar.gz')
+
Getting taxid information
-------------------------
NCBI taxonomy
~~~~~~~~~~~~~
-you can fetch species names, ranks and linage track information for your taxids using the following
-methods:
-- NCBITaxa.get_rank()
-- NCBITaxa.get_lineage()
-- NCBITaxa.get_taxid_translator()
-- NCBITaxa.get_name_translator()
-- NCBITaxa.translate_to_names()
-The so called get-translator-functions will return a dictionary converting between taxids and species names.
-Either species or linage names/taxids are accepted as input.
+You can fetch species names, ranks and lineage track information for
+your taxids using the following methods:
+
+.. autosummary::
+
+ NCBITaxa.get_rank
+ NCBITaxa.get_lineage
+ NCBITaxa.get_taxid_translator
+ NCBITaxa.get_name_translator
+ NCBITaxa.translate_to_names
+
+The so-called get-translator functions will return a dictionary
+converting between taxids and species names. Either species or lineage
+names/taxids are accepted as input.
Example::
- from ete4 import NCBITaxa
- ncbi = NCBITaxa()
- taxid2name = ncbi.get_taxid_translator([9606, 9443])
- print(taxid2name)
- # {9443: 'Primates', 9606: 'Homo sapiens'}
- name2taxid = ncbi.get_name_translator(['Homo sapiens', 'primates'])
- print(name2taxid)
- # {'Homo sapiens': [9606], 'primates': [9443]}
+ from ete4 import NCBITaxa
+ ncbi = NCBITaxa()
+ taxid2name = ncbi.get_taxid_translator([9606, 9443])
+ print(taxid2name)
+ # {9443: 'Primates', 9606: 'Homo sapiens'}
- # when the same name points to several taxa, all taxids are returned
- name2taxid = ncbi.get_name_translator(['Bacteria'])
- print(name2taxid)
- # {'Bacteria': [2, 629395]}
+ name2taxid = ncbi.get_name_translator(['Homo sapiens', 'primates'])
+ print(name2taxid)
+ # {'Homo sapiens': [9606], 'primates': [9443]}
-Other functions allow to extract further information using taxid numbers as a query.
+ # when the same name points to several taxa, all taxids are returned
+ name2taxid = ncbi.get_name_translator(['Bacteria'])
+ print(name2taxid)
+ # {'Bacteria': [2, 629395]}
+
+Other functions allow you to extract further information using taxid
+numbers as a query.
Example::
- from ete4 import NCBITaxa
- ncbi = NCBITaxa()
- print(ncbi.get_rank([9606, 9443]))
- # {9443: u'order', 9606: u'species'}
+ from ete4 import NCBITaxa
+ ncbi = NCBITaxa()
- print(ncbi.get_lineage(9606))
- # [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742,
- # 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347,
- # 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605,
- # 9606]
+ print(ncbi.get_rank([9606, 9443]))
+ # {9443: 'order', 9606: 'species'}
-Combine combine all at once:
+ print(ncbi.get_lineage(9606))
+ # [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742,
+ # 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347,
+ # 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605,
+ # 9606]
-Example::
- from ete4 import NCBITaxa
- ncbi = NCBITaxa()
+Example combining all at once::
+
+ from ete4 import NCBITaxa
+ ncbi = NCBITaxa()
- lineage = ncbi.get_lineage(9606)
- print(lineage)
+ lineage = ncbi.get_lineage(9606)
+ print(lineage)
- # [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742,
- # 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347,
- # 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605,
- # 9606]
+ # [1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742,
+ # 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347,
+ # 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605,
+ # 9606]
- names = ncbi.get_taxid_translator(lineage)
- print([names[taxid] for taxid in lineage])
+ names = ncbi.get_taxid_translator(lineage)
+ print([names[taxid] for taxid in lineage])
- # [u'root', u'cellular organisms', u'Eukaryota', u'Opisthokonta', u'Metazoa',
- # u'Eumetazoa', u'Bilateria', u'Deuterostomia', u'Chordata', u'Craniata',
- # u'Vertebrata', u'Gnathostomata', u'Teleostomi', u'Euteleostomi',
- # u'Sarcopterygii', u'Dipnotetrapodomorpha', u'Tetrapoda', u'Amniota',
- # u'Mammalia', u'Theria', u'Eutheria', u'Boreoeutheria', u'Euarchontoglires',
- # u'Primates', u'Haplorrhini', u'Simiiformes', u'Catarrhini', u'Hominoidea',
- # u'Hominidae', u'Homininae', u'Homo', u'Homo sapiens']
+ # ['root', 'cellular organisms', 'Eukaryota', 'Opisthokonta', 'Metazoa',
+ # 'Eumetazoa', 'Bilateria', 'Deuterostomia', 'Chordata', 'Craniata',
+ # 'Vertebrata', 'Gnathostomata', 'Teleostomi', 'Euteleostomi',
+ # 'Sarcopterygii', 'Dipnotetrapodomorpha', 'Tetrapoda', 'Amniota',
+ # 'Mammalia', 'Theria', 'Eutheria', 'Boreoeutheria', 'Euarchontoglires',
+ # 'Primates', 'Haplorrhini', 'Simiiformes', 'Catarrhini', 'Hominoidea',
+ # 'Hominidae', 'Homininae', 'Homo', 'Homo sapiens']
GTDB taxonomy
~~~~~~~~~~~~~
-In the NCBI taxonomy database, each species is assigned a unique numeric taxid.
-For example, the taxid 9606 refers to Homo sapiens. These taxids serve as
+
+In the NCBI taxonomy database, each species is assigned a unique numeric taxid.
+For example, the taxid 9606 refers to Homo sapiens. These taxids serve as
essential keys for tracking lineages within the database.
-However, the GTDB database doesn't originally offer numeric taxids like NCBI
-does. In the GTDBTaxa module, we've introduced taxids for each species to
-facilitate lineage tracking. These taxids, while not officially recognized in
-the GTDB database, serve as convenient keys. They help in connecting the lineage
-and taxonomic ranks within the local database, making it easier for users to
+However, the GTDB database doesn't originally offer numeric taxids like NCBI
+does. In the GTDBTaxa module, we've introduced taxids for each species to
+facilitate lineage tracking. These taxids, while not officially recognized in
+the GTDB database, serve as convenient keys. They help in connecting the lineage
+and taxonomic ranks within the local database, making it easier for users to
fetch and relate taxonomic information.
Like NCBITaxa, GTDBTaxa contains similar methods:
-- GTDBTaxa.get_rank()
-- GTDBTaxa.get_lineage()
-- GTDBTaxa.get_taxid_translator()
-- GTDBTaxa.get_name_translator()
-- GTDBTaxa.translate_to_names()
-- GTDBTaxa.get_name_lineage()
+.. autosummary::
+
+ GTDBTaxa.get_rank
+ GTDBTaxa.get_lineage
+ GTDBTaxa.get_taxid_translator
+ GTDBTaxa.get_name_translator
+ GTDBTaxa.translate_to_names
+ GTDBTaxa.get_name_lineage
+
Getting descendant taxa
-----------------------
-Given a taxid or a taxa name from an internal node in the NCBI/GTDB taxonomy tree,
+
+Given a taxid or a taxa name from an internal node in the NCBI/GTDB taxonomy tree,
their descendants can be retrieved as follows:
-NCBI taxonomy
-Example::
- # example in NCBI taxonomy
- from ete4 import NCBITaxa
- ncbi = NCBITaxa()
+NCBI taxonomy example::
- descendants = ncbi.get_descendant_taxa('Homo')
- print(ncbi.translate_to_names(descendants))
+ from ete4 import NCBITaxa
+ ncbi = NCBITaxa()
- # [u'Homo heidelbergensis', u'Homo sapiens ssp. Denisova',
- # u'Homo sapiens neanderthalensis']
+ descendants = ncbi.get_descendant_taxa('Homo')
+ print(ncbi.translate_to_names(descendants))
- # you can easily ignore subspecies, so only taxa labeled as "species" will be reported:
- descendants = ncbi.get_descendant_taxa('Homo', collapse_subspecies=True)
- print(ncbi.translate_to_names(descendants))
+ # ['Homo heidelbergensis', 'Homo sapiens ssp. Denisova',
+ # 'Homo sapiens neanderthalensis']
- # [u'Homo sapiens', u'Homo heidelbergensis']
+ # You can easily ignore subspecies, so only taxa labeled as "species" will be reported:
+ descendants = ncbi.get_descendant_taxa('Homo', collapse_subspecies=True)
+ print(ncbi.translate_to_names(descendants))
- # or even returned as an annotated tree
- tree = ncbi.get_descendant_taxa('Homo', collapse_subspecies=True, return_tree=True)
+ # ['Homo sapiens', 'Homo heidelbergensis']
- print(tree.to_str(props=['sci_name','taxid']))
+ # or even returned as an annotated tree
+ tree = ncbi.get_descendant_taxa('Homo', collapse_subspecies=True, return_tree=True)
+
+ print(tree.to_str(props=['sci_name','taxid']))
+ # ╭╴environmental samples,2665952╶╌╴Homo sapiens environmental sample,2665953
+ # │
+ # ├╴Homo sapiens,9606
+ # ╴Homo,9605╶┤
+ # ├╴Homo heidelbergensis,1425170
+ # │
+ # ╰╴unclassified Homo,2813598╶╌╴Homo sp.,2813599
+
+GTDB taxonomy example::
+
+ from ete4 import GTDBTaxa
+ gtdb = GTDBTaxa()
+ descendants = gtdb.get_descendant_taxa('f__Thorarchaeaceae')
+ print(descendants)
+ # ['GB_GCA_003662765.1', 'GB_GCA_003662805.1', ..., 'GB_GCA_013138615.1']
+
+ # Ignore subspecies, so only taxa labeled as "species" will be reported.
+ descendants = gtdb.get_descendant_taxa('f__Thorarchaeaceae', collapse_subspecies=True)
+ print(descendants)
+ # ['s__MP8T-1 sp002825535', 's__MP8T-1 sp003345545', ..., 's__TEKIR-12S sp004524435']
+
+ # Returned as an annotated tree.
+ descendants = gtdb.get_descendant_taxa('f__Thorarchaeaceae', collapse_subspecies=True, return_tree=True)
+ print(descendants.to_str(props=['sci_name','rank']))
+ # ╭╴s__MP8T-1 sp002825535,species
+ # │
+ # ├╴s__MP8T-1 sp003345545,species
+ # │
+ # ╭╴g__MP8T-1,genus╶┼╴s__MP8T-1 sp002825465,species
+ # │ │
+ # │ ├╴s__MP8T-1 sp004524565,species
+ # │ │
+ # │ ╰╴s__MP8T-1 sp004524595,species
+ # │
+ # │ ╭╴s__SMTZ1-83 sp011364985,species
+ # │ │
+ # ├╴g__SMTZ1-83,genus╶┼╴s__SMTZ1-83 sp011365025,species
+ # │ │
+ # │ ╰╴s__SMTZ1-83 sp001563325,species
+ # │
+ # ├╴g__TEKIR-14,genus╶╌╴s__TEKIR-14 sp004524445,species
+ # │
+ # ├╴g__SHMX01,genus╶╌╴s__SHMX01 sp008080745,species
+ # │
+ # │ ╭╴s__OWC5 sp003345595,species
+ # ├╴g__OWC5,genus╶┤
+ # ╴f__Tho[...]╶┤ ╰╴s__OWC5 sp003345555,species
+ # │
+ # ├╴g__JACAEL01,genus╶╌╴s__JACAEL01 sp013388835,species
+ # │
+ # ├╴g__B65-G9,genus╶╌╴s__B65-G9 sp003662765,species
+ # │
+ # │ ╭╴s__SMTZ1-45 sp001563335,species
+ # │ │
+ # │ ├╴s__SMTZ1-45 sp011364905,species
+ # │ │
+ # ├╴g__SMTZ1-45,genus╶┼╴s__SMTZ1-45 sp001940705,species
+ # │ │
+ # │ ├╴s__SMTZ1-45 sp004376265,species
+ # │ │
+ # │ ╰╴s__SMTZ1-45 sp002825515,species
+ # │
+ # ├╴g__WTCK01,genus╶╌╴s__WTCK01 sp013138615,species
+ # │
+ # ╰╴g__TEKIR-12S,genus╶╌╴s__TEKIR-12S sp004524435,species
- """
- ╭╴environmental samples,2665952╶╌╴Homo sapiens environmental sample,2665953
- │
- ├╴Homo sapiens,9606
- ╴Homo,9605╶┤
- ├╴Homo heidelbergensis,1425170
- │
- ╰╴unclassified Homo,2813598╶╌╴Homo sp.,2813599
- """
-
-GTDB taxonomy
-Example::
- from ete4 import GTDBTaxa
- gtdb = GTDBTaxa()
- descendants = gtdb.get_descendant_taxa('f__Thorarchaeaceae')
- print(descendants)
- # ['GB_GCA_003662765.1', 'GB_GCA_003662805.1', 'GB_GCA_003345555.1', 'GB_GCA_003345595.1', 'GB_GCA_001940705.1', 'GB_GCA_001563335.1', 'GB_GCA_011364905.1', 'GB_GCA_004376265.1', 'GB_GCA_002825515.1', 'GB_GCA_001563325.1', 'GB_GCA_011364985.1', 'GB_GCA_011365025.1', 'GB_GCA_004524565.1', 'GB_GCA_004524595.1', 'GB_GCA_002825465.1', 'GB_GCA_002825535.1', 'GB_GCA_003345545.1', 'GB_GCA_004524445.1', 'GB_GCA_013388835.1', 'GB_GCA_008080745.1', 'GB_GCA_004524435.1', 'GB_GCA_013138615.1']
-
- #ignore subspecies, so only taxa labeled as "species" will be reported
- descendants = gtdb.get_descendant_taxa('f__Thorarchaeaceae', collapse_subspecies=True)
-
- print(descendants)
-
- #['s__MP8T-1 sp002825535', 's__MP8T-1 sp003345545', 's__MP8T-1 sp002825465', 's__MP8T-1 sp004524565', 's__MP8T-1 sp004524595', 's__SMTZ1-83 sp011364985', 's__SMTZ1-83 sp011365025', 's__SMTZ1-83 sp001563325', 's__TEKIR-14 sp004524445', 's__SHMX01 sp008080745', 's__OWC5 sp003345595', 's__OWC5 sp003345555', 's__JACAEL01 sp013388835', 's__B65-G9 sp003662765', 's__SMTZ1-45 sp001563335', 's__SMTZ1-45 sp011364905', 's__SMTZ1-45 sp001940705', 's__SMTZ1-45 sp004376265', 's__SMTZ1-45 sp002825515', 's__WTCK01 sp013138615', 's__TEKIR-12S sp004524435']
-
-
- #returned as an annotated tree
- descendants = gtdb.get_descendant_taxa('f__Thorarchaeaceae', collapse_subspecies=True, return_tree=True)
- print(descendants.to_str(props=['sci_name','rank']))
- """
- ╭╴s__MP8T-1 sp002825535,species
- │
- ├╴s__MP8T-1 sp003345545,species
- │
- ╭╴g__MP8T-1,genus╶┼╴s__MP8T-1 sp002825465,species
- │ │
- │ ├╴s__MP8T-1 sp004524565,species
- │ │
- │ ╰╴s__MP8T-1 sp004524595,species
- │
- │ ╭╴s__SMTZ1-83 sp011364985,species
- │ │
- ├╴g__SMTZ1-83,genus╶┼╴s__SMTZ1-83 sp011365025,species
- │ │
- │ ╰╴s__SMTZ1-83 sp001563325,species
- │
- ├╴g__TEKIR-14,genus╶╌╴s__TEKIR-14 sp004524445,species
- │
- ├╴g__SHMX01,genus╶╌╴s__SHMX01 sp008080745,species
- │
- │ ╭╴s__OWC5 sp003345595,species
- ├╴g__OWC5,genus╶┤
- ╴f__Thorarchaeaceae,family╶┤ ╰╴s__OWC5 sp003345555,species
- │
- ├╴g__JACAEL01,genus╶╌╴s__JACAEL01 sp013388835,species
- │
- ├╴g__B65-G9,genus╶╌╴s__B65-G9 sp003662765,species
- │
- │ ╭╴s__SMTZ1-45 sp001563335,species
- │ │
- │ ├╴s__SMTZ1-45 sp011364905,species
- │ │
- ├╴g__SMTZ1-45,genus╶┼╴s__SMTZ1-45 sp001940705,species
- │ │
- │ ├╴s__SMTZ1-45 sp004376265,species
- │ │
- │ ╰╴s__SMTZ1-45 sp002825515,species
- │
- ├╴g__WTCK01,genus╶╌╴s__WTCK01 sp013138615,species
- │
- ╰╴g__TEKIR-12S,genus╶╌╴s__TEKIR-12S sp004524435,species
- """
Getting species tree topology
-----------------------------------
-Getting the taxonomy tree for a given set of species is one of the most useful ways
-to get all information at once. The method NCBITaxa.get_topology() or GTDBTaxa.get_topology() allows to query your
-local NCBI/GTDB database and extract the smallest tree that connects all your query taxids.
-It returns a normal ETE tree in which all nodes, internal or leaves, are annotated for
-lineage, scientific names, ranks, and so on.
+-----------------------------
-NCBI taxonomy
-Example::
- from ete4 import NCBITaxa
- ncbi = NCBITaxa()
-
- tree = ncbi.get_topology([9606, 9598, 10090, 7707, 8782])
-
- print(tree.to_str(props=["sci_name", "rank"]))
- """
- ╭╴Dendrochirotida,order
- │
- │ ╭╴Homo sapiens,species
- ╴Deuterostomia,clade╶┤ ╭╴Homininae,subfamily╶┤
- │ ╭╴Euarchontoglires,superorder╶┤ ╰╴Pan troglodytes,species
- │ │ │
- ╰╴Amniota,clade╶┤ ╰╴Mus musculus,species
- │
- ╰╴Aves,class
- """
-
- # all intermediate nodes connecting the species can also be kept in the tree
- tree = ncbi.get_topology([2, 33208], intermediate_nodes=True)
- print(tree.to_str(props=["sci_name"]))
- """
- ╭╴Eukaryota╶╌╴Opisthokonta╶╌╴Metazoa
- ╴cellular organisms╶┤
- ╰╴Bacteria
- """
+Getting the taxonomy tree for a given set of species is one of the
+most useful ways to get all information at once. The methods
+:func:`NCBITaxa.get_topology` or :func:`GTDBTaxa.get_topology` allow
+you to query your local NCBI/GTDB database and extract the smallest tree
+that connects all your query taxids. It returns a normal ETE tree in
+which all nodes, internal or leaves, are annotated for lineage,
+scientific names, ranks, and so on.
+
+NCBI taxonomy example::
+
+ from ete4 import NCBITaxa
+ ncbi = NCBITaxa()
+
+ tree = ncbi.get_topology([9606, 9598, 10090, 7707, 8782])
+
+ print(tree.to_str(props=["sci_name", "rank"]))
+ # ╭╴Dendrochirotida,order
+ # │
+ # │ ╭╴Homo sapiens,species
+ # ╴Deuterostomia,clade╶┤ ╭╴Homininae,subfamily╶┤
+ # │ ╭╴Euarchontoglires,superorder╶┤ ╰╴Pan troglodytes,species
+ # │ │ │
+ # ╰╴Amniota,clade╶┤ ╰╴Mus musculus,species
+ # │
+ # ╰╴Aves,class
+
+ # All intermediate nodes connecting the species can also be kept in the tree.
+ tree = ncbi.get_topology([2, 33208], intermediate_nodes=True)
+ print(tree.to_str(props=["sci_name"]))
+ # ╭╴Eukaryota╶╌╴Opisthokonta╶╌╴Metazoa
+ # ╴cellular organisms╶┤
+ # ╰╴Bacteria
+
+GTDB taxonomy example::
+
+ from ete4 import GTDBTaxa
+ gtdb = GTDBTaxa()
+
+ tree = gtdb.get_topology(["p__Huberarchaeota", "o__Peptococcales", "f__Korarchaeaceae"])
+ print(tree.to_str(props=['sci_name', 'rank']))
+ # ╭╴p__Huberarchaeota,phylum
+ # ╭╴d__Archaea,superkingdom╶┤
+ # ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
+ # │
+ # ╰╴o__Peptococcales,order
+
+ # All intermediate nodes connecting the species can also be kept in the tree.
+ tree = gtdb.get_topology(["p__Huberarchaeota", "o__Peptococcales", "f__Korarchaeaceae"], intermediate_nodes=True, collapse_subspecies=True, annotate=True)
+ print(tree.to_str(props=['sci_name', 'rank']))
+ # ╭╴p__Huberarchaeota,phylum
+ # ╭╴d__Archaea,superkingdom╶┤
+ # ╴root,no rank╶┤ ╰╴p__Thermoproteota,phylum╶╌╴c__Korarchaeia,class╶╌╴o__Korarchaeales,order╶╌╴f__Korarchaeaceae,family
+ # │
+ # ╰╴d__Bacteria,superkingdom╶╌╴p__Firmicutes_B,phylum╶╌╴c__Peptococcia,class╶╌╴o__Peptococcales,order
-GTDB taxonomy
-Example::
- from ete4 import GTDBTaxa
- gtdb = GTDBTaxa()
-
- tree = gtdb.get_topology(["p__Huberarchaeota", "o__Peptococcales", "f__Korarchaeaceae"])
- print(tree.to_str(props=['sci_name', 'rank']))
-
- """
- ╭╴p__Huberarchaeota,phylum
- ╭╴d__Archaea,superkingdom╶┤
- ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
- │
- ╰╴o__Peptococcales,order
-
- """
-
- # all intermediate nodes connecting the species can also be kept in the tree
- tree = gtdb.get_topology(["p__Huberarchaeota", "o__Peptococcales", "f__Korarchaeaceae"], intermediate_nodes=True, collapse_subspecies=True, annotate=True)
- print(tree.to_str(props=['sci_name', 'rank']))
- """
- ╭╴p__Huberarchaeota,phylum
- ╭╴d__Archaea,superkingdom╶┤
- ╴root,no rank╶┤ ╰╴p__Thermoproteota,phylum╶╌╴c__Korarchaeia,class╶╌╴o__Korarchaeales,order╶╌╴f__Korarchaeaceae,family
- │
- ╰╴d__Bacteria,superkingdom╶╌╴p__Firmicutes_B,phylum╶╌╴c__Peptococcia,class╶╌╴o__Peptococcales,order
- """
Automatic tree annotation using NCBI/GTDB taxonomy
----------------------------------------------
-NCBI/GTDB taxonomy annotation consists of adding additional information to any internal a leaf node
-in a give user tree. Only an property containing the taxid associated to each node
-is required for the nodes in the query tree. The annotation process will add the
-following features to the nodes:
+--------------------------------------------------
+
+NCBI/GTDB taxonomy annotation consists of adding additional
+information to any internal or leaf node in a tree. Only a property
+containing the taxid associated to each node is required for the nodes
+in the query tree. The annotation process will add the following
+features to the nodes:
- sci_name
- taxid
@@ -346,53 +361,50 @@ following features to the nodes:
- lineage
- rank
-Note that, for internal nodes, taxid can be automatically inferred based on their sibling
-nodes. The easiest way to annotate a tree is to use a PhyloTree instance where the species
-name attribute is transparently used as the taxid attribute. Note that
-the :PhyloNode:`annotate_ncbi_taxa`: or :PhyloNode:`annotate_gtdb_taxa`: function will also return the used name, lineage and
-rank translators.
+Note that, for internal nodes, taxid can be automatically inferred
+based on their sibling nodes. The easiest way to annotate a tree is to
+use a PhyloTree instance where the species name attribute is
+transparently used as the taxid attribute. Note that the
+:func:`~PhyloTree.annotate_ncbi_taxa` or
+:func:`~PhyloTree.annotate_gtdb_taxa` function will also return the
+used name, lineage and rank translators.
-Remember that species names in PhyloTree instances are automatically extracted from leaf names. The parsing method can be easily adapted to any formatting:
+Remember that species names in PhyloTree instances are automatically
+extracted from leaf names. The parsing method can be easily adapted to
+any formatting:
-NCBI taxonomy
-Example::
- from ete4 import PhyloTree
+NCBI taxonomy example::
- # load the whole leaf name as species taxid
- tree = PhyloTree('((9606, 9598), 10090);', sp_naming_function=lambda name: name)
- tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
+ from ete4 import PhyloTree
+ # Load the whole leaf name as species taxid.
+ tree = PhyloTree('((9606, 9598), 10090);', sp_naming_function=lambda name: name)
+ tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
- # split names by '|' and return the first part as the species taxid
- tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);', sp_naming_function=lambda name: name.split('|')[0])
- tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
+ # Split names by '|' and return the first part as the species taxid.
+ tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);', sp_naming_function=lambda name: name.split('|')[0])
+ tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
- print(tree.to_str(props=["name", "sci_name", "taxid"]))
+ print(tree.to_str(props=["name", "sci_name", "taxid"]))
+ # ╭╴9606|protA,Homo sapiens,9606
+ # ╭╴(empty),Homininae,207598╶┤
+ # ╴(empty),Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598
+ # │
+ # ╰╴10090|protB,Mus musculus,10090
- """
- ╭╴9606|protA,Homo sapiens,9606
- ╭╴(empty),Homininae,207598╶┤
- ╴(empty),Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598
- │
- ╰╴10090|protB,Mus musculus,10090
- """
-
-GTDB taxonomy
-Example::
- from ete4 import PhyloTree
+GTDB taxonomy example::
- # load the whole leaf name as species taxid
- newick = '((p__Huberarchaeota,f__Korarchaeaceae)d__Archaea,o__Peptococcales);'
+ from ete4 import PhyloTree
- tree= PhyloTree(newick)
- tax2name, tax2track, tax2rank = gtdb.annotate_tree(tree, taxid_attr="name")
+ # Load the whole leaf name as species taxid.
+ newick = '((p__Huberarchaeota,f__Korarchaeaceae)d__Archaea,o__Peptococcales);'
- print(tree.to_str(props=['sci_name', 'rank']))
+ tree = PhyloTree(newick)
+ tax2name, tax2track, tax2rank = gtdb.annotate_tree(tree, taxid_attr="name")
- """
- ╭╴p__Huberarchaeota,phylum
- ╭╴d__Archaea,superkingdom╶┤
- ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
- │
- ╰╴o__Peptococcales,order
- """
\ No newline at end of file
+ print(tree.to_str(props=['sci_name', 'rank']))
+ # ╭╴p__Huberarchaeota,phylum
+ # ╭╴d__Archaea,superkingdom╶┤
+ # ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
+ # │
+ # ╰╴o__Peptococcales,order
diff --git a/genindex.html b/genindex.html
index 45b5d0d26..bb1482316 100644
--- a/genindex.html
+++ b/genindex.html
@@ -85,6 +85,8 @@
Returns a list of GTDB lineage names that are not monophyletic in the
+provided tree, as well as the list of affected branches and their size.
+CURRENTLY EXPERIMENTAL
Given a parent taxid or scientific species name, returns a list of all its descendant taxids.
+If intermediate_nodes is set to True, internal nodes will also be dumped.
Given a list of taxid scientific names, returns a dictionary translating them into their corresponding taxids.
+Exact name match is required for translation.
Return minimal pruned GTDB taxonomy tree containing all given taxids.
+
+
Parameters:
+
+
intermediate_nodes – If True, single child nodes
+representing the complete lineage of leaf nodes are kept.
+Otherwise, the tree is pruned to contain the first common
+ancestor of each group.
+
rank_limit – If a valid NCBI rank name is provided, the
+tree is pruned at that given level. For instance, use
+rank_limit="species" to get rid of sub-species or strain leaf
+nodes.
+
collapse_subspecies – If True, any item under the
+species rank will be collapsed into the species upper
+node.
ETE4 contains ncbi_taxonomy and gtdb_taxonomy modules which provide
-utilities to efficiently query a local copy of the NCBI or GTDB taxonomy
-databases. The class NCBITaxa and GTDBTaxa offer methods to convert
-from taxid to names (and vice versa), to fetch pruned topologies connecting
-a given set of species, or to download rank, names and lineage track information.
-
It is also fully integrated with PhyloTree instances through the
-PhyloNode.annotate_ncbi_taxa() and ``PhyloNode.annotate_gtdb_taxa()``method.
ETE4 contains the ncbi_taxonomy and gtdb_taxonomy modules which
+provide utilities to efficiently query a local copy of the NCBI or
+GTDB taxonomy databases. The classes NCBITaxa and
+GTDBTaxa offer methods to convert from taxid to names (and
+vice versa), to fetch pruned topologies connecting a given set of
+species, or to download rank, names and lineage track information.
The NCBI taxonomy database is a comprehensive resource for organism names and
-classifications.It is updated daily and offers multiple access points including a web
-portal, an FTP server. The database releases its data in a package called “taxdump.tar.gz” which
-contains several .dmp files.
-
Taxon in NCBI taxonomyis usually a numeric identifier, commonly representing
-taxa (“TaxID”), but it can also signify other entities like genetic codes or citations, such as
-9606 represents Homo Sapiens.
-
On the other hand, GTDB taxonomy is distributed as simple text files, uses a genome-based
-approach for classification, and the identifiers are usually specific to genomes rather
-than taxa.
-
Since ETE Toolkit version 3, ete3 parses taxdump file to local sqlite database to fullfill the
-methods in ncbi_taxonomy module. We applied the same strategy to GTDBTaxa. While the original GTDB
-taxonomy data differs from NCBI taxonomy files, a conversion step is essential for integration.
The NCBI taxonomy database is a comprehensive resource for organism
+names and classifications. It is updated daily and offers multiple
+access points, including a web portal and an FTP server. The database
+releases its data in a package called “taxdump.tar.gz” which contains
+several .dmp files.
+
A taxon in the NCBI taxonomy is usually a numeric identifier, commonly
+representing taxa (“TaxID”), but it can also signify other entities
+like genetic codes or citations. For example, 9606 represents Homo sapiens.
+
On the other hand, GTDB taxonomy is distributed as simple text files,
+uses a genome-based approach for classification, and the identifiers
+are usually specific to genomes rather than taxa.
+
Since ETE Toolkit version 3, ETE parses the taxdump file and stores it in
+a local sqlite database to fulfill the methods in the ncbi_taxonomy
+module. We applied the same strategy to GTDBTaxa. While the original
+GTDB taxonomy data differs from NCBI taxonomy files, a conversion step
+is essential for integration.
The first time you attempt to use NCBITaxa or GTDBTaxa, ETE will detect that your local
-database is empty and it will attempt to download the latest taxonomy database(NCBI ~600MB;GTDB ~72MB) and will
-store a parsed version of it in your home directory: ~/.local/share/ete/.
-All future imports of NCBITaxa or GTDBTaxa will detect the local database and will
-skip this step.
The first time you attempt to use NCBITaxa or GTDBTaxa, ETE will
+detect that your local database is empty and will attempt to download
+the latest taxonomy database (NCBI ~600MB, GTDB ~72MB) and will store
+a parsed version of it in ~/.local/share/ete/ by default. All future
+imports of NCBITaxa or GTDBTaxa will detect the local database and
+will skip this step.
+
Example:
+
# Load NCBI module
+from ete4 import NCBITaxa
+ncbi = NCBITaxa()
+ncbi.update_taxonomy_database()
+
+# Load GTDB module
+from ete4 import GTDBTaxa
+gtdb = GTDBTaxa()
+gtdb.update_taxonomy_database()
+
+# Load GTDB module with specific release version
+from ete4 import GTDBTaxa
+gtdb = GTDBTaxa()
+
+# latest release updated in https://github.com/dengzq1234/ete-data/tree/main/gtdb_taxonomy
+gtdb.update_taxonomy_database()
+# or
+gtdb.update_taxonomy_database("gtdbdump.tar.gz")
+
+# update with custom release 202
+gtdb.update_taxonomy_database('gtdb202dump.tar.gz')
+
you can fetch species names, ranks and linage track information for your taxids using the following
-methods:
-
-
NCBITaxa.get_rank()
-
NCBITaxa.get_lineage()
-
NCBITaxa.get_taxid_translator()
-
NCBITaxa.get_name_translator()
-
NCBITaxa.translate_to_names()
-
-
The so called get-translator-functions will return a dictionary converting between taxids and species names.
-Either species or linage names/taxids are accepted as input.
# when the same name points to several taxa, all taxids are returned
-name2taxid = ncbi.get_name_translator([‘Bacteria’])
-print(name2taxid)
-# {‘Bacteria’: [2, 629395]}
-
-
-
Other functions allow to extract further information using taxid numbers as a query.
Return list of scientific names corresponding to taxids.
+
+
+
+
The so-called get-translator functions will return a dictionary
+converting between taxids and species names. Either species or lineage
+names/taxids are accepted as input.
+
Example:
+
from ete4 import NCBITaxa
+ncbi = NCBITaxa()
+taxid2name = ncbi.get_taxid_translator([9606, 9443])
+print(taxid2name)
+# {9443: 'Primates', 9606: 'Homo sapiens'}
+
+name2taxid = ncbi.get_name_translator(['Homo sapiens', 'primates'])
+print(name2taxid)
+# {'Homo sapiens': [9606], 'primates': [9443]}
+
+# when the same name points to several taxa, all taxids are returned
+name2taxid = ncbi.get_name_translator(['Bacteria'])
+print(name2taxid)
+# {'Bacteria': [2, 629395]}
+
+
+
Other functions allow you to extract further information using taxid
+numbers as a query.
In the NCBI taxonomy database, each species is assigned a unique numeric taxid.
For example, the taxid 9606 refers to Homo sapiens. These taxids serve as
essential keys for tracking lineages within the database.
Getting the taxonomy tree for a given set of species is one of the most useful ways
-to get all information at once. The method NCBITaxa.get_topology() or GTDBTaxa.get_topology() allows to query your
-local NCBI/GTDB database and extract the smallest tree that connects all your query taxids.
-It returns a normal ETE tree in which all nodes, internal or leaves, are annotated for
-lineage, scientific names, ranks, and so on.
Getting the taxonomy tree for a given set of species is one of the
+most useful ways to get all information at once. The methods
+NCBITaxa.get_topology() or GTDBTaxa.get_topology() allow
+you to query your local NCBI/GTDB database and extract the smallest tree
+that connects all your query taxids. It returns a normal ETE tree in
+which all nodes, internal or leaves, are annotated for lineage,
+scientific names, ranks, and so on.
+
NCBI taxonomy example:
from ete4 import NCBITaxa
ncbi = NCBITaxa()

tree = ncbi.get_topology([9606, 9598, 10090, 7707, 8782])

print(tree.to_str(props=["sci_name", "rank"]))
-"""
- ╭╴Dendrochirotida,order
- │
- │ ╭╴Homo sapiens,species
-╴Deuterostomia,clade╶┤ ╭╴Homininae,subfamily╶┤
- │ ╭╴Euarchontoglires,superorder╶┤ ╰╴Pan troglodytes,species
- │ │ │
- ╰╴Amniota,clade╶┤ ╰╴Mus musculus,species
- │
- ╰╴Aves,class
-"""
-
-# all intermediate nodes connecting the species can also be kept in the tree
+# ╭╴Dendrochirotida,order
+# │
+# │ ╭╴Homo sapiens,species
+# ╴Deuterostomia,clade╶┤ ╭╴Homininae,subfamily╶┤
+# │ ╭╴Euarchontoglires,superorder╶┤ ╰╴Pan troglodytes,species
+# │ │ │
+# ╰╴Amniota,clade╶┤ ╰╴Mus musculus,species
+# │
+# ╰╴Aves,class
+
+# All intermediate nodes connecting the species can also be kept in the tree.
tree = ncbi.get_topology([2, 33208], intermediate_nodes=True)
print(tree.to_str(props=["sci_name"]))
-"""
- ╭╴Eukaryota╶╌╴Opisthokonta╶╌╴Metazoa
-╴cellular organisms╶┤
- ╰╴Bacteria
-"""
+# ╭╴Eukaryota╶╌╴Opisthokonta╶╌╴Metazoa
+# ╴cellular organisms╶┤
+# ╰╴Bacteria
-
GTDB taxonomy
-Example:
+
GTDB taxonomy example:
from ete4 import GTDBTaxa
gtdb = GTDBTaxa()

tree = gtdb.get_topology(["p__Huberarchaeota", "o__Peptococcales", "f__Korarchaeaceae"])
print(tree.to_str(props=['sci_name', 'rank']))
+# ╭╴p__Huberarchaeota,phylum
+# ╭╴d__Archaea,superkingdom╶┤
+# ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
+# │
+# ╰╴o__Peptococcales,order
-"""
- ╭╴p__Huberarchaeota,phylum
- ╭╴d__Archaea,superkingdom╶┤
-╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
- │
- ╰╴o__Peptococcales,order
-
-"""
-
-# all intermediate nodes connecting the species can also be kept in the tree
+# All intermediate nodes connecting the species can also be kept in the tree.
tree = gtdb.get_topology(["p__Huberarchaeota", "o__Peptococcales", "f__Korarchaeaceae"], intermediate_nodes=True, collapse_subspecies=True, annotate=True)
print(tree.to_str(props=['sci_name', 'rank']))
-"""
- ╭╴p__Huberarchaeota,phylum
- ╭╴d__Archaea,superkingdom╶┤
-╴root,no rank╶┤ ╰╴p__Thermoproteota,phylum╶╌╴c__Korarchaeia,class╶╌╴o__Korarchaeales,order╶╌╴f__Korarchaeaceae,family
- │
- ╰╴d__Bacteria,superkingdom╶╌╴p__Firmicutes_B,phylum╶╌╴c__Peptococcia,class╶╌╴o__Peptococcales,order
-"""
+# ╭╴p__Huberarchaeota,phylum
+# ╭╴d__Archaea,superkingdom╶┤
+# ╴root,no rank╶┤ ╰╴p__Thermoproteota,phylum╶╌╴c__Korarchaeia,class╶╌╴o__Korarchaeales,order╶╌╴f__Korarchaeaceae,family
+# │
+# ╰╴d__Bacteria,superkingdom╶╌╴p__Firmicutes_B,phylum╶╌╴c__Peptococcia,class╶╌╴o__Peptococcales,order
NCBI/GTDB taxonomy annotation consists of adding additional information to any internal a leaf node
-in a give user tree. Only an property containing the taxid associated to each node
-is required for the nodes in the query tree. The annotation process will add the
-following features to the nodes:
NCBI/GTDB taxonomy annotation consists of adding additional
+information to any internal or leaf node in a tree. Only a property
+containing the taxid associated to each node is required for the nodes
+in the query tree. The annotation process will add the following
+features to the nodes:
Note that, for internal nodes, taxid can be automatically inferred based on their sibling
-nodes. The easiest way to annotate a tree is to use a PhyloTree instance where the species
-name attribute is transparently used as the taxid attribute. Note that
-the :PhyloNode:`annotate_ncbi_taxa`: or :PhyloNode:`annotate_gtdb_taxa`: function will also return the used name, lineage and
-rank translators.
-
Remember that species names in PhyloTree instances are automatically extracted from leaf names. The parsing method can be easily adapted to any formatting:
-
NCBI taxonomy
-Example:
+
Note that, for internal nodes, taxid can be automatically inferred
+based on their sibling nodes. The easiest way to annotate a tree is to
+use a PhyloTree instance where the species name attribute is
+transparently used as the taxid attribute. Note that the
+annotate_ncbi_taxa() or
+annotate_gtdb_taxa() function will also return the
+used name, lineage and rank translators.
+
Remember that species names in PhyloTree instances are automatically
+extracted from leaf names. The parsing method can be easily adapted to
+any formatting:
+
NCBI taxonomy example:
from ete4 import PhyloTree
-# load the whole leaf name as species taxid
+# Load the whole leaf name as species taxid.
tree = PhyloTree('((9606, 9598), 10090);',
                 sp_naming_function=lambda name: name)
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
-
-# split names by '|' and return the first part as the species taxid
+# Split names by '|' and return the first part as the species taxid.
tree = PhyloTree('((9606|protA, 9598|protA), 10090|protB);',
                 sp_naming_function=lambda name: name.split('|')[0])
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
print(tree.to_str(props=["name", "sci_name", "taxid"]))
-
-"""
- ╭╴9606|protA,Homo sapiens,9606
- ╭╴(empty),Homininae,207598╶┤
-╴(empty),Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598
- │
- ╰╴10090|protB,Mus musculus,10090
-"""
+# ╭╴9606|protA,Homo sapiens,9606
+# ╭╴(empty),Homininae,207598╶┤
+# ╴(empty),Euarchontoglires,314146╶┤ ╰╴9598|protA,Pan troglodytes,9598
+# │
+# ╰╴10090|protB,Mus musculus,10090
-
GTDB taxonomy
-Example:
+
GTDB taxonomy example:
from ete4 import PhyloTree
-# load the whole leaf name as species taxid
+# Load the whole leaf name as species taxid.
newick = '((p__Huberarchaeota,f__Korarchaeaceae)d__Archaea,o__Peptococcales);'
-tree=PhyloTree(newick)
+tree = PhyloTree(newick)
tax2name, tax2track, tax2rank = gtdb.annotate_tree(tree, taxid_attr="name")
print(tree.to_str(props=['sci_name', 'rank']))
-
-"""
- ╭╴p__Huberarchaeota,phylum
- ╭╴d__Archaea,superkingdom╶┤
-╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
- │
- ╰╴o__Peptococcales,order
-"""
+# ╭╴p__Huberarchaeota,phylum
+# ╭╴d__Archaea,superkingdom╶┤
+# ╴root,no rank╶┤ ╰╴f__Korarchaeaceae,family
+# │
+# ╰╴o__Peptococcales,order