diff --git a/README.md b/README.md index 2305921..9d91904 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ with open('sample_data/toy_trees.p', 'rb') as fh: ete_trees = pickle.load(fh) dag = hdag.history_dag_from_etes(ete_trees, ['sequence']) -dag.count_trees() # 1041 +dag.count_histories() # 1041 -dag.add_all_allowed_edges() -dag.count_trees() # 3431531 +dag.make_complete() +dag.count_histories() # 3431531 dag.hamming_parsimony_count() # Shows counts of trees of different parsimony scores dag.trim_optimal_weight() @@ -59,7 +59,7 @@ trees = [tree for tree in dag] # Another method for fetching all trees in the dag is provided, but the order # will not match index order: -scrambled_trees = list(dag.get_trees()) +scrambled_trees = list(dag.get_histories()) # Union is implemented as dag merging, including with sequences of dags @@ -74,7 +74,7 @@ newdag = dag[0] | (dag[i] for i in range(3,5)) * `history_dag_from_newicks` * `history_dag_from_etes` * Trees can be extracted from the history DAG with methods like - * `HistoryDag.get_trees` + * `HistoryDag.get_histories` * `HistoryDag.sample` * `HistoryDag.to_ete` * `HistoryDag.to_newick` and `HistoryDag.to_newicks` diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 30058ac..d02b115 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -11,15 +11,16 @@ A history DAG is a way to represent a collection of trees whose nodes sequence. In its simplest form, a history DAG may represent a single tree. To construct -such a history DAG from a tree, we annotate each node in the tree with its child clades. -The **clade** beneath a tree node is the set of leaf node labels +such a history DAG from a tree, we annotate each node in the tree with its +child clades. The **clade** beneath a tree node is the set of leaf node labels reachable from that node, or the set containing the node's own label if it is -itself a leaf. The **child clades** of a node are the set of clades -beneath that node's children. +itself a leaf. We also refer to this set as a node's **clade union**, since it +is the union of the node's child clades. The **child clades** of a node are the +set of clades beneath that node's children. After annotating each node with its child clades, a **UA (universal ancestor) node** is added as a parent of the original tree's root node. The resulting -structure is an example of a history DAG which we call a **clade tree**: +structure is an example of a history DAG which we call a **history**: |pic1| -> |pic2| @@ -35,13 +36,13 @@ parent node associated to an edge, must be the same as the clade below the child node that the edge targets. After converting multiple trees with the same set of leaf labels to clade -trees, those clade trees can be unioned to create a history DAG that represents +trees, those histories can be unioned to create a history DAG that represents at least those trees used to create it. Any structure in the resulting history DAG which contains the UA node and all leaves, and has exactly one edge for -each node-child clade pair, is a clade tree. Clade trees represent labeled +each node-child clade pair, is a history. Histories represent labeled trees by the inverse of the correspondence introduced above: -For example, the clade tree highlighted in red in this image: +For example, the history highlighted in red in this image: .. image:: figures/history_dag_example.svg :width: 100% @@ -105,7 +106,7 @@ Now, we will create a history DAG using the ``sequence`` attribute as the data for node labels: >>> dag = hdag.history_dag_from_etes(ete_trees, ['sequence']) ->>> dag.count_trees() +>>> dag.count_histories() 1041 >>> dag.count_topologies() 389 @@ -132,9 +133,9 @@ of ``explode_nodes``). We can find even more new trees by adding all edges which connect nodes whose child clades are compatible: ->>> dag.add_all_allowed_edges() +>>> dag.make_complete() 1048 ->>> dag.count_trees() +>>> dag.count_histories() 3431531 After such edge additions, all the trees in the DAG are no longer guaranteed to @@ -191,7 +192,7 @@ Now we can trim to only the trees with 48 unique node labels: >>> dag.trim_optimal_weight(** node_count_funcs, optimal_func=min) -Finally, we can sample a single clade tree from the history DAG, and make it an +Finally, we can sample a single history from the history DAG, and make it an ete tree for further rendering/processing: >>> t = dag.sample().to_ete() @@ -208,7 +209,7 @@ index-order: Another method for fetching all trees in the dag is provided, but the order will not match index order: ->>> scrambled_trees = list(dag.get_trees()) +>>> scrambled_trees = list(dag.get_histories()) History DAGs can be merged using the :meth:`historydag.HistoryDag.merge` diff --git a/historydag/__init__.py b/historydag/__init__.py index 4eb01f3..ff2df5f 100644 --- a/historydag/__init__.py +++ b/historydag/__init__.py @@ -6,7 +6,7 @@ from_newick, history_dag_from_newicks, history_dag_from_etes, - history_dag_from_clade_trees, + history_dag_from_histories, ) from . import _version diff --git a/historydag/dag.py b/historydag/dag.py index e065b5a..70a3608 100644 --- a/historydag/dag.py +++ b/historydag/dag.py @@ -28,13 +28,13 @@ from historydag.counterops import counter_sum, counter_prod -def _under_clade_dict(nodeseq: Sequence["HistoryDagNode"]) -> Dict: +def _clade_union_dict(nodeseq: Sequence["HistoryDagNode"]) -> Dict: clade_dict: Dict[FrozenSet[Label], List[HistoryDagNode]] = {} for node in nodeseq: - under_clade = node.under_clade() - if under_clade not in clade_dict: - clade_dict[under_clade] = [] - clade_dict[node.under_clade()].append(node) + clade_union = node.clade_union() + if clade_union not in clade_dict: + clade_dict[clade_union] = [] + clade_dict[node.clade_union()].append(node) return clade_dict @@ -68,28 +68,31 @@ def __repr__(self) -> str: return str((self.label, set(self.clades.keys()))) def __hash__(self) -> int: - return hash((self.label, self.partitions())) + return hash((self.label, self.child_clades())) def __eq__(self, other: object) -> bool: if isinstance(other, HistoryDagNode): - return (self.label, self.partitions()) == (other.label, other.partitions()) + return (self.label, self.child_clades()) == ( + other.label, + other.child_clades(), + ) else: raise NotImplementedError def __le__(self, other: object) -> bool: if isinstance(other, HistoryDagNode): - return (self.label, self.sorted_partitions()) <= ( + return (self.label, self.sorted_child_clades()) <= ( other.label, - other.sorted_partitions(), + other.sorted_child_clades(), ) else: raise NotImplementedError def __lt__(self, other: object) -> bool: if isinstance(other, HistoryDagNode): - return (self.label, self.sorted_partitions()) < ( + return (self.label, self.sorted_child_clades()) < ( other.label, - other.sorted_partitions(), + other.sorted_child_clades(), ) else: raise NotImplementedError @@ -100,39 +103,60 @@ def __gt__(self, other: object) -> bool: def __ge__(self, other: object) -> bool: return not self.__lt__(other) - def node_self(self) -> "HistoryDagNode": - """Returns a HistoryDagNode object with the same clades and label, but - no descendant edges.""" + def empty_copy(self) -> "HistoryDagNode": + """Returns a HistoryDagNode object with the same clades, label, and + attr dictionary, but no descendant edges.""" return HistoryDagNode( self.label, {clade: EdgeSet() for clade in self.clades}, deepcopy(self.attr) ) - def under_clade(self) -> FrozenSet[Label]: - r"""Returns the union of this node's child clades""" + def node_self(self) -> "HistoryDagNode": + """Deprecated name for :meth:`empty_copy`""" + return self.empty_copy() + + def clade_union(self) -> FrozenSet[Label]: + r"""Returns the union of this node's child clades + (or a set containing only the node label, for leaf nodes.)""" if self.is_leaf(): return frozenset([self.label]) else: return frozenset().union(*self.clades.keys()) + def under_clade(self) -> FrozenSet[Label]: + """Deprecated name for :meth:`clade_union`""" + return self.clade_union() + def is_leaf(self) -> bool: """Returns whether this is a leaf node.""" return not bool(self.clades) - def is_root(self) -> bool: - """Returns whether this is a DAG root node, or UA (universal ancestor) - node.""" + def is_ua_node(self) -> bool: + """Returns whether this is the source node in the DAG, from which all + others are reachable.""" return False - def partitions(self) -> frozenset: + def is_root(self) -> bool: + """Deprecated name for :meth:`is_ua_node`""" + return self.is_ua_node() + + def child_clades(self) -> frozenset: """Returns the node's child clades, or a frozenset containing a frozenset if this node is a UANode.""" return frozenset(self.clades.keys()) - def sorted_partitions(self) -> tuple: + def partitions(self) -> frozenset: + """Deprecated name for :meth:`child_clades`""" + return self.child_clades() + + def sorted_child_clades(self) -> tuple: """Returns the node's child clades as a sorted tuple containing leaf labels in sorted tuples.""" return tuple(sorted([tuple(sorted(clade)) for clade in self.clades.keys()])) + def sorted_partitions(self) -> tuple: + """Deprecated name for :meth:`sorted_child_clades`""" + return self.sorted_child_clades() + def children( self, clade: Set[Label] = None ) -> Generator["HistoryDagNode", None, None]: @@ -158,7 +182,7 @@ def add_edge( ) -> bool: r"""Adds edge, if allowed and not already present. Returns whether edge was added.""" # target clades must union to a clade of self - key = frozenset() if self.is_root() else target.under_clade() + key = frozenset() if self.is_ua_node() else target.clade_union() if key not in self.clades: raise KeyError("Target clades' union is not a clade of this parent node: ") else: @@ -170,12 +194,12 @@ def add_edge( prob_norm=prob_norm, ) - def _get_subtree_by_subid(self, subid: int) -> "HistoryDagNode": + def _get_subhistory_by_subid(self, subid: int) -> "HistoryDagNode": r"""Returns the subtree below the current HistoryDagNode corresponding to the given index""" if self.is_leaf(): # base case - the node is a leaf return self else: - history = self.node_self() + history = self.empty_copy() # get the subtree for each of the clades for clade, eset in self.clades.items(): @@ -192,7 +216,7 @@ def _get_subtree_by_subid(self, subid: int) -> "HistoryDagNode": else: # add this edge to the tree somehow history.clades[clade].add_to_edgeset( - child._get_subtree_by_subid(curr_index) + child._get_subhistory_by_subid(curr_index) ) break @@ -201,7 +225,7 @@ def _get_subtree_by_subid(self, subid: int) -> "HistoryDagNode": def remove_edge_by_clade_and_id(self, target: "HistoryDagNode", clade: frozenset): key: frozenset - if self.is_root(): + if self.is_ua_node(): key = frozenset() else: key = clade @@ -219,13 +243,13 @@ def remove_node(self, nodedict: Dict["HistoryDagNode", "HistoryDagNode"] = {}): if not child.parents: child.remove_node(nodedict=nodedict) for parent in self.parents: - parent.remove_edge_by_clade_and_id(self, self.under_clade()) + parent.remove_edge_by_clade_and_id(self, self.clade_union()) self.removed = True def _sample(self, edge_selector=lambda n: True) -> "HistoryDagNode": - r"""Samples a clade tree (a sub-history DAG containing the root and all + r"""Samples a history (a sub-history DAG containing the root and all leaf nodes). Returns a new HistoryDagNode object.""" - sample = self.node_self() + sample = self.empty_copy() for clade, eset in self.clades.items(): mask = [edge_selector((self, target)) for target in eset.targets] sampled_target, target_weight = eset.sample(mask=mask) @@ -239,7 +263,7 @@ def _sample(self, edge_selector=lambda n: True) -> "HistoryDagNode": ) return sample - def _get_trees(self) -> Generator["HistoryDagNode", None, None]: + def _get_subhistories(self) -> Generator["HistoryDagNode", None, None]: r"""Return a generator to iterate through all trees expressed by the DAG.""" def genexp_func(clade): @@ -250,7 +274,7 @@ def f(): return ( (clade, targettree, i) for i, target in enumerate(eset.targets) - for targettree in target._get_trees() + for targettree in target._get_subhistories() ) return f @@ -259,7 +283,7 @@ def f(): # TODO is this duplicated code? for option in utils.cartesian_product(optionlist): - tree = self.node_self() + tree = self.empty_copy() for clade, targettree, index in option: tree.clades[clade].add_to_edgeset( targettree, @@ -286,7 +310,7 @@ def _newick_label( A string which can be used as a node name in a newick string. For example, `namefuncresult[&&NHX:feature1=val1:feature2=val2]` """ - if self.is_root(): + if self.is_ua_node(): return self.label # type: ignore else: if features is None: @@ -315,14 +339,16 @@ def __init__(self, targetnodes: "EdgeSet"): for child in self.children(): child.parents.add(self) - def node_self(self) -> "UANode": + def empty_copy(self) -> "UANode": """Returns a UANode object with the same clades and label, but no descendant edges.""" newnode = UANode(EdgeSet()) newnode.attr = deepcopy(self.attr) return newnode - def is_root(self): + def is_ua_node(self) -> bool: + """Returns whether this is the source node in the DAG, from which all + others are reachable.""" return True @@ -349,18 +375,18 @@ def __eq__(self, other: object) -> bool: def __getitem__(self, key) -> "HistoryDag": r"""Returns the sub-history below the current history dag corresponding to the given index.""" - length = self.count_trees() + length = self.count_histories() if key < 0: key = length + key if isinstance(key, slice) or not type(key) == int: raise TypeError(f"History DAG indices must be integers, not {type(key)}") if not (key >= 0 and key < length): raise IndexError - self.count_trees() - return HistoryDag(self.dagroot._get_subtree_by_subid(key)) + self.count_histories() + return HistoryDag(self.dagroot._get_subhistory_by_subid(key)) def __len__(self) -> int: - return self.count_trees() + return self.count_histories() def __or__(self, other) -> "HistoryDag": newdag = self.copy() @@ -405,10 +431,10 @@ def cladesets(node): for node in self.postorder(): if node.label not in label_indices: label_indices[node.label] = len(label_list) - label_list.append(None if node.is_root() else tuple(node.label)) + label_list.append(None if node.is_ua_node() else tuple(node.label)) assert ( label_list[label_indices[node.label]] == node.label - or node.is_root() + or node.is_ua_node() ) node_list.append((label_indices[node.label], cladesets(node), node.attr)) node_idx = len(node_list) - 1 @@ -480,11 +506,11 @@ def _check_valid(self) -> bool: # used in the traversal) for node in po: - if not node.is_root(): + if not node.is_ua_node(): for clade, eset in node.clades.items(): for child in eset.targets: # ***Parent clade equals child clade union for all edges: - if child.under_clade() != clade: + if child.clade_union() != clade: raise ValueError( "Parent clade does not equal child clade union" ) @@ -520,16 +546,23 @@ def _check_valid(self) -> bool: def serialize(self) -> bytes: return pickle.dumps(self.__getstate__()) - def get_trees(self) -> Generator["HistoryDag", None, None]: - """Return a generator containing all trees in the history DAG. + def get_histories(self) -> Generator["HistoryDag", None, None]: + """Return a generator containing all internally labeled trees in the + history DAG. - The order of these trees does not necessarily match the order of - indexing. That is, ``dag.get_trees()`` and ``tree for tree in - dag`` will result in different orderings. ``get_trees`` should + Note that each history is a history DAG, containing a UA node. + + The order of these histories does not necessarily match the order of + indexing. That is, ``dag.get_histories()`` and ``history for history in + dag`` will result in different orderings. ``get_histories`` should be slightly faster, but possibly more memory intensive. """ - for cladetree in self.dagroot._get_trees(): - yield HistoryDag(cladetree) + for history in self.dagroot._get_subhistories(): + yield HistoryDag(history) + + def get_trees(self) -> Generator["HistoryDag", None, None]: + """Deprecated name for :meth:`get_histories`""" + return self.get_histories() def get_leaves(self) -> Generator["HistoryDag", None, None]: """Return a generator containing all leaf nodes in the history DAG.""" @@ -537,7 +570,7 @@ def get_leaves(self) -> Generator["HistoryDag", None, None]: def num_nodes(self) -> int: """Return the number of nodes in the DAG, not counting the UA node.""" - return sum(1 for _ in self.preorder(skip_root=True)) + return sum(1 for _ in self.preorder(skip_ua_node=True)) def num_leaves(self) -> int: """Return the number of leaf nodes in the DAG.""" @@ -650,11 +683,11 @@ def unlabel(self) -> "HistoryDag": that all histories in the DAG have unique topologies.""" newdag = self.copy() - model_label = next(self.preorder(skip_root=True)).label + model_label = next(self.preorder(skip_ua_node=True)).label # initialize empty/default value for each item in model_label field_values = tuple(type(item)() for item in model_label) internal_label = type(model_label)(*field_values) - for node in newdag.preorder(skip_root=True): + for node in newdag.preorder(skip_ua_node=True): if not node.is_leaf(): node.label = internal_label @@ -682,7 +715,7 @@ def remove_abundance_clade(old_clade): return frozenset(leaf_label_dict[old_label] for old_label in old_clade) def remove_abundance_node(old_node): - if old_node.is_root(): + if old_node.is_ua_node(): return UANode( EdgeSet( [ @@ -710,8 +743,8 @@ def remove_abundance_node(old_node): newdag = newdag.sample() | newdag return newdag - def is_clade_tree(self) -> bool: - """Returns whether history DAG is a clade tree. + def is_history(self) -> bool: + """Returns whether history DAG is a history. That is, each node-clade pair has exactly one descendant edge. """ @@ -721,6 +754,10 @@ def is_clade_tree(self) -> bool: return False return True + def is_clade_tree(self) -> bool: + """Deprecated name for :meth:`is_history`""" + return self.is_history() + def copy(self) -> "HistoryDag": """Uses bytestring serialization, and is guaranteed to copy: @@ -753,14 +790,18 @@ def merge(self, trees: Union["HistoryDag", Sequence["HistoryDag"]]): if n in nodedict: pnode = nodedict[n] else: - pnode = n.node_self() + pnode = n.empty_copy() nodedict[n] = pnode for _, edgeset in n.clades.items(): for child, weight, _ in edgeset: pnode.add_edge(nodedict[child], weight=weight) - def add_all_allowed_edges( + def add_all_allowed_edges(self, *args, **kwargs) -> int: + """Provided as a deprecated synonym for :meth:``make_complete``.""" + return self.make_complete(*args, **kwargs) + + def make_complete( self, new_from_root: bool = True, adjacent_labels: bool = True, @@ -789,11 +830,11 @@ def add_all_allowed_edges( for node in self.postorder() } - clade_dict = _under_clade_dict(self.preorder(skip_root=True)) - clade_dict[self.dagroot.under_clade()] = [] + clade_dict = _clade_union_dict(self.preorder(skip_ua_node=True)) + clade_dict[self.dagroot.clade_union()] = [] for node in self.postorder(): - if new_from_root is False and node.is_root(): + if new_from_root is False and node.is_ua_node(): continue else: for clade in node.clades: @@ -809,16 +850,16 @@ def add_all_allowed_edges( n_added += node.add_edge(target) return n_added - @utils._cladetree_method + @utils._history_method def to_newick( self, name_func: Callable[[HistoryDagNode], str] = lambda n: "unnamed", features: Optional[List[str]] = None, feature_funcs: Mapping[str, Callable[[HistoryDagNode], str]] = {}, ) -> str: - r"""Converts clade tree to extended newick format. + r"""Converts a history to extended newick format. Supports arbitrary node names and a - sequence feature. For use on a history DAG which is a clade tree. + sequence feature. For use on a history DAG which is a history. For extracting newick representations of trees in a general history DAG, see :meth:`HistoryDag.to_newicks`. @@ -855,14 +896,14 @@ def newick(node): return newick(next(self.dagroot.children())) + ";" - @utils._cladetree_method + @utils._history_method def to_ete( self, name_func: Callable[[HistoryDagNode], str] = lambda n: "unnamed", features: Optional[List[str]] = None, feature_funcs: Mapping[str, Callable[[HistoryDagNode], str]] = {}, ) -> ete3.TreeNode: - """Convert a history DAG which is a clade tree to an ete tree. + """Convert a history DAG which is a history to an ete tree. Args: name_func: A map from nodes to newick node names @@ -893,7 +934,7 @@ def etenode(node: HistoryDagNode) -> ete3.TreeNode: newnode.add_feature(feature, func(node)) return newnode - nodedict = {node: etenode(node) for node in self.preorder(skip_root=True)} + nodedict = {node: etenode(node) for node in self.preorder(skip_ua_node=True)} for node in nodedict: for target in node.children(): @@ -906,7 +947,8 @@ def to_graphviz( self, labelfunc: Optional[Callable[[HistoryDagNode], str]] = None, namedict: Mapping[Label, str] = {}, - show_partitions: bool = True, + show_child_clades: bool = True, + show_partitions: bool = None, ) -> gv.Digraph: r"""Converts history DAG to graphviz (dot format) Digraph object. @@ -915,8 +957,11 @@ def to_graphviz( their DAG node labels, or their label hash if label data is too large. namedict: A dictionary from node labels to label strings. Labelfunc will be used instead, if both are provided. - show_partitions: Whether to include child clades in output. + show_child_clades: Whether to include child clades in output. + show_partitions: Deprecated alias for show_child_clades. """ + if show_partitions is not None: + show_child_clades = show_partitions def labeller(label): if label in namedict: @@ -936,7 +981,7 @@ def taxa(clade): G = gv.Digraph("labeled partition DAG", node_attr={"shape": "record"}) for node in self.postorder(): - if node.is_leaf() or show_partitions is False: + if node.is_leaf() or show_child_clades is False: G.node(str(id(node)), f"