From 224972eca18d4e53c3fdd57d107752982172a29c Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Wed, 6 Jul 2022 14:53:19 -0700 Subject: [PATCH 1/3] tests pass with new names --- README.md | 8 +- docs/quickstart.rst | 16 +-- historydag/__init__.py | 2 +- historydag/dag.py | 235 +++++++++++++++++++++++-------------- historydag/utils.py | 12 +- scripts/agg_mut.py | 6 +- tests/test_collapse.py | 14 +-- tests/test_disambiguate.py | 10 +- tests/test_factory.py | 24 ++-- tests/test_historydag.py | 10 +- 10 files changed, 197 insertions(+), 140 deletions(-) diff --git a/README.md b/README.md index 2305921..e1a4d4c 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ with open('sample_data/toy_trees.p', 'rb') as fh: ete_trees = pickle.load(fh) dag = hdag.history_dag_from_etes(ete_trees, ['sequence']) -dag.count_trees() # 1041 +dag.count_histories() # 1041 dag.add_all_allowed_edges() -dag.count_trees() # 3431531 +dag.count_histories() # 3431531 dag.hamming_parsimony_count() # Shows counts of trees of different parsimony scores dag.trim_optimal_weight() @@ -59,7 +59,7 @@ trees = [tree for tree in dag] # Another method for fetching all trees in the dag is provided, but the order # will not match index order: -scrambled_trees = list(dag.get_trees()) +scrambled_trees = list(dag.get_histories()) # Union is implemented as dag merging, including with sequences of dags @@ -74,7 +74,7 @@ newdag = dag[0] | (dag[i] for i in range(3,5)) * `history_dag_from_newicks` * `history_dag_from_etes` * Trees can be extracted from the history DAG with methods like - * `HistoryDag.get_trees` + * `HistoryDag.get_histories` * `HistoryDag.sample` * `HistoryDag.to_ete` * `HistoryDag.to_newick` and `HistoryDag.to_newicks` diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 30058ac..771354b 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -19,7 +19,7 @@ beneath that node's children. After annotating each node with its child clades, a **UA (universal ancestor) node** is added as a parent of the original tree's root node. The resulting -structure is an example of a history DAG which we call a **clade tree**: +structure is an example of a history DAG which we call a **history**: |pic1| -> |pic2| @@ -35,13 +35,13 @@ parent node associated to an edge, must be the same as the clade below the child node that the edge targets. After converting multiple trees with the same set of leaf labels to clade -trees, those clade trees can be unioned to create a history DAG that represents +trees, those histories can be unioned to create a history DAG that represents at least those trees used to create it. Any structure in the resulting history DAG which contains the UA node and all leaves, and has exactly one edge for -each node-child clade pair, is a clade tree. Clade trees represent labeled +each node-child clade pair, is a history. Histories represent labeled trees by the inverse of the correspondence introduced above: -For example, the clade tree highlighted in red in this image: +For example, the history highlighted in red in this image: .. image:: figures/history_dag_example.svg :width: 100% @@ -105,7 +105,7 @@ Now, we will create a history DAG using the ``sequence`` attribute as the data for node labels: >>> dag = hdag.history_dag_from_etes(ete_trees, ['sequence']) ->>> dag.count_trees() +>>> dag.count_histories() 1041 >>> dag.count_topologies() 389 @@ -134,7 +134,7 @@ nodes whose child clades are compatible: >>> dag.add_all_allowed_edges() 1048 ->>> dag.count_trees() +>>> dag.count_histories() 3431531 After such edge additions, all the trees in the DAG are no longer guaranteed to @@ -191,7 +191,7 @@ Now we can trim to only the trees with 48 unique node labels: >>> dag.trim_optimal_weight(** node_count_funcs, optimal_func=min) -Finally, we can sample a single clade tree from the history DAG, and make it an +Finally, we can sample a single history from the history DAG, and make it an ete tree for further rendering/processing: >>> t = dag.sample().to_ete() @@ -208,7 +208,7 @@ index-order: Another method for fetching all trees in the dag is provided, but the order will not match index order: ->>> scrambled_trees = list(dag.get_trees()) +>>> scrambled_trees = list(dag.get_histories()) History DAGs can be merged using the :meth:`historydag.HistoryDag.merge` diff --git a/historydag/__init__.py b/historydag/__init__.py index 4eb01f3..ff2df5f 100644 --- a/historydag/__init__.py +++ b/historydag/__init__.py @@ -6,7 +6,7 @@ from_newick, history_dag_from_newicks, history_dag_from_etes, - history_dag_from_clade_trees, + history_dag_from_histories, ) from . import _version diff --git a/historydag/dag.py b/historydag/dag.py index e065b5a..0731938 100644 --- a/historydag/dag.py +++ b/historydag/dag.py @@ -68,28 +68,28 @@ def __repr__(self) -> str: return str((self.label, set(self.clades.keys()))) def __hash__(self) -> int: - return hash((self.label, self.partitions())) + return hash((self.label, self.child_clades())) def __eq__(self, other: object) -> bool: if isinstance(other, HistoryDagNode): - return (self.label, self.partitions()) == (other.label, other.partitions()) + return (self.label, self.child_clades()) == (other.label, other.child_clades()) else: raise NotImplementedError def __le__(self, other: object) -> bool: if isinstance(other, HistoryDagNode): - return (self.label, self.sorted_partitions()) <= ( + return (self.label, self.sorted_child_clades()) <= ( other.label, - other.sorted_partitions(), + other.sorted_child_clades(), ) else: raise NotImplementedError def __lt__(self, other: object) -> bool: if isinstance(other, HistoryDagNode): - return (self.label, self.sorted_partitions()) < ( + return (self.label, self.sorted_child_clades()) < ( other.label, - other.sorted_partitions(), + other.sorted_child_clades(), ) else: raise NotImplementedError @@ -100,39 +100,64 @@ def __gt__(self, other: object) -> bool: def __ge__(self, other: object) -> bool: return not self.__lt__(other) - def node_self(self) -> "HistoryDagNode": - """Returns a HistoryDagNode object with the same clades and label, but + def empty_copy(self) -> "HistoryDagNode": + """Returns a HistoryDagNode object with the same clades, label, and attr dictionary, but no descendant edges.""" return HistoryDagNode( self.label, {clade: EdgeSet() for clade in self.clades}, deepcopy(self.attr) ) - def under_clade(self) -> FrozenSet[Label]: - r"""Returns the union of this node's child clades""" + # def node_self(self) -> "HistoryDagNode": + # """Deprecated name for :meth:`empty_copy`""" + # return self.empty_copy() + # [Walrus] + + def clade_union(self) -> FrozenSet[Label]: + r"""Returns the union of this node's child clades + (or a set containing only the node label, for leaf nodes.)""" if self.is_leaf(): return frozenset([self.label]) else: return frozenset().union(*self.clades.keys()) + # def under_clade(self) -> FrozenSet[Label]: + # """Deprecated name for :meth:`clade_union`""" + # return self.clade_union() + # [Walrus] + def is_leaf(self) -> bool: """Returns whether this is a leaf node.""" return not bool(self.clades) - def is_root(self) -> bool: - """Returns whether this is a DAG root node, or UA (universal ancestor) - node.""" + def is_ua_node(self) -> bool: + """Returns whether this is the source node in the DAG, from which all others are reachable.""" return False - def partitions(self) -> frozenset: + # def is_root(self) -> bool: + # """Deprecated name for :meth:`is_ua_node`""" + # return self.is_ua_node() + # [Walrus] + + def child_clades(self) -> frozenset: """Returns the node's child clades, or a frozenset containing a frozenset if this node is a UANode.""" return frozenset(self.clades.keys()) - def sorted_partitions(self) -> tuple: + # def partitions(self) -> frozenset: + # """Deprecated name for :meth:`child_clades`""" + # return self.child_clades() + # # [Walrus] + + def sorted_child_clades(self) -> tuple: """Returns the node's child clades as a sorted tuple containing leaf labels in sorted tuples.""" return tuple(sorted([tuple(sorted(clade)) for clade in self.clades.keys()])) + # def sorted_partitions(self) -> frozenset: + # """Deprecated name for :meth:`sorted_child_clades`""" + # return self.sorted_child_clades() + # # [Walrus] + def children( self, clade: Set[Label] = None ) -> Generator["HistoryDagNode", None, None]: @@ -158,7 +183,7 @@ def add_edge( ) -> bool: r"""Adds edge, if allowed and not already present. Returns whether edge was added.""" # target clades must union to a clade of self - key = frozenset() if self.is_root() else target.under_clade() + key = frozenset() if self.is_ua_node() else target.clade_union() if key not in self.clades: raise KeyError("Target clades' union is not a clade of this parent node: ") else: @@ -170,12 +195,12 @@ def add_edge( prob_norm=prob_norm, ) - def _get_subtree_by_subid(self, subid: int) -> "HistoryDagNode": + def _get_subhistory_by_subid(self, subid: int) -> "HistoryDagNode": r"""Returns the subtree below the current HistoryDagNode corresponding to the given index""" if self.is_leaf(): # base case - the node is a leaf return self else: - history = self.node_self() + history = self.empty_copy() # get the subtree for each of the clades for clade, eset in self.clades.items(): @@ -192,7 +217,7 @@ def _get_subtree_by_subid(self, subid: int) -> "HistoryDagNode": else: # add this edge to the tree somehow history.clades[clade].add_to_edgeset( - child._get_subtree_by_subid(curr_index) + child._get_subhistory_by_subid(curr_index) ) break @@ -201,7 +226,7 @@ def _get_subtree_by_subid(self, subid: int) -> "HistoryDagNode": def remove_edge_by_clade_and_id(self, target: "HistoryDagNode", clade: frozenset): key: frozenset - if self.is_root(): + if self.is_ua_node(): key = frozenset() else: key = clade @@ -219,13 +244,13 @@ def remove_node(self, nodedict: Dict["HistoryDagNode", "HistoryDagNode"] = {}): if not child.parents: child.remove_node(nodedict=nodedict) for parent in self.parents: - parent.remove_edge_by_clade_and_id(self, self.under_clade()) + parent.remove_edge_by_clade_and_id(self, self.clade_union()) self.removed = True def _sample(self, edge_selector=lambda n: True) -> "HistoryDagNode": - r"""Samples a clade tree (a sub-history DAG containing the root and all + r"""Samples a history (a sub-history DAG containing the root and all leaf nodes). Returns a new HistoryDagNode object.""" - sample = self.node_self() + sample = self.empty_copy() for clade, eset in self.clades.items(): mask = [edge_selector((self, target)) for target in eset.targets] sampled_target, target_weight = eset.sample(mask=mask) @@ -239,7 +264,7 @@ def _sample(self, edge_selector=lambda n: True) -> "HistoryDagNode": ) return sample - def _get_trees(self) -> Generator["HistoryDagNode", None, None]: + def _get_subhistories(self) -> Generator["HistoryDagNode", None, None]: r"""Return a generator to iterate through all trees expressed by the DAG.""" def genexp_func(clade): @@ -250,7 +275,7 @@ def f(): return ( (clade, targettree, i) for i, target in enumerate(eset.targets) - for targettree in target._get_trees() + for targettree in target._get_subhistories() ) return f @@ -259,7 +284,7 @@ def f(): # TODO is this duplicated code? for option in utils.cartesian_product(optionlist): - tree = self.node_self() + tree = self.empty_copy() for clade, targettree, index in option: tree.clades[clade].add_to_edgeset( targettree, @@ -286,7 +311,7 @@ def _newick_label( A string which can be used as a node name in a newick string. For example, `namefuncresult[&&NHX:feature1=val1:feature2=val2]` """ - if self.is_root(): + if self.is_ua_node(): return self.label # type: ignore else: if features is None: @@ -315,14 +340,15 @@ def __init__(self, targetnodes: "EdgeSet"): for child in self.children(): child.parents.add(self) - def node_self(self) -> "UANode": + def empty_copy(self) -> "UANode": """Returns a UANode object with the same clades and label, but no descendant edges.""" newnode = UANode(EdgeSet()) newnode.attr = deepcopy(self.attr) return newnode - def is_root(self): + def is_ua_node(self) -> bool: + """Returns whether this is the source node in the DAG, from which all others are reachable.""" return True @@ -349,18 +375,18 @@ def __eq__(self, other: object) -> bool: def __getitem__(self, key) -> "HistoryDag": r"""Returns the sub-history below the current history dag corresponding to the given index.""" - length = self.count_trees() + length = self.count_histories() if key < 0: key = length + key if isinstance(key, slice) or not type(key) == int: raise TypeError(f"History DAG indices must be integers, not {type(key)}") if not (key >= 0 and key < length): raise IndexError - self.count_trees() - return HistoryDag(self.dagroot._get_subtree_by_subid(key)) + self.count_histories() + return HistoryDag(self.dagroot._get_subhistory_by_subid(key)) def __len__(self) -> int: - return self.count_trees() + return self.count_histories() def __or__(self, other) -> "HistoryDag": newdag = self.copy() @@ -405,10 +431,10 @@ def cladesets(node): for node in self.postorder(): if node.label not in label_indices: label_indices[node.label] = len(label_list) - label_list.append(None if node.is_root() else tuple(node.label)) + label_list.append(None if node.is_ua_node() else tuple(node.label)) assert ( label_list[label_indices[node.label]] == node.label - or node.is_root() + or node.is_ua_node() ) node_list.append((label_indices[node.label], cladesets(node), node.attr)) node_idx = len(node_list) - 1 @@ -520,16 +546,23 @@ def _check_valid(self) -> bool: def serialize(self) -> bytes: return pickle.dumps(self.__getstate__()) - def get_trees(self) -> Generator["HistoryDag", None, None]: - """Return a generator containing all trees in the history DAG. + def get_histories(self) -> Generator["HistoryDag", None, None]: + """Return a generator containing all internally labeled trees in the history DAG. - The order of these trees does not necessarily match the order of - indexing. That is, ``dag.get_trees()`` and ``tree for tree in - dag`` will result in different orderings. ``get_trees`` should + Note that each history is a history DAG, containing a UA node. + + The order of these histories does not necessarily match the order of + indexing. That is, ``dag.get_histories()`` and ``history for history in + dag`` will result in different orderings. ``get_histories`` should be slightly faster, but possibly more memory intensive. """ - for cladetree in self.dagroot._get_trees(): - yield HistoryDag(cladetree) + for history in self.dagroot._get_subhistories(): + yield HistoryDag(history) + + # def get_trees(self) -> frozenset: + # """Deprecated name for :meth:`get_histories`""" + # return self.get_histories() + # # [Walrus] def get_leaves(self) -> Generator["HistoryDag", None, None]: """Return a generator containing all leaf nodes in the history DAG.""" @@ -710,8 +743,8 @@ def remove_abundance_node(old_node): newdag = newdag.sample() | newdag return newdag - def is_clade_tree(self) -> bool: - """Returns whether history DAG is a clade tree. + def is_history(self) -> bool: + """Returns whether history DAG is a history. That is, each node-clade pair has exactly one descendant edge. """ @@ -721,6 +754,11 @@ def is_clade_tree(self) -> bool: return False return True + # def is_clade_tree(self) -> bool: + # """Deprecated name for :meth:`is_history`""" + # return self.is_history() + # # [Walrus] + def copy(self) -> "HistoryDag": """Uses bytestring serialization, and is guaranteed to copy: @@ -753,7 +791,7 @@ def merge(self, trees: Union["HistoryDag", Sequence["HistoryDag"]]): if n in nodedict: pnode = nodedict[n] else: - pnode = n.node_self() + pnode = n.empty_copy() nodedict[n] = pnode for _, edgeset in n.clades.items(): @@ -793,7 +831,7 @@ def add_all_allowed_edges( clade_dict[self.dagroot.under_clade()] = [] for node in self.postorder(): - if new_from_root is False and node.is_root(): + if new_from_root is False and node.is_ua_node(): continue else: for clade in node.clades: @@ -809,16 +847,16 @@ def add_all_allowed_edges( n_added += node.add_edge(target) return n_added - @utils._cladetree_method + @utils._history_method def to_newick( self, name_func: Callable[[HistoryDagNode], str] = lambda n: "unnamed", features: Optional[List[str]] = None, feature_funcs: Mapping[str, Callable[[HistoryDagNode], str]] = {}, ) -> str: - r"""Converts clade tree to extended newick format. + r"""Converts a history to extended newick format. Supports arbitrary node names and a - sequence feature. For use on a history DAG which is a clade tree. + sequence feature. For use on a history DAG which is a history. For extracting newick representations of trees in a general history DAG, see :meth:`HistoryDag.to_newicks`. @@ -855,14 +893,14 @@ def newick(node): return newick(next(self.dagroot.children())) + ";" - @utils._cladetree_method + @utils._history_method def to_ete( self, name_func: Callable[[HistoryDagNode], str] = lambda n: "unnamed", features: Optional[List[str]] = None, feature_funcs: Mapping[str, Callable[[HistoryDagNode], str]] = {}, ) -> ete3.TreeNode: - """Convert a history DAG which is a clade tree to an ete tree. + """Convert a history DAG which is a history to an ete tree. Args: name_func: A map from nodes to newick node names @@ -906,7 +944,8 @@ def to_graphviz( self, labelfunc: Optional[Callable[[HistoryDagNode], str]] = None, namedict: Mapping[Label, str] = {}, - show_partitions: bool = True, + show_child_clades: bool = True, + show_partitions: bool = None, ) -> gv.Digraph: r"""Converts history DAG to graphviz (dot format) Digraph object. @@ -915,8 +954,11 @@ def to_graphviz( their DAG node labels, or their label hash if label data is too large. namedict: A dictionary from node labels to label strings. Labelfunc will be used instead, if both are provided. - show_partitions: Whether to include child clades in output. + show_child_clades: Whether to include child clades in output. + show_partitions: Deprecated alias for show_child_clades. """ + if show_partitions is not None: + show_child_clades = show_partitions def labeller(label): if label in namedict: @@ -936,7 +978,7 @@ def taxa(clade): G = gv.Digraph("labeled partition DAG", node_attr={"shape": "record"}) for node in self.postorder(): - if node.is_leaf() or show_partitions is False: + if node.is_leaf() or show_child_clades is False: G.node(str(id(node)), f"