From 5e548acd2f0134699fc54ad0a31978fe49b27b86 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Mon, 16 May 2022 15:37:44 -0700
Subject: [PATCH 01/11] Add parsimony script

---
 scripts/parsimony.py | 82 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 scripts/parsimony.py

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
new file mode 100644
index 00000000..cc6a0c16
--- /dev/null
+++ b/scripts/parsimony.py
@@ -0,0 +1,82 @@
+import gctree
+import gctree.phylip_parse as pp
+import click
+from collections import Counter
+import ete3
+
+@click.group(context_settings={"help_option_names": ["-h", "--help"]})
+def _cli():
+    """
+    A collection of tools for calculating parsimony scores of newick trees, and
+    using them to create a history DAG
+    """
+    pass
+
+def load_fasta(fastapath):
+    fasta_map = {}
+    with open(fastapath, 'r') as fh:
+        seqid = None
+        for line in fh:
+            if line[0] == '>':
+                seqid = line[1:].strip()
+                if seqid in fasta_map:
+                    raise ValueError("Duplicate records with matching identifier in fasta file")
+                else:
+                    fasta_map[seqid] = ""
+            else:
+                if seqid is None and line.strip():
+                    raise ValueError("First non-blank line in fasta does not contain identifier")
+                else:
+                    fasta_map[seqid] += line.strip()
+    return fasta_map
+
+def build_tree(newickstring, fasta_map, newickformat=1, reference_id=None, reference_sequence=None, ignore_internal_sequences=False):
+    tree = ete3.Tree(newickstring, format=newickformat)
+    # all fasta entries should be same length
+    seq_len = len(next(iter(fasta_map.values())))
+    ambig_seq = 'N' * seq_len
+    for node in tree.traverse():
+        if node.is_root() and reference_sequence is not None:
+            node.add_feature("sequence", reference_sequence)
+        elif node.is_root() and reference_id is not None:
+                node.add_feature("sequence", fasta_map[reference_id])
+        elif (not node.is_leaf()) and ignore_internal_sequences:
+            node.add_feature("sequence", ambig_seq)
+        elif node.name in fasta_map:
+            node.add_feature("sequence", fasta_map[node.name])
+        else:
+            node.add_feature("sequence", ambig_seq)
+    return tree
+
+def build_tree_from_files(newickfile, fastafile, **kwargs):
+    with open(newickfile, 'r') as fh:
+        newick = fh.read()
+    return build_tree(newick, load_fasta(fastafile), **kwargs)
+
+def parsimony_score(tree):
+    return sum(gctree.utils.hamming_distance(node.up.sequence, node.sequence)
+               for node in tree.iter_descendants())
+
+def parsimony_score_from_files(*args, **kwargs):
+    tree = build_tree_from_files(*args, **kwargs)
+    tree = pp.disambiguate(tree)
+    return parsimony_score(tree)
+
+@_cli.command("parsimony_scores")
+@click.argument("treefiles", nargs=-1)
+@click.option("-f", "--fasta-file", required=True, help="Filename of a fasta file containing sequences appearing on nodes of newick tree")
+@click.option("-r", "--root-id", default=None, help="The fasta identifier of the fixed root of provided trees. May be omitted if there is no fixed root sequence.")
+@click.option("-F", "--newick-format", default=1, help="Newick format of the provided newick file. See http://etetoolkit.org/docs/latest/reference/reference_tree.html#ete3.TreeNode")
+@click.option("-i", "--include-internal-sequences", is_flag=True, help=" non-leaf node labels, and associated sequences in the fasta file.")
+def _cli_parsimony_score_from_files(treefiles, fasta_file, root_id, newick_format, include_internal_sequences):
+    """Print the parsimony score of one or more newick files"""
+    fasta_map = load_fasta(fasta_file)
+    parsimony_counter = Counter()
+    for treepath in treefiles:
+        with open(treepath, 'r') as fh:
+            tree = build_tree(fh.read(), fasta_map, newickformat=newick_format, reference_id=root_id, ignore_internal_sequences=(not include_internal_sequences))
+        print(treepath)
+        print(parsimony_score(pp.disambiguate(tree)))
+
+if __name__ == "__main__":
+    _cli()

From 777fba647d57b53aac8039f834e8968ea025c7be Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Mon, 16 May 2022 15:38:36 -0700
Subject: [PATCH 02/11] format

---
 scripts/parsimony.py | 83 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 17 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index cc6a0c16..0021f59c 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -4,6 +4,7 @@
 from collections import Counter
 import ete3
 
+
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
 def _cli():
     """
@@ -12,34 +13,47 @@ def _cli():
     """
     pass
 
+
 def load_fasta(fastapath):
     fasta_map = {}
-    with open(fastapath, 'r') as fh:
+    with open(fastapath, "r") as fh:
         seqid = None
         for line in fh:
-            if line[0] == '>':
+            if line[0] == ">":
                 seqid = line[1:].strip()
                 if seqid in fasta_map:
-                    raise ValueError("Duplicate records with matching identifier in fasta file")
+                    raise ValueError(
+                        "Duplicate records with matching identifier in fasta file"
+                    )
                 else:
                     fasta_map[seqid] = ""
             else:
                 if seqid is None and line.strip():
-                    raise ValueError("First non-blank line in fasta does not contain identifier")
+                    raise ValueError(
+                        "First non-blank line in fasta does not contain identifier"
+                    )
                 else:
                     fasta_map[seqid] += line.strip()
     return fasta_map
 
-def build_tree(newickstring, fasta_map, newickformat=1, reference_id=None, reference_sequence=None, ignore_internal_sequences=False):
+
+def build_tree(
+    newickstring,
+    fasta_map,
+    newickformat=1,
+    reference_id=None,
+    reference_sequence=None,
+    ignore_internal_sequences=False,
+):
     tree = ete3.Tree(newickstring, format=newickformat)
     # all fasta entries should be same length
     seq_len = len(next(iter(fasta_map.values())))
-    ambig_seq = 'N' * seq_len
+    ambig_seq = "N" * seq_len
     for node in tree.traverse():
         if node.is_root() and reference_sequence is not None:
             node.add_feature("sequence", reference_sequence)
         elif node.is_root() and reference_id is not None:
-                node.add_feature("sequence", fasta_map[reference_id])
+            node.add_feature("sequence", fasta_map[reference_id])
         elif (not node.is_leaf()) and ignore_internal_sequences:
             node.add_feature("sequence", ambig_seq)
         elif node.name in fasta_map:
@@ -48,35 +62,70 @@ def build_tree(newickstring, fasta_map, newickformat=1, reference_id=None, refer
             node.add_feature("sequence", ambig_seq)
     return tree
 
+
 def build_tree_from_files(newickfile, fastafile, **kwargs):
-    with open(newickfile, 'r') as fh:
+    with open(newickfile, "r") as fh:
         newick = fh.read()
     return build_tree(newick, load_fasta(fastafile), **kwargs)
 
+
 def parsimony_score(tree):
-    return sum(gctree.utils.hamming_distance(node.up.sequence, node.sequence)
-               for node in tree.iter_descendants())
+    return sum(
+        gctree.utils.hamming_distance(node.up.sequence, node.sequence)
+        for node in tree.iter_descendants()
+    )
+
 
 def parsimony_score_from_files(*args, **kwargs):
     tree = build_tree_from_files(*args, **kwargs)
     tree = pp.disambiguate(tree)
     return parsimony_score(tree)
 
+
 @_cli.command("parsimony_scores")
 @click.argument("treefiles", nargs=-1)
-@click.option("-f", "--fasta-file", required=True, help="Filename of a fasta file containing sequences appearing on nodes of newick tree")
-@click.option("-r", "--root-id", default=None, help="The fasta identifier of the fixed root of provided trees. May be omitted if there is no fixed root sequence.")
-@click.option("-F", "--newick-format", default=1, help="Newick format of the provided newick file. See http://etetoolkit.org/docs/latest/reference/reference_tree.html#ete3.TreeNode")
-@click.option("-i", "--include-internal-sequences", is_flag=True, help=" non-leaf node labels, and associated sequences in the fasta file.")
-def _cli_parsimony_score_from_files(treefiles, fasta_file, root_id, newick_format, include_internal_sequences):
+@click.option(
+    "-f",
+    "--fasta-file",
+    required=True,
+    help="Filename of a fasta file containing sequences appearing on nodes of newick tree",
+)
+@click.option(
+    "-r",
+    "--root-id",
+    default=None,
+    help="The fasta identifier of the fixed root of provided trees. May be omitted if there is no fixed root sequence.",
+)
+@click.option(
+    "-F",
+    "--newick-format",
+    default=1,
+    help="Newick format of the provided newick file. See http://etetoolkit.org/docs/latest/reference/reference_tree.html#ete3.TreeNode",
+)
+@click.option(
+    "-i",
+    "--include-internal-sequences",
+    is_flag=True,
+    help=" non-leaf node labels, and associated sequences in the fasta file.",
+)
+def _cli_parsimony_score_from_files(
+    treefiles, fasta_file, root_id, newick_format, include_internal_sequences
+):
     """Print the parsimony score of one or more newick files"""
     fasta_map = load_fasta(fasta_file)
     parsimony_counter = Counter()
     for treepath in treefiles:
-        with open(treepath, 'r') as fh:
-            tree = build_tree(fh.read(), fasta_map, newickformat=newick_format, reference_id=root_id, ignore_internal_sequences=(not include_internal_sequences))
+        with open(treepath, "r") as fh:
+            tree = build_tree(
+                fh.read(),
+                fasta_map,
+                newickformat=newick_format,
+                reference_id=root_id,
+                ignore_internal_sequences=(not include_internal_sequences),
+            )
         print(treepath)
         print(parsimony_score(pp.disambiguate(tree)))
 
+
 if __name__ == "__main__":
     _cli()

From 23c4eca90e91258931f64d17adccb864706bcd94 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Tue, 17 May 2022 10:11:38 -0700
Subject: [PATCH 03/11] docstrings and option to write to dag

---
 scripts/parsimony.py | 113 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 94 insertions(+), 19 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index 0021f59c..356cbf3e 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -3,6 +3,8 @@
 import click
 from collections import Counter
 import ete3
+import historydag as hdag
+import pickle
 
 
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
@@ -15,6 +17,7 @@ def _cli():
 
 
 def load_fasta(fastapath):
+    """Load a fasta file as a dictionary, with sequence ids as keys and sequences as values."""
     fasta_map = {}
     with open(fastapath, "r") as fh:
         seqid = None
@@ -45,6 +48,18 @@ def build_tree(
     reference_sequence=None,
     ignore_internal_sequences=False,
 ):
+    """Load an ete tree from a newick string, and add 'sequence' attributes from fasta.
+
+    If internal node sequences aren't specified in the newick string and fasta data,
+    internal node sequences will be fully ambiguous (contain repeated N's).
+
+    Arguments:
+        newickstring: a newick string
+        fasta_map: a dictionary with sequence id keys matching node names in `newickstring`, and sequence values.
+        newickformat: the ete format identifier for the passed newick string. See ete docs
+        reference_id: key in `fasta_map` corresponding to the root sequence of the tree, if root sequence is fixed.
+        reference_sequence: fixed root sequence of the tree, if root sequence is fixed
+        ignore_internal_sequences: if True, sequences at non-leaf nodes specified in fasta_map and newickstring will be ignored, and all internal sequences will be fully ambiguous."""
     tree = ete3.Tree(newickstring, format=newickformat)
     # all fasta entries should be same length
     seq_len = len(next(iter(fasta_map.values())))
@@ -63,23 +78,48 @@ def build_tree(
     return tree
 
 
-def build_tree_from_files(newickfile, fastafile, **kwargs):
-    with open(newickfile, "r") as fh:
-        newick = fh.read()
-    return build_tree(newick, load_fasta(fastafile), **kwargs)
+def build_trees_from_files(newickfiles, fastafile, **kwargs):
+    """Same as `build_tree`, but takes a list of filenames containing newick strings, and a filename for a fasta file, and returns a generator on trees"""
+    fasta_map = load_fasta(fastafile)
+    trees = []
+    for newick in newickfiles:
+        with open(newick, "r") as fh:
+            newick = fh.read()
+        yield build_tree(newick, fasta_map, **kwargs)
 
 
 def parsimony_score(tree):
+    """returns the parsimony score of a (disambiguated) ete tree.
+    Tree must have 'sequence' attributes on all nodes."""
     return sum(
         gctree.utils.hamming_distance(node.up.sequence, node.sequence)
         for node in tree.iter_descendants()
     )
 
 
-def parsimony_score_from_files(*args, **kwargs):
-    tree = build_tree_from_files(*args, **kwargs)
-    tree = pp.disambiguate(tree)
-    return parsimony_score(tree)
+def parsimony_scores_from_files(*args, **kwargs):
+    """returns the parsimony scores of trees specified by newick files and a fasta file.
+    Arguments match `build_trees_from_files`."""
+    trees = build_trees_from_files(*args, **kwargs)
+    trees = [pp.disambiguate(tree) for tree in trees]
+    return [parsimony_score(tree) for tree in trees]
+
+
+def build_dag_from_trees(trees):
+    """Build a history DAG from trees containing a `sequence` attribute on all nodes.
+    unifurcations in the provided trees will be deleted."""
+    trees = [tree.copy() for tree in trees]
+    for tree in trees:
+        for node in tree.iter_descendants():
+            if len(node.children) == 1:
+                node.delete(prevent_nondicotomic=False)
+        if len(tree.children) == 1:
+            newchild = tree.add_child()
+            newchild.add_feature("sequence", tree.sequence)
+    return hdag.history_dag_from_etes(
+        trees,
+        ["sequence"],
+    )
 
 
 @_cli.command("parsimony_scores")
@@ -106,25 +146,60 @@ def parsimony_score_from_files(*args, **kwargs):
     "-i",
     "--include-internal-sequences",
     is_flag=True,
-    help=" non-leaf node labels, and associated sequences in the fasta file.",
+    help="include non-leaf node labels, and associated sequences in the fasta file.",
+)
+@click.option(
+    "-d",
+    "--save-to-dag",
+    default=None,
+    help="Combine loaded and disambiguated trees into a history DAG, and save pickled DAG to provided path.",
 )
 def _cli_parsimony_score_from_files(
-    treefiles, fasta_file, root_id, newick_format, include_internal_sequences
+    treefiles,
+    fasta_file,
+    root_id,
+    newick_format,
+    include_internal_sequences,
+    save_to_dag,
 ):
     """Print the parsimony score of one or more newick files"""
     fasta_map = load_fasta(fasta_file)
     parsimony_counter = Counter()
-    for treepath in treefiles:
-        with open(treepath, "r") as fh:
-            tree = build_tree(
-                fh.read(),
-                fasta_map,
-                newickformat=newick_format,
-                reference_id=root_id,
-                ignore_internal_sequences=(not include_internal_sequences),
-            )
+    trees = []
+    for tree, treepath in zip(
+        build_trees_from_files(
+            treefiles,
+            fasta_file,
+            reference_id=root_id,
+            ignore_internal_sequences=(not include_internal_sequences),
+        ),
+        treefiles,
+    ):
         print(treepath)
         print(parsimony_score(pp.disambiguate(tree)))
+        if save_to_dag is not None:
+            trees.append(tree)
+    if save_to_dag is not None:
+        with open(save_to_dag, "wb") as fh:
+            fh.write(pickle.dumps(build_dag_from_trees(trees)))
+
+
+def summarize_dag(dag):
+    """print summary information about the provided history DAG."""
+    print("DAG contains")
+    print("trees: ", dag.count_trees())
+    print("nodes: ", len(list(dag.preorder())))
+    print("edges: ", sum(len(list(node.children())) for node in dag.preorder()))
+    print("parsimony scores: ", dag.weight_count())
+
+
+@_cli.command("summarize-dag")
+@click.argument("dagpath")
+def _cli_summarize_dag(dagpath):
+    """print summary information about the provided history DAG."""
+    with open(dagpath, "rb") as fh:
+        dag = pickle.load(fh)
+    summarize_dag(dag)
 
 
 if __name__ == "__main__":

From 5b6d8d870b14fc57205665e3e9a945de67f67d45 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Wed, 25 May 2022 14:50:06 -0700
Subject: [PATCH 04/11] possibly faster disambiguation algorithm

---
 scripts/parsimony.py       | 130 ++++++++++++++++++++++++++++++++++++-
 tests/test_disambiguate.py |   2 +-
 2 files changed, 129 insertions(+), 3 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index 356cbf3e..b8650114 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -1,10 +1,136 @@
 import gctree
+import random
 import gctree.phylip_parse as pp
 import click
 from collections import Counter
 import ete3
 import historydag as hdag
 import pickle
+import numpy as np
+
+
+# bases = "AGCT"
+# base_vecs = {base: np.array([float('inf') if base != rbase else 0 for rbase in bases]) for base in bases}
+# base_vecs.update({'N': np.array([0, 0, 0, 0])})
+# delta_vecs = [np.array([int(i != j) for j in range(4)]) for i in range(4)]
+# transition_weights = [
+#     [0 if base == rbase else 1 for rbase in bases]
+#     for base in bases
+# ]
+
+delta_vectors = {
+    code: np.array(
+        [
+            0 if base in gctree.utils.ambiguous_dna_values[code] else 1
+            for base in gctree.utils.bases
+        ]
+    )
+    for code in gctree.utils.ambiguous_dna_values
+}
+code_vectors = {
+    code: np.array(
+        [
+            0 if base in gctree.utils.ambiguous_dna_values[code] else float("inf")
+            for base in gctree.utils.bases
+        ]
+    )
+    for code in gctree.utils.ambiguous_dna_values
+}
+cost_adjust = {
+    base: np.array([int(not i == j) for j in range(5)])
+    for i, base in enumerate(gctree.utils.bases)
+}
+
+# def sankoff(tree, random_state=None):
+#     """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
+#     subtrees of consecutive ambiguity codes."""
+#     if random_state is None:
+#         random.seed(tree.write(format=1))
+#     else:
+#         random.setstate(random_state)
+#     seq_len = len(tree.sequence)
+
+#     # First pass of Sankoff: compute cost vectors
+#     for node in tree.traverse(strategy="postorder"):
+#         node.add_feature("cv",
+#                          np.array([delta_vectors[base].copy() for base in node.sequence]) +
+#                          sum([child.cv for child in node.children], start=np.array([[0] * 5] * seq_len)))
+#         print(node.sequence, node.cv)
+
+
+#     print("starting preorder pass")
+#     for node in tree.traverse(strategy="preorder"):
+#         # disallow bases that aren't compatible with existing bases
+#         node.cv = node.cv + [code_vectors[base] for base in node.sequence]
+#         # choose sequence
+#         newseq = []
+#         for cv_idx in range(len(node.cv)):
+#             cv = node.cv[cv_idx]
+#             min_cost = min(cv)
+#             min_indices = [idx for idx, cost in enumerate(cv) if cost == min_cost]
+#             newbase = gctree.utils.bases[random.choice(min_indices)]
+#             newseq.append(newbase)
+
+#         cost_adjust = np.array([delta_vectors[base] for base in newseq])
+#         for child in node.children:
+#             child.cv += cost_adjust
+#         node.sequence = ''.join(newseq)
+#     return tree
+
+
+def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False):
+    """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
+    subtrees of consecutive ambiguity codes."""
+    seq_len = len(tree.sequence)
+    if random_state is None:
+        random.seed(tree.write(format=1))
+    else:
+        random.setstate(random_state)
+    # First pass of Sankoff: compute cost vectors
+    for node in tree.traverse(strategy="postorder"):
+        node.add_feature(
+            "cv", np.array([code_vectors[base].copy() for base in node.sequence])
+        )
+        if not node.is_leaf():
+            for idx in range(seq_len):
+                for i in range(5):
+                    for child in node.children:
+                        node.cv[idx][i] += min(
+                            [
+                                sum(v)
+                                for v in zip(
+                                    child.cv[idx], cost_adjust[gctree.utils.bases[i]]
+                                )
+                            ]
+                        )
+
+    # Second pass of Sankoff: choose bases
+    preorder = list(tree.traverse(strategy="preorder"))
+    for node in preorder:
+        new_seq = []
+        for idx in range(seq_len):
+            min_cost = min(node.cv[idx])
+            base_index = random.choice(
+                [i for i, val in enumerate(node.cv[idx]) if val == min_cost]
+            )
+            new_base = gctree.utils.bases[base_index]
+            new_seq.append(new_base)
+        # Adjust child cost vectors
+        for child in node.children:
+            child.cv += np.array([delta_vectors[base] for base in new_seq])
+        node.sequence = "".join(new_seq)
+
+    if remove_cvs:
+        for node in tree.traverse():
+            try:
+                node.del_feature("cv")
+            except (AttributeError, KeyError):
+                pass
+    if adj_dist:
+        tree.dist = 0
+        for node in tree.iter_descendants():
+            node.dist = gctree.utils.hamming_distance(node.up.sequence, node.sequence)
+    return tree
 
 
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
@@ -101,7 +227,7 @@ def parsimony_scores_from_files(*args, **kwargs):
     """returns the parsimony scores of trees specified by newick files and a fasta file.
     Arguments match `build_trees_from_files`."""
     trees = build_trees_from_files(*args, **kwargs)
-    trees = [pp.disambiguate(tree) for tree in trees]
+    trees = [disambiguate(tree) for tree in trees]
     return [parsimony_score(tree) for tree in trees]
 
 
@@ -176,7 +302,7 @@ def _cli_parsimony_score_from_files(
         treefiles,
     ):
         print(treepath)
-        print(parsimony_score(pp.disambiguate(tree)))
+        print(parsimony_score(disambiguate(tree)))
         if save_to_dag is not None:
             trees.append(tree)
     if save_to_dag is not None:
diff --git a/tests/test_disambiguate.py b/tests/test_disambiguate.py
index 54783353..2690e16f 100644
--- a/tests/test_disambiguate.py
+++ b/tests/test_disambiguate.py
@@ -31,7 +31,7 @@ def treeprint(tree: ete3.TreeNode):
 
 
 def sample(treestring, n=20):
-    ogt = ete3.TreeNode(newick=newick_tree2, format=1)
+    ogt = ete3.TreeNode(newick=treestring, format=1)
     print(ogt.sequence)
     newickset = set()
     for i in range(n):

From cf6c3ee2faf8cb5ed52892d0b565754c2d54b133 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Thu, 26 May 2022 13:38:04 -0700
Subject: [PATCH 05/11] roll back new disambiguation and consider - char

---
 scripts/parsimony.py | 131 ++-----------------------------------------
 1 file changed, 5 insertions(+), 126 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index b8650114..fda01a9c 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -9,129 +9,6 @@
 import numpy as np
 
 
-# bases = "AGCT"
-# base_vecs = {base: np.array([float('inf') if base != rbase else 0 for rbase in bases]) for base in bases}
-# base_vecs.update({'N': np.array([0, 0, 0, 0])})
-# delta_vecs = [np.array([int(i != j) for j in range(4)]) for i in range(4)]
-# transition_weights = [
-#     [0 if base == rbase else 1 for rbase in bases]
-#     for base in bases
-# ]
-
-delta_vectors = {
-    code: np.array(
-        [
-            0 if base in gctree.utils.ambiguous_dna_values[code] else 1
-            for base in gctree.utils.bases
-        ]
-    )
-    for code in gctree.utils.ambiguous_dna_values
-}
-code_vectors = {
-    code: np.array(
-        [
-            0 if base in gctree.utils.ambiguous_dna_values[code] else float("inf")
-            for base in gctree.utils.bases
-        ]
-    )
-    for code in gctree.utils.ambiguous_dna_values
-}
-cost_adjust = {
-    base: np.array([int(not i == j) for j in range(5)])
-    for i, base in enumerate(gctree.utils.bases)
-}
-
-# def sankoff(tree, random_state=None):
-#     """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
-#     subtrees of consecutive ambiguity codes."""
-#     if random_state is None:
-#         random.seed(tree.write(format=1))
-#     else:
-#         random.setstate(random_state)
-#     seq_len = len(tree.sequence)
-
-#     # First pass of Sankoff: compute cost vectors
-#     for node in tree.traverse(strategy="postorder"):
-#         node.add_feature("cv",
-#                          np.array([delta_vectors[base].copy() for base in node.sequence]) +
-#                          sum([child.cv for child in node.children], start=np.array([[0] * 5] * seq_len)))
-#         print(node.sequence, node.cv)
-
-
-#     print("starting preorder pass")
-#     for node in tree.traverse(strategy="preorder"):
-#         # disallow bases that aren't compatible with existing bases
-#         node.cv = node.cv + [code_vectors[base] for base in node.sequence]
-#         # choose sequence
-#         newseq = []
-#         for cv_idx in range(len(node.cv)):
-#             cv = node.cv[cv_idx]
-#             min_cost = min(cv)
-#             min_indices = [idx for idx, cost in enumerate(cv) if cost == min_cost]
-#             newbase = gctree.utils.bases[random.choice(min_indices)]
-#             newseq.append(newbase)
-
-#         cost_adjust = np.array([delta_vectors[base] for base in newseq])
-#         for child in node.children:
-#             child.cv += cost_adjust
-#         node.sequence = ''.join(newseq)
-#     return tree
-
-
-def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False):
-    """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
-    subtrees of consecutive ambiguity codes."""
-    seq_len = len(tree.sequence)
-    if random_state is None:
-        random.seed(tree.write(format=1))
-    else:
-        random.setstate(random_state)
-    # First pass of Sankoff: compute cost vectors
-    for node in tree.traverse(strategy="postorder"):
-        node.add_feature(
-            "cv", np.array([code_vectors[base].copy() for base in node.sequence])
-        )
-        if not node.is_leaf():
-            for idx in range(seq_len):
-                for i in range(5):
-                    for child in node.children:
-                        node.cv[idx][i] += min(
-                            [
-                                sum(v)
-                                for v in zip(
-                                    child.cv[idx], cost_adjust[gctree.utils.bases[i]]
-                                )
-                            ]
-                        )
-
-    # Second pass of Sankoff: choose bases
-    preorder = list(tree.traverse(strategy="preorder"))
-    for node in preorder:
-        new_seq = []
-        for idx in range(seq_len):
-            min_cost = min(node.cv[idx])
-            base_index = random.choice(
-                [i for i, val in enumerate(node.cv[idx]) if val == min_cost]
-            )
-            new_base = gctree.utils.bases[base_index]
-            new_seq.append(new_base)
-        # Adjust child cost vectors
-        for child in node.children:
-            child.cv += np.array([delta_vectors[base] for base in new_seq])
-        node.sequence = "".join(new_seq)
-
-    if remove_cvs:
-        for node in tree.traverse():
-            try:
-                node.del_feature("cv")
-            except (AttributeError, KeyError):
-                pass
-    if adj_dist:
-        tree.dist = 0
-        for node in tree.iter_descendants():
-            node.dist = gctree.utils.hamming_distance(node.up.sequence, node.sequence)
-    return tree
-
 
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
 def _cli():
@@ -189,7 +66,9 @@ def build_tree(
     tree = ete3.Tree(newickstring, format=newickformat)
     # all fasta entries should be same length
     seq_len = len(next(iter(fasta_map.values())))
-    ambig_seq = "N" * seq_len
+    # find characters for which there's no diversity:
+    ambig_seq = ''.join(list(charset := set(chars))[0] if len(charset) == 1 else "?"
+                        for chars in zip(*fasta_map.values()))
     for node in tree.traverse():
         if node.is_root() and reference_sequence is not None:
             node.add_feature("sequence", reference_sequence)
@@ -227,7 +106,7 @@ def parsimony_scores_from_files(*args, **kwargs):
     """returns the parsimony scores of trees specified by newick files and a fasta file.
     Arguments match `build_trees_from_files`."""
     trees = build_trees_from_files(*args, **kwargs)
-    trees = [disambiguate(tree) for tree in trees]
+    trees = [pp.disambiguate(tree) for tree in trees]
     return [parsimony_score(tree) for tree in trees]
 
 
@@ -302,7 +181,7 @@ def _cli_parsimony_score_from_files(
         treefiles,
     ):
         print(treepath)
-        print(parsimony_score(disambiguate(tree)))
+        print(parsimony_score(pp.disambiguate(tree)))
         if save_to_dag is not None:
             trees.append(tree)
     if save_to_dag is not None:

From bb04ae4ca7d8d51fd2ba3f2758912f76aaaacfe0 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Thu, 26 May 2022 17:09:43 -0700
Subject: [PATCH 06/11] definitely faster parsimony computation

---
 scripts/parsimony.py | 153 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 128 insertions(+), 25 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index fda01a9c..b7a4a25f 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -8,6 +8,28 @@
 import pickle
 import numpy as np
 
+delta_vectors = {
+    code: np.array(
+        [
+            0 if base in gctree.utils.ambiguous_dna_values[code] else 1
+            for base in gctree.utils.bases
+        ]
+    )
+    for code in gctree.utils.ambiguous_dna_values
+}
+code_vectors = {
+    code: np.array(
+        [
+            0 if base in gctree.utils.ambiguous_dna_values[code] else float("inf")
+            for base in gctree.utils.bases
+        ]
+    )
+    for code in gctree.utils.ambiguous_dna_values
+}
+cost_adjust = {
+    base: np.array([int(not i == j) for j in range(5)])
+    for i, base in enumerate(gctree.utils.bases)
+}
 
 
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
@@ -19,6 +41,76 @@ def _cli():
     pass
 
 
+def sankoff_upward(tree):
+    seq_len = len(tree.sequence)
+    adj_arr = np.array(
+        [
+            [
+                [0, 1, 1, 1, 1],
+                [1, 0, 1, 1, 1],
+                [1, 1, 0, 1, 1],
+                [1, 1, 1, 0, 1],
+                [1, 1, 1, 1, 0],
+            ]
+        ]
+        * seq_len
+    )
+    # First pass of Sankoff: compute cost vectors
+    for node in tree.traverse(strategy="postorder"):
+        node.add_feature(
+            "cv", np.array([code_vectors[base].copy() for base in node.sequence])
+        )
+        if not node.is_leaf():
+            child_costs = []
+            for child in node.children:
+                stacked_child_cv = np.stack((child.cv,) * 5, axis=1)
+                total_cost = adj_arr + stacked_child_cv
+                child_costs.append(np.min(total_cost, axis=2))
+            child_cost = np.sum(child_costs, axis=0)
+            node.cv = (
+                np.array([code_vectors[base] for base in node.sequence]) + child_cost
+            )
+    return np.sum(np.min(tree.cv, axis=1))
+
+
+def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False):
+    """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
+    subtrees of consecutive ambiguity codes."""
+
+    seq_len = len(tree.sequence)
+    sankoff_upward(tree)
+    # Second pass of Sankoff: choose bases
+    preorder = list(tree.traverse(strategy="preorder"))
+    for node in preorder:
+        new_seq = []
+        for idx in range(seq_len):
+            min_cost = min(node.cv[idx])
+            base_index = random.choice(
+                [i for i, val in enumerate(node.cv[idx]) if val == min_cost]
+            )
+            new_base = gctree.utils.bases[base_index]
+            new_seq.append(new_base)
+        # Adjust child cost vectors
+        for child in node.children:
+            for idx, base in enumerate(new_seq):
+                dvec = delta_vectors[base]
+                for subidx in range(5):
+                    child.cv[idx][subidx] += dvec[subidx]
+        node.sequence = "".join(new_seq)
+
+    if remove_cvs:
+        for node in tree.traverse():
+            try:
+                node.del_feature("cv")
+            except (AttributeError, KeyError):
+                pass
+    if adj_dist:
+        tree.dist = 0
+        for node in tree.iter_descendants():
+            node.dist = gctree.utils.hamming_distance(node.up.sequence, node.sequence)
+    return tree
+
+
 def load_fasta(fastapath):
     """Load a fasta file as a dictionary, with sequence ids as keys and sequences as values."""
     fasta_map = {}
@@ -39,7 +131,7 @@ def load_fasta(fastapath):
                         "First non-blank line in fasta does not contain identifier"
                     )
                 else:
-                    fasta_map[seqid] += line.strip()
+                    fasta_map[seqid] += line.strip().upper()
     return fasta_map
 
 
@@ -66,9 +158,7 @@ def build_tree(
     tree = ete3.Tree(newickstring, format=newickformat)
     # all fasta entries should be same length
     seq_len = len(next(iter(fasta_map.values())))
-    # find characters for which there's no diversity:
-    ambig_seq = ''.join(list(charset := set(chars))[0] if len(charset) == 1 else "?"
-                        for chars in zip(*fasta_map.values()))
+    ambig_seq = "?" * seq_len
     for node in tree.traverse():
         if node.is_root() and reference_sequence is not None:
             node.add_feature("sequence", reference_sequence)
@@ -102,12 +192,34 @@ def parsimony_score(tree):
     )
 
 
-def parsimony_scores_from_files(*args, **kwargs):
+def parsimony_scores_from_topologies(newicks, fasta_map, **kwargs):
+    """returns a generator on parsimony scores of trees specified by newick strings and fasta.
+    additional keyword arguments are passed to `build_tree`."""
+    # eliminate characters for which there's no diversity:
+    informative_sites = [
+        idx for idx, chars in enumerate(zip(*fasta_map.values())) if len(set(chars)) > 1
+    ]
+    newfasta = {
+        key: "".join(oldseq[idx] for idx in informative_sites)
+        for key, oldseq in fasta_map.items()
+    }
+    yield from (
+        sankoff_upward(build_tree(newick, newfasta, **kwargs)) for newick in newicks
+    )
+
+
+def parsimony_scores_from_files(treefiles, fastafile, **kwargs):
     """returns the parsimony scores of trees specified by newick files and a fasta file.
     Arguments match `build_trees_from_files`."""
-    trees = build_trees_from_files(*args, **kwargs)
-    trees = [pp.disambiguate(tree) for tree in trees]
-    return [parsimony_score(tree) for tree in trees]
+    def load_newick(npath):
+        with open(npath, "r") as fh:
+            return fh.read()
+    
+    return parsimony_scores_from_topologies(
+        (load_newick(path) for path in treefiles),
+        load_fasta(fastafile),
+        **kwargs
+    )
 
 
 def build_dag_from_trees(trees):
@@ -168,25 +280,16 @@ def _cli_parsimony_score_from_files(
     save_to_dag,
 ):
     """Print the parsimony score of one or more newick files"""
-    fasta_map = load_fasta(fasta_file)
-    parsimony_counter = Counter()
-    trees = []
-    for tree, treepath in zip(
-        build_trees_from_files(
-            treefiles,
-            fasta_file,
-            reference_id=root_id,
-            ignore_internal_sequences=(not include_internal_sequences),
-        ),
+    pscore_gen = parsimony_scores_from_files(
         treefiles,
-    ):
+        fasta_file,
+        reference_id=root_id,
+        ignore_internal_sequences=(not include_internal_sequences),
+    )
+
+    for score, treepath in zip(pscore_gen, treefiles):
         print(treepath)
-        print(parsimony_score(pp.disambiguate(tree)))
-        if save_to_dag is not None:
-            trees.append(tree)
-    if save_to_dag is not None:
-        with open(save_to_dag, "wb") as fh:
-            fh.write(pickle.dumps(build_dag_from_trees(trees)))
+        print(score)
 
 
 def summarize_dag(dag):

From 6d9fbe316a3acd29c7815454646b117af7094e40 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Thu, 2 Jun 2022 09:58:45 -0700
Subject: [PATCH 07/11] add gap_as_char option to sankoff_upward

---
 scripts/parsimony.py | 90 ++++++++++++++++++++++++++++++--------------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index b7a4a25f..d8de0ca6 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -8,6 +8,14 @@
 import pickle
 import numpy as np
 
+_yey = [
+    [0, 1, 1, 1, 1],
+    [1, 0, 1, 1, 1],
+    [1, 1, 0, 1, 1],
+    [1, 1, 1, 0, 1],
+    [1, 1, 1, 1, 0],
+]
+
 delta_vectors = {
     code: np.array(
         [
@@ -41,24 +49,35 @@ def _cli():
     pass
 
 
-def sankoff_upward(tree):
+def sankoff_upward(tree, gap_as_char=False):
+    """Compute Sankoff cost vectors at nodes in a postorder traversal,
+    and return best possible parsimony score of the tree.
+
+    Args:
+        gap_as_char: if True, the gap character ``-`` will be treated as a fifth character. Otherwise,
+            it will be treated the same as an ``N``."""
+    if gap_as_char:
+
+        def translate_base(char):
+            return char
+
+    else:
+
+        def translate_base(char):
+            if char == "-":
+                return "N"
+            else:
+                return char
+
     seq_len = len(tree.sequence)
-    adj_arr = np.array(
-        [
-            [
-                [0, 1, 1, 1, 1],
-                [1, 0, 1, 1, 1],
-                [1, 1, 0, 1, 1],
-                [1, 1, 1, 0, 1],
-                [1, 1, 1, 1, 0],
-            ]
-        ]
-        * seq_len
-    )
+    adj_arr = np.array([_yey] * seq_len)
     # First pass of Sankoff: compute cost vectors
     for node in tree.traverse(strategy="postorder"):
         node.add_feature(
-            "cv", np.array([code_vectors[base].copy() for base in node.sequence])
+            "cv",
+            np.array(
+                [code_vectors[translate_base(base)].copy() for base in node.sequence]
+            ),
         )
         if not node.is_leaf():
             child_costs = []
@@ -68,7 +87,8 @@ def sankoff_upward(tree):
                 child_costs.append(np.min(total_cost, axis=2))
             child_cost = np.sum(child_costs, axis=0)
             node.cv = (
-                np.array([code_vectors[base] for base in node.sequence]) + child_cost
+                np.array([code_vectors[translate_base(base)] for base in node.sequence])
+                + child_cost
             )
     return np.sum(np.min(tree.cv, axis=1))
 
@@ -76,26 +96,31 @@ def sankoff_upward(tree):
 def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False):
     """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
     subtrees of consecutive ambiguity codes."""
+    if random_state is None:
+        random.seed(tree.write(format=1))
+    else:
+        random.setstate(random_state)
 
     seq_len = len(tree.sequence)
+    adj_arr = np.array([_yey] * seq_len)
     sankoff_upward(tree)
     # Second pass of Sankoff: choose bases
     preorder = list(tree.traverse(strategy="preorder"))
     for node in preorder:
         new_seq = []
+        base_indices = []
         for idx in range(seq_len):
             min_cost = min(node.cv[idx])
             base_index = random.choice(
                 [i for i, val in enumerate(node.cv[idx]) if val == min_cost]
             )
-            new_base = gctree.utils.bases[base_index]
-            new_seq.append(new_base)
+            base_indices.append(base_index)
+
         # Adjust child cost vectors
+        adj_vec = adj_arr[np.arange(seq_len), base_indices]
         for child in node.children:
-            for idx, base in enumerate(new_seq):
-                dvec = delta_vectors[base]
-                for subidx in range(5):
-                    child.cv[idx][subidx] += dvec[subidx]
+            child.cv += adj_vec
+        new_seq = [gctree.utils.bases[base_index] for base_index in base_indices]
         node.sequence = "".join(new_seq)
 
     if remove_cvs:
@@ -192,7 +217,7 @@ def parsimony_score(tree):
     )
 
 
-def parsimony_scores_from_topologies(newicks, fasta_map, **kwargs):
+def parsimony_scores_from_topologies(newicks, fasta_map, gap_as_char=False, **kwargs):
     """returns a generator on parsimony scores of trees specified by newick strings and fasta.
     additional keyword arguments are passed to `build_tree`."""
     # eliminate characters for which there's no diversity:
@@ -204,21 +229,22 @@ def parsimony_scores_from_topologies(newicks, fasta_map, **kwargs):
         for key, oldseq in fasta_map.items()
     }
     yield from (
-        sankoff_upward(build_tree(newick, newfasta, **kwargs)) for newick in newicks
+        sankoff_upward(build_tree(newick, newfasta, **kwargs), gap_as_char=gap_as_char)
+        for newick in newicks
     )
 
 
 def parsimony_scores_from_files(treefiles, fastafile, **kwargs):
     """returns the parsimony scores of trees specified by newick files and a fasta file.
-    Arguments match `build_trees_from_files`."""
+    Arguments match `build_trees_from_files`. Additional keyword arguments are passed to
+    ``parsimony_scores_from_topologies``."""
+
     def load_newick(npath):
         with open(npath, "r") as fh:
             return fh.read()
-    
+
     return parsimony_scores_from_topologies(
-        (load_newick(path) for path in treefiles),
-        load_fasta(fastafile),
-        **kwargs
+        (load_newick(path) for path in treefiles), load_fasta(fastafile), **kwargs
     )
 
 
@@ -271,6 +297,12 @@ def build_dag_from_trees(trees):
     default=None,
     help="Combine loaded and disambiguated trees into a history DAG, and save pickled DAG to provided path.",
 )
+@click.option(
+    "-g",
+    "--gap-as-char",
+    is_flag=True,
+    help="Treat gap character `-` as a fifth character. Otherwise treated as ambiguous `N`.",
+)
 def _cli_parsimony_score_from_files(
     treefiles,
     fasta_file,
@@ -278,12 +310,14 @@ def _cli_parsimony_score_from_files(
     newick_format,
     include_internal_sequences,
     save_to_dag,
+    gap_as_char,
 ):
     """Print the parsimony score of one or more newick files"""
     pscore_gen = parsimony_scores_from_files(
         treefiles,
         fasta_file,
         reference_id=root_id,
+        gap_as_char=gap_as_char,
         ignore_internal_sequences=(not include_internal_sequences),
     )
 

From f5c6d05120a5b2ef5d96c73f0921048482a355ec Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Thu, 2 Jun 2022 14:30:12 -0700
Subject: [PATCH 08/11] better docstrings

---
 scripts/parsimony.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index d8de0ca6..f4eb9b35 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -93,9 +93,21 @@ def translate_base(char):
     return np.sum(np.min(tree.cv, axis=1))
 
 
-def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False):
+def disambiguate(
+    tree, compute_cvs=True, random_state=None, remove_cvs=False, adj_dist=False
+):
     """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
-    subtrees of consecutive ambiguity codes."""
+    subtrees of consecutive ambiguity codes.
+
+    Args:
+        compute_cvs: If true, compute upward sankoff cost vectors. If ``sankoff_upward`` was
+            already run on the tree, this may be skipped.
+        random_state: A ``random`` module random state, returned by ``random.getstate()``. Output
+            from this function is otherwise deterministic.
+        remove_cvs: Remove sankoff cost vectors from tree nodes after disambiguation.
+        adj_dist: Recompute hamming parsimony distances on tree after disambiguation, and store them
+            in ``dist`` node attributes.
+    """
     if random_state is None:
         random.seed(tree.write(format=1))
     else:
@@ -103,7 +115,8 @@ def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False):
 
     seq_len = len(tree.sequence)
     adj_arr = np.array([_yey] * seq_len)
-    sankoff_upward(tree)
+    if compute_cvs:
+        sankoff_upward(tree)
     # Second pass of Sankoff: choose bases
     preorder = list(tree.traverse(strategy="preorder"))
     for node in preorder:
@@ -219,7 +232,13 @@ def parsimony_score(tree):
 
 def parsimony_scores_from_topologies(newicks, fasta_map, gap_as_char=False, **kwargs):
     """returns a generator on parsimony scores of trees specified by newick strings and fasta.
-    additional keyword arguments are passed to `build_tree`."""
+    additional keyword arguments are passed to `build_tree`.
+
+    Args:
+        newicks: newick strings of trees whose parsimony scores will be computed
+        fasta_map: fasta data as a dictionary, as output by ``load_fasta``
+        gap_as_char: if True, gap characters `-` will be treated as a fifth character.
+            Otherwise, they will be synonymous with `N`'s."""
     # eliminate characters for which there's no diversity:
     informative_sites = [
         idx for idx, chars in enumerate(zip(*fasta_map.values())) if len(set(chars)) > 1

From 48cb340ec8af69d1a4846924346e3189edabfa29 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Fri, 3 Jun 2022 14:49:15 -0700
Subject: [PATCH 09/11] add transition weight option

---
 scripts/parsimony.py | 85 +++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 28 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index f4eb9b35..931e97da 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -8,23 +8,18 @@
 import pickle
 import numpy as np
 
-_yey = [
-    [0, 1, 1, 1, 1],
-    [1, 0, 1, 1, 1],
-    [1, 1, 0, 1, 1],
-    [1, 1, 1, 0, 1],
-    [1, 1, 1, 1, 0],
-]
-
-delta_vectors = {
-    code: np.array(
-        [
-            0 if base in gctree.utils.ambiguous_dna_values[code] else 1
-            for base in gctree.utils.bases
-        ]
-    )
-    for code in gctree.utils.ambiguous_dna_values
-}
+_yey = np.array(
+    [
+        [0, 1, 1, 1, 1],
+        [1, 0, 1, 1, 1],
+        [1, 1, 0, 1, 1],
+        [1, 1, 1, 0, 1],
+        [1, 1, 1, 1, 0],
+    ]
+)
+
+# This is applicable even when diagonal entries in transition rate matrix are
+# nonzero, since it is only a mask on allowable sites based on each base.
 code_vectors = {
     code: np.array(
         [
@@ -34,10 +29,6 @@
     )
     for code in gctree.utils.ambiguous_dna_values
 }
-cost_adjust = {
-    base: np.array([int(not i == j) for j in range(5)])
-    for i, base in enumerate(gctree.utils.bases)
-}
 
 
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
@@ -49,13 +40,37 @@ def _cli():
     pass
 
 
-def sankoff_upward(tree, gap_as_char=False):
+def _get_adj_array(seq_len, transition_weights=None):
+    if transition_weights is None:
+        transition_weights = _yey
+    else:
+        transition_weights = np.array(transition_weights)
+
+    if transition_weights.shape == (5, 5):
+        adj_arr = np.array([transition_weights] * seq_len)
+    elif transition_weights.shape == (seq_len, 5, 5):
+        adj_arr = transition_weights
+    else:
+        raise RuntimeError(
+            "Transition weight matrix must have shape (5, 5) or (sequence_length, 5, 5)."
+        )
+    return adj_arr
+
+
+def sankoff_upward(tree, gap_as_char=False, transition_weights=None):
     """Compute Sankoff cost vectors at nodes in a postorder traversal,
     and return best possible parsimony score of the tree.
 
     Args:
         gap_as_char: if True, the gap character ``-`` will be treated as a fifth character. Otherwise,
-            it will be treated the same as an ``N``."""
+            it will be treated the same as an ``N``.
+        transition_weights: A 5x5 transition weight matrix, with base order `AGCT-`.
+            Rows contain targeting weights. That is, the first row contains the transition weights
+            from `A` to each possible target base. Alternatively, a sequence-length array of these
+            transition weight matrices, if transition weights vary by-site. By default, a constant
+            weight matrix will be used containing 1 in all off-diagonal positions, equivalent
+            to Hamming parsimony.
+    """
     if gap_as_char:
 
         def translate_base(char):
@@ -69,8 +84,8 @@ def translate_base(char):
             else:
                 return char
 
-    seq_len = len(tree.sequence)
-    adj_arr = np.array([_yey] * seq_len)
+    adj_arr = _get_adj_array(len(tree.sequence), transition_weights=transition_weights)
+
     # First pass of Sankoff: compute cost vectors
     for node in tree.traverse(strategy="postorder"):
         node.add_feature(
@@ -94,7 +109,13 @@ def translate_base(char):
 
 
 def disambiguate(
-    tree, compute_cvs=True, random_state=None, remove_cvs=False, adj_dist=False
+    tree,
+    compute_cvs=True,
+    random_state=None,
+    remove_cvs=False,
+    adj_dist=False,
+    gap_as_char=False,
+    transition_weights=None,
 ):
     """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
     subtrees of consecutive ambiguity codes.
@@ -107,6 +128,14 @@ def disambiguate(
         remove_cvs: Remove sankoff cost vectors from tree nodes after disambiguation.
         adj_dist: Recompute hamming parsimony distances on tree after disambiguation, and store them
             in ``dist`` node attributes.
+        gap_as_char: if True, the gap character ``-`` will be treated as a fifth character. Otherwise,
+            it will be treated the same as an ``N``.
+        transition_weights: A 5x5 transition weight matrix, with base order `AGCT-`.
+            Rows contain targeting weights. That is, the first row contains the transition weights
+            from `A` to each possible target base. Alternatively, a sequence-length array of these
+            transition weight matrices, if transition weights vary by-site. By default, a constant
+            weight matrix will be used containing 1 in all off-diagonal positions, equivalent
+            to Hamming parsimony.
     """
     if random_state is None:
         random.seed(tree.write(format=1))
@@ -114,9 +143,9 @@ def disambiguate(
         random.setstate(random_state)
 
     seq_len = len(tree.sequence)
-    adj_arr = np.array([_yey] * seq_len)
+    adj_arr = _get_adj_array(len(tree.sequence), transition_weights=transition_weights)
     if compute_cvs:
-        sankoff_upward(tree)
+        sankoff_upward(tree, gap_as_char=gap_as_char)
     # Second pass of Sankoff: choose bases
     preorder = list(tree.traverse(strategy="preorder"))
     for node in preorder:

From ce7599af95e0fb51b08c24c6a39e69f9e2998339 Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Mon, 6 Jun 2022 13:56:27 -0700
Subject: [PATCH 10/11] beginnings of strictly-min-weight-ambiguous labels

---
 scripts/parsimony.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index 931e97da..7ff0120e 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -116,6 +116,7 @@ def disambiguate(
     adj_dist=False,
     gap_as_char=False,
     transition_weights=None,
+    min_ambiguities=False,
 ):
     """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on
     subtrees of consecutive ambiguity codes.
@@ -136,6 +137,9 @@ def disambiguate(
             transition weight matrices, if transition weights vary by-site. By default, a constant
             weight matrix will be used containing 1 in all off-diagonal positions, equivalent
             to Hamming parsimony.
+        min_ambiguities: If True, leaves ambiguities in reconstructed sequences, expressing which
+            bases are possible at each site in a maximally parsimonious disambiguation of the given
+            topology. Otherwise, sequences are resolved in one possible way, and include no ambiguities.
     """
     if random_state is None:
         random.seed(tree.write(format=1))

From 4caef84fd5011ec2af88e01a29b105764267933c Mon Sep 17 00:00:00 2001
From: Will Dumm <wrhdumm@gmail.com>
Date: Mon, 6 Jun 2022 16:32:47 -0700
Subject: [PATCH 11/11] Add strictly min-weight ambiguous labeling option

---
 scripts/parsimony.py | 58 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/scripts/parsimony.py b/scripts/parsimony.py
index 7ff0120e..e0305f3c 100644
--- a/scripts/parsimony.py
+++ b/scripts/parsimony.py
@@ -30,6 +30,11 @@
     for code in gctree.utils.ambiguous_dna_values
 }
 
+ambiguous_codes_from_vecs = {
+    tuple(0 if base in base_set else 1 for base in gctree.utils.bases): code
+    for code, base_set in gctree.utils.ambiguous_dna_values.items()
+}
+
 
 @click.group(context_settings={"help_option_names": ["-h", "--help"]})
 def _cli():
@@ -139,7 +144,8 @@ def disambiguate(
             to Hamming parsimony.
         min_ambiguities: If True, leaves ambiguities in reconstructed sequences, expressing which
             bases are possible at each site in a maximally parsimonious disambiguation of the given
-            topology. Otherwise, sequences are resolved in one possible way, and include no ambiguities.
+            topology. In the history DAG paper, this is known as a strictly min-weight ambiguous labeling.
+            Otherwise, sequences are resolved in one possible way, and include no ambiguities.
     """
     if random_state is None:
         random.seed(tree.write(format=1))
@@ -153,20 +159,26 @@ def disambiguate(
     # Second pass of Sankoff: choose bases
     preorder = list(tree.traverse(strategy="preorder"))
     for node in preorder:
-        new_seq = []
-        base_indices = []
-        for idx in range(seq_len):
-            min_cost = min(node.cv[idx])
-            base_index = random.choice(
-                [i for i, val in enumerate(node.cv[idx]) if val == min_cost]
-            )
-            base_indices.append(base_index)
+        if min_ambiguities:
+            adj_vec = node.cv != np.stack((node.cv.min(axis=1),) * 5, axis=1)
+            new_seq = [
+                ambiguous_codes_from_vecs[tuple(map(float, row))] for row in adj_vec
+            ]
+        else:
+            base_indices = []
+            for idx in range(seq_len):
+                min_cost = min(node.cv[idx])
+                base_index = random.choice(
+                    [i for i, val in enumerate(node.cv[idx]) if val == min_cost]
+                )
+                base_indices.append(base_index)
+
+            adj_vec = adj_arr[np.arange(seq_len), base_indices]
+            new_seq = [gctree.utils.bases[base_index] for base_index in base_indices]
 
         # Adjust child cost vectors
-        adj_vec = adj_arr[np.arange(seq_len), base_indices]
         for child in node.children:
             child.cv += adj_vec
-        new_seq = [gctree.utils.bases[base_index] for base_index in base_indices]
         node.sequence = "".join(new_seq)
 
     if remove_cvs:
@@ -355,6 +367,12 @@ def build_dag_from_trees(trees):
     is_flag=True,
     help="Treat gap character `-` as a fifth character. Otherwise treated as ambiguous `N`.",
 )
+@click.option(
+    "-a",
+    "--preserve-ambiguities",
+    is_flag=True,
+    help="Do not disambiguate fully, but rather preserve ambiguities to express all maximally parsimonious assignments at each site.",
+)
 def _cli_parsimony_score_from_files(
     treefiles,
     fasta_file,
@@ -363,6 +381,7 @@ def _cli_parsimony_score_from_files(
     include_internal_sequences,
     save_to_dag,
     gap_as_char,
+    preserve_ambiguities,
 ):
     """Print the parsimony score of one or more newick files"""
     pscore_gen = parsimony_scores_from_files(
@@ -377,6 +396,23 @@ def _cli_parsimony_score_from_files(
         print(treepath)
         print(score)
 
+    if save_to_dag is not None:
+        trees = build_trees_from_files(
+            treefiles,
+            fasta_file,
+            reference_id=root_id,
+            ignore_internal_sequences=(not include_internal_sequences),
+        )
+        trees = (
+            disambiguate(
+                tree, gap_as_char=gap_as_char, min_ambiguities=preserve_ambiguities
+            )
+            for tree in trees
+        )
+        dag = build_dag_from_trees(trees)
+        with open(save_to_dag, "wb") as fh:
+            fh.write(pickle.dumps(dag))
+
 
 def summarize_dag(dag):
     """print summary information about the provided history DAG."""